Compare commits

...

17 Commits

Author SHA1 Message Date
GitHub Actions
b2eba23e21 chore: release v0.1.15 2025-07-28 05:05:30 +00:00
yichuan520030910320
e9ee687472 nit: fix readme 2025-07-27 21:56:05 -07:00
yichuan520030910320
6f5d5e4a77 fix some readme 2025-07-27 21:50:09 -07:00
Andy Lee
5c8921673a fix: auto-detect normalized embeddings and use cosine distance (#8)
* fix: auto-detect normalized embeddings and use cosine distance

- Add automatic detection for normalized embedding models (OpenAI, Voyage AI, Cohere)
- Automatically set distance_metric='cosine' for normalized embeddings
- Add warnings when using non-optimal distance metrics
- Implement manual L2 normalization in HNSW backend (custom Faiss build lacks normalize_L2)
- Fix DiskANN zmq_port compatibility with lazy loading strategy
- Add documentation for normalized embeddings feature

This fixes the low accuracy issue when using OpenAI text-embedding-3-small model with default MIPS metric.

* style: format
2025-07-27 21:19:29 -07:00
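
The fix described above can be illustrated with a short sketch. The repository's actual implementation is not shown in this compare view, so the helper names and prefix list below are hypothetical stand-ins for the detection and manual-normalization logic the commit message describes:

```python
from typing import Optional
import warnings

import numpy as np

# Hypothetical prefix list standing in for the real detection table
# (OpenAI text-embedding-*, Voyage voyage-*, Cohere embed-*).
NORMALIZED_MODEL_PREFIXES = ("text-embedding-", "voyage-", "embed-")


def pick_distance_metric(embedding_model: str, requested: Optional[str]) -> str:
    """Default to cosine for known-normalized models; warn on risky overrides."""
    is_normalized = embedding_model.startswith(NORMALIZED_MODEL_PREFIXES)
    if requested is None:
        return "cosine" if is_normalized else "mips"
    if is_normalized and requested != "cosine":
        warnings.warn(
            f"Using '{requested}' with normalized embeddings from "
            f"'{embedding_model}'; 'cosine' is usually the better choice."
        )
    return requested


def l2_normalize(vectors: np.ndarray) -> np.ndarray:
    """Manual L2 normalization, for builds of Faiss that lack normalize_L2."""
    norms = np.linalg.norm(vectors, axis=1, keepdims=True)
    return vectors / np.clip(norms, 1e-12, None)
```
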
yichuan520030910320
e9d2d420bd fix some readme 2025-07-27 20:48:23 -07:00
yichuan520030910320
ebabfad066 Merge branch 'main' of https://github.com/yichuan-w/LEANN 2025-07-27 20:44:36 -07:00
yichuan520030910320
e6f612b5e8 fix install and readme 2025-07-27 20:44:28 -07:00
Andy Lee
51c41acd82 docs: add comprehensive CONTRIBUTING.md guide with pre-commit setup 2025-07-27 20:40:42 -07:00
yichuan520030910320
455f93fb7c fix emaple and add pypi example 2025-07-27 18:20:13 -07:00
yichuan520030910320
48207c3b69 add pypi example 2025-07-27 17:08:49 -07:00
yichuan520030910320
4de1caa40f fix redame install method 2025-07-27 17:00:28 -07:00
yichuan520030910320
60eaa8165c fix precommit and fix redame install method 2025-07-27 16:36:30 -07:00
yichuan520030910320
c1a5d0c624 fix readme 2025-07-27 02:24:28 -07:00
yichuan520030910320
af1790395a fix ruff errors and formatting 2025-07-27 02:22:54 -07:00
yichuan520030910320
383c6d8d7e add clear instructions 2025-07-27 02:19:27 -07:00
yichuan520030910320
bc0d839693 Merge branch 'main' of https://github.com/yichuan-w/LEANN 2025-07-27 02:07:41 -07:00
yichuan520030910320
8596562de5 add pip install option to README 2025-07-27 02:06:40 -07:00
46 changed files with 723 additions and 368 deletions


@@ -8,4 +8,4 @@ on:
jobs:
build:
uses: ./.github/workflows/build-reusable.yml


@@ -17,23 +17,23 @@ jobs:
- uses: actions/checkout@v4
with:
ref: ${{ inputs.ref }}
- name: Setup Python
uses: actions/setup-python@v5
with:
python-version: '3.11'
- name: Install uv
uses: astral-sh/setup-uv@v4
- name: Install ruff
run: |
uv tool install ruff
- name: Run ruff check
run: |
ruff check .
- name: Run ruff format check
run: |
ruff format --check .
@@ -65,40 +65,40 @@ jobs:
- os: macos-latest
python: '3.13'
runs-on: ${{ matrix.os }}
steps:
- uses: actions/checkout@v4
with:
ref: ${{ inputs.ref }}
submodules: recursive
- name: Setup Python
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python }}
- name: Install uv
uses: astral-sh/setup-uv@v4
- name: Install system dependencies (Ubuntu)
if: runner.os == 'Linux'
run: |
sudo apt-get update
sudo apt-get install -y libomp-dev libboost-all-dev protobuf-compiler libzmq3-dev \
pkg-config libopenblas-dev patchelf libabsl-dev libaio-dev libprotobuf-dev
# Install Intel MKL for DiskANN
wget -q https://registrationcenter-download.intel.com/akdlm/IRC_NAS/79153e0f-74d7-45af-b8c2-258941adf58a/intel-onemkl-2025.0.0.940.sh
sudo sh intel-onemkl-2025.0.0.940.sh -a --components intel.oneapi.lin.mkl.devel --action install --eula accept -s
source /opt/intel/oneapi/setvars.sh
echo "MKLROOT=/opt/intel/oneapi/mkl/latest" >> $GITHUB_ENV
echo "LD_LIBRARY_PATH=/opt/intel/oneapi/mkl/latest/lib/intel64:$LD_LIBRARY_PATH" >> $GITHUB_ENV
- name: Install system dependencies (macOS)
if: runner.os == 'macOS'
run: |
brew install llvm libomp boost protobuf zeromq
- name: Install build dependencies
run: |
uv pip install --system scikit-build-core numpy swig Cython pybind11
@@ -107,7 +107,7 @@ jobs:
else
uv pip install --system delocate
fi
- name: Build packages
run: |
# Build core (platform independent)
@@ -116,7 +116,7 @@ jobs:
uv build
cd ../..
fi
# Build HNSW backend
cd packages/leann-backend-hnsw
if [ "${{ matrix.os }}" == "macos-latest" ]; then
@@ -125,7 +125,7 @@ jobs:
uv build --wheel --python python
fi
cd ../..
# Build DiskANN backend
cd packages/leann-backend-diskann
if [ "${{ matrix.os }}" == "macos-latest" ]; then
@@ -134,14 +134,14 @@ jobs:
uv build --wheel --python python
fi
cd ../..
# Build meta package (platform independent)
if [[ "${{ matrix.os }}" == ubuntu-* ]]; then
cd packages/leann
uv build
cd ../..
fi
- name: Repair wheels (Linux)
if: runner.os == 'Linux'
run: |
@@ -153,7 +153,7 @@ jobs:
mv dist_repaired dist
fi
cd ../..
# Repair DiskANN wheel
cd packages/leann-backend-diskann
if [ -d dist ]; then
@@ -162,7 +162,7 @@ jobs:
mv dist_repaired dist
fi
cd ../..
- name: Repair wheels (macOS)
if: runner.os == 'macOS'
run: |
@@ -174,7 +174,7 @@ jobs:
mv dist_repaired dist
fi
cd ../..
# Repair DiskANN wheel
cd packages/leann-backend-diskann
if [ -d dist ]; then
@@ -183,14 +183,14 @@ jobs:
mv dist_repaired dist
fi
cd ../..
- name: List built packages
run: |
echo "📦 Built packages:"
find packages/*/dist -name "*.whl" -o -name "*.tar.gz" | sort
- name: Upload artifacts
uses: actions/upload-artifact@v4
with:
name: packages-${{ matrix.os }}-py${{ matrix.python }}
path: packages/*/dist/


@@ -16,10 +16,10 @@ jobs:
contents: write
outputs:
commit-sha: ${{ steps.push.outputs.commit-sha }}
steps:
- uses: actions/checkout@v4
- name: Validate version
run: |
# Remove 'v' prefix if present for validation
@@ -30,7 +30,7 @@ jobs:
exit 1
fi
echo "✅ Version format valid: ${{ inputs.version }}"
- name: Update versions and push
id: push
run: |
@@ -38,7 +38,7 @@ jobs:
CURRENT_VERSION=$(grep "^version" packages/leann-core/pyproject.toml | cut -d'"' -f2)
echo "Current version: $CURRENT_VERSION"
echo "Target version: ${{ inputs.version }}"
if [ "$CURRENT_VERSION" = "${{ inputs.version }}" ]; then
echo "⚠️ Version is already ${{ inputs.version }}, skipping update"
COMMIT_SHA=$(git rev-parse HEAD)
@@ -52,7 +52,7 @@ jobs:
COMMIT_SHA=$(git rev-parse HEAD)
echo "✅ Pushed version update: $COMMIT_SHA"
fi
echo "commit-sha=$COMMIT_SHA" >> $GITHUB_OUTPUT
build-packages:
@@ -60,7 +60,7 @@ jobs:
needs: update-version
uses: ./.github/workflows/build-reusable.yml
with:
ref: 'main'
publish:
name: Publish and Release
@@ -69,26 +69,26 @@ jobs:
runs-on: ubuntu-latest
permissions:
contents: write
steps:
- uses: actions/checkout@v4
with:
ref: 'main'
- name: Download all artifacts
uses: actions/download-artifact@v4
with:
path: dist-artifacts
- name: Collect packages
run: |
mkdir -p dist
find dist-artifacts -name "*.whl" -exec cp {} dist/ \;
find dist-artifacts -name "*.tar.gz" -exec cp {} dist/ \;
echo "📦 Packages to publish:"
ls -la dist/
- name: Publish to PyPI
env:
TWINE_USERNAME: __token__
@@ -98,12 +98,12 @@ jobs:
echo "❌ PYPI_API_TOKEN not configured!" echo "❌ PYPI_API_TOKEN not configured!"
exit 1 exit 1
fi fi
pip install twine pip install twine
twine upload dist/* --skip-existing --verbose twine upload dist/* --skip-existing --verbose
echo "✅ Published to PyPI!" echo "✅ Published to PyPI!"
- name: Create release - name: Create release
run: | run: |
# Check if tag already exists # Check if tag already exists
@@ -114,7 +114,7 @@ jobs:
git push origin "v${{ inputs.version }}"
echo "✅ Created and pushed tag v${{ inputs.version }}"
fi
# Check if release already exists
if gh release view "v${{ inputs.version }}" >/dev/null 2>&1; then
echo "⚠️ Release v${{ inputs.version }} already exists, skipping release creation"
@@ -126,4 +126,4 @@ jobs:
echo "✅ Created GitHub release v${{ inputs.version }}" echo "✅ Created GitHub release v${{ inputs.version }}"
fi fi
env: env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}

.gitignore vendored

@@ -9,7 +9,7 @@ demo/indices/
outputs/
*.pkl
*.pdf
*.idx
*.map
.history/
lm_eval.egg-info/
@@ -85,4 +85,4 @@ packages/leann-backend-diskann/third_party/DiskANN/_deps/
*.meta.json
*.passages.json
batchtest.py


@@ -9,15 +9,8 @@ repos:
- id: check-merge-conflict
- id: debug-statements
- repo: https://github.com/psf/black
rev: 24.1.1
hooks:
- id: black
language_version: python3
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.2.1
hooks:
- id: ruff
args: [--fix]
- id: ruff-format

README.md

@@ -33,12 +33,46 @@ LEANN achieves this through *graph-based selective recomputation* with *high-deg
🪶 **Lightweight:** Graph-based recomputation eliminates heavy embedding storage, while smart graph pruning and CSR format minimize graph storage overhead. Always less storage, less memory usage!
📦 **Portable:** Transfer your entire knowledge base between devices (even with others) with minimal cost - your personal AI memory travels with you.
📈 **Scalability:** Handle messy personal data that would crash traditional vector DBs, easily managing your growing personalized data and agent generated memory!
**No Accuracy Loss:** Maintain the same search quality as heavyweight solutions while using 97% less storage.
## Installation
> `pip leann` coming soon!
<details>
<summary><strong>📦 Prerequisites: Install uv (if you don't have it)</strong></summary>
Install uv first if you don't have it:
```bash
curl -LsSf https://astral.sh/uv/install.sh | sh
```
📖 [Detailed uv installation methods →](https://docs.astral.sh/uv/getting-started/installation/#installation-methods)
</details>
LEANN provides two installation methods: **pip install** (quick and easy) and **build from source** (recommended for development).
### 🚀 Quick Install (Recommended for most users)
Clone the repository to access all examples and install LEANN from [PyPI](https://pypi.org/project/leann/) to run them immediately:
```bash
git clone git@github.com:yichuan-w/LEANN.git leann
cd leann
uv venv
source .venv/bin/activate
uv pip install leann
```
### 🔧 Build from Source (Recommended for development)
```bash
git clone git@github.com:yichuan-w/LEANN.git leann
cd leann
@@ -48,27 +82,65 @@ git submodule update --init --recursive
**macOS:**
```bash
brew install llvm libomp boost protobuf zeromq pkgconf
# Install with HNSW backend (default, recommended for most users)
# Install uv first if you don't have it:
# curl -LsSf https://astral.sh/uv/install.sh | sh
# See: https://docs.astral.sh/uv/getting-started/installation/#installation-methods
CC=$(brew --prefix llvm)/bin/clang CXX=$(brew --prefix llvm)/bin/clang++ uv sync
```
**Linux:**
```bash
sudo apt-get install libomp-dev libboost-all-dev protobuf-compiler libabsl-dev libmkl-full-dev libaio-dev libzmq3-dev
# Install with HNSW backend (default, recommended for most users)
uv sync
```
**Ollama Setup (Recommended for full privacy):**
> *You can skip this installation if you only want to use OpenAI API for generation.*
## Quick Start
Our declarative API makes RAG as easy as writing a config file.
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/yichuan-w/LEANN/blob/main/demo.ipynb) [Try in this ipynb file →](demo.ipynb)
```python
from leann import LeannBuilder, LeannSearcher, LeannChat
from pathlib import Path
INDEX_PATH = str(Path("./").resolve() / "demo.leann")
# Build an index
builder = LeannBuilder(backend_name="hnsw")
builder.add_text("LEANN saves 97% storage compared to traditional vector databases.")
builder.add_text("Tung Tung Tung Sahur called—they need their bananacrocodile hybrid back")
builder.build_index(INDEX_PATH)
# Search
searcher = LeannSearcher(INDEX_PATH)
results = searcher.search("fantastical AI-generated creatures", top_k=1)
# Chat with your data
chat = LeannChat(INDEX_PATH, llm_config={"type": "hf", "model": "Qwen/Qwen3-0.6B"})
response = chat.ask("How much storage does LEANN save?", top_k=1)
```
## RAG on Everything!
LEANN supports RAG on various data sources including documents (.pdf, .txt, .md), Apple Mail, Google Search History, WeChat, and more.
> **Generation Model Setup**
> LEANN supports multiple LLM providers for text generation (OpenAI API, HuggingFace, Ollama).
<details>
<summary><strong>🔑 OpenAI API Setup (Default)</strong></summary>
Set your OpenAI API key as an environment variable:
```bash
export OPENAI_API_KEY="your-api-key-here"
```
</details>
<details>
<summary><strong>🔧 Ollama Setup (Recommended for full privacy)</strong></summary>
**macOS:**
@@ -80,6 +152,7 @@ ollama pull llama3.2:1b
```
**Linux:**
```bash
# Install Ollama
curl -fsSL https://ollama.ai/install.sh | sh
@@ -91,43 +164,7 @@ ollama serve &
ollama pull llama3.2:1b ollama pull llama3.2:1b
``` ```
</details>
## Quick Start in 30s
Our declarative API makes RAG as easy as writing a config file.
[Try in this ipynb file →](demo.ipynb) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/yichuan-w/LEANN/blob/main/demo.ipynb)
```python
from leann.api import LeannBuilder, LeannSearcher, LeannChat
# 1. Build the index (no embeddings stored!)
builder = LeannBuilder(backend_name="hnsw")
builder.add_text("C# is a powerful programming language")
builder.add_text("Python is a powerful programming language and it is very popular")
builder.add_text("Machine learning transforms industries")
builder.add_text("Neural networks process complex data")
builder.add_text("Leann is a great storage saving engine for RAG on your MacBook")
builder.build_index("knowledge.leann")
# 2. Search with real-time embeddings
searcher = LeannSearcher("knowledge.leann")
results = searcher.search("programming languages", top_k=2)
# 3. Chat with LEANN using retrieved results
llm_config = {
"type": "ollama",
"model": "llama3.2:1b"
}
chat = LeannChat(index_path="knowledge.leann", llm_config=llm_config)
response = chat.ask(
"Compare the two retrieved programming languages and say which one is more popular today.",
top_k=2,
)
```
## RAG on Everything!
LEANN supports RAG on various data sources including documents (.pdf, .txt, .md), Apple Mail, Google Search History, WeChat, and more.
### 📄 Personal Data Manager: Process Any Documents (.pdf, .txt, .md)!
@@ -139,11 +176,6 @@ Ask questions directly about your personal PDFs, documents, and any directory co
The example below asks a question about summarizing two papers (uses default data in `examples/data`):
```bash
# Drop your PDFs, .txt, .md files into examples/data/
uv run ./examples/main_cli_example.py
```
```
# Or use python directly
source .venv/bin/activate
@@ -154,6 +186,9 @@ python ./examples/main_cli_example.py
### 📧 Your Personal Email Secretary: RAG on Apple Mail!
> **Note:** The examples below currently support macOS only. Windows support coming soon.
<p align="center"> <p align="center">
<img src="videos/mail_clear.gif" alt="LEANN Email Search Demo" width="600"> <img src="videos/mail_clear.gif" alt="LEANN Email Search Demo" width="600">
</p> </p>
@@ -233,7 +268,7 @@ The default Chrome profile path is configured for a typical macOS setup. If you
1. Open Terminal
2. Run: `ls ~/Library/Application\ Support/Google/Chrome/`
3. Look for folders like "Default", "Profile 1", "Profile 2", etc.
4. Use the full path as your `--chrome-profile` argument
**Common Chrome profile locations:**
- macOS: `~/Library/Application Support/Google/Chrome/Default`
@@ -276,7 +311,7 @@ sudo packages/wechat-exporter/wechattweak-cli install
**Troubleshooting:**
- **Installation issues**: Check the [WeChatTweak-CLI issues page](https://github.com/sunnyyoung/WeChatTweak-CLI/issues/41)
- **Export errors**: If you encounter the error below, try restarting WeChat
```
Failed to export WeChat data. Please ensure WeChat is running and WeChatTweak is installed.
Failed to find or export WeChat data. Exiting.
@@ -324,14 +359,14 @@ LEANN includes a powerful CLI for document processing and search. Perfect for qu
# Build an index from documents
leann build my-docs --docs ./documents
# Search your documents
leann search my-docs "machine learning concepts"
# Interactive chat with your documents
leann ask my-docs --interactive
# List all your indexes
leann list
```
**Key CLI features:**
@@ -392,7 +427,7 @@ Options:
**Core techniques:**
- **Graph-based selective recomputation:** Only compute embeddings for nodes in the search path
- **High-degree preserving pruning:** Keep important "hub" nodes while removing redundant connections
- **Dynamic batching:** Efficiently batch embedding computations for GPU utilization
- **Two-level search:** Smart graph traversal that prioritizes promising nodes
@@ -416,7 +451,7 @@ Options:
```bash
uv pip install -e ".[dev]" # Install dev dependencies
python examples/run_evaluation.py data/indices/dpr/dpr_diskann # DPR dataset
python examples/run_evaluation.py data/indices/rpj_wiki/rpj_wiki.index # Wikipedia
```
@@ -429,22 +464,22 @@ If you find Leann useful, please cite:
```bibtex
@misc{wang2025leannlowstoragevectorindex,
title={LEANN: A Low-Storage Vector Index},
author={Yichuan Wang and Shu Liu and Zhifei Li and Yongji Wu and Ziming Mao and Yilong Zhao and Xiao Yan and Zhiying Xu and Yang Zhou and Ion Stoica and Sewon Min and Matei Zaharia and Joseph E. Gonzalez},
year={2025},
eprint={2506.08276},
archivePrefix={arXiv},
primaryClass={cs.DB},
url={https://arxiv.org/abs/2506.08276},
}
```
## ✨ [Detailed Features →](docs/features.md)
## 🤝 [Contributing →](docs/contributing.md)
## 🤝 [CONTRIBUTING →](docs/CONTRIBUTING.md)
## [FAQ →](docs/faq.md)
## 📈 [Roadmap →](docs/roadmap.md)
@@ -465,4 +500,3 @@ This work is done at [**Berkeley Sky Computing Lab**](https://sky.cs.berkeley.e
<p align="center"> <p align="center">
Made with ❤️ by the Leann team Made with ❤️ by the Leann team
</p> </p>


@@ -4,7 +4,7 @@
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"# Quick Start in 30s\n", "# Quick Start \n",
"\n", "\n",
"**Home GitHub Repository:** [LEANN on GitHub](https://github.com/yichuan-w/LEANN)\n", "**Home GitHub Repository:** [LEANN on GitHub](https://github.com/yichuan-w/LEANN)\n",
"\n", "\n",
@@ -49,68 +49,7 @@
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [],
{
"name": "stderr",
"output_type": "stream",
"text": [
"Writing passages: 100%|██████████| 5/5 [00:00<00:00, 17077.79chunk/s]\n",
"Batches: 100%|██████████| 1/1 [00:00<00:00, 36.43it/s]\n",
"WARNING:leann_backend_hnsw.hnsw_backend:Converting data to float32, shape: (5, 768)\n",
"INFO:leann_backend_hnsw.hnsw_backend:INFO: Converting HNSW index to CSR-pruned format...\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"M: 64 for level: 0\n",
"Starting conversion: index.index -> index.csr.tmp\n",
"[0.00s] Reading Index HNSW header...\n",
"[0.00s] Header read: d=768, ntotal=5\n",
"[0.00s] Reading HNSW struct vectors...\n",
" Reading vector (dtype=<class 'numpy.float64'>, fmt='d')... Count=6, Bytes=48\n",
"[0.00s] Read assign_probas (6)\n",
" Reading vector (dtype=<class 'numpy.int32'>, fmt='i')... Count=7, Bytes=28\n",
"[0.14s] Read cum_nneighbor_per_level (7)\n",
" Reading vector (dtype=<class 'numpy.int32'>, fmt='i')... Count=5, Bytes=20\n",
"[0.24s] Read levels (5)\n",
"[0.33s] Probing for compact storage flag...\n",
"[0.33s] Found compact flag: False\n",
"[0.33s] Compact flag is False, reading original format...\n",
"[0.33s] Probing for potential extra byte before non-compact offsets...\n",
"[0.33s] Found and consumed an unexpected 0x00 byte.\n",
" Reading vector (dtype=<class 'numpy.uint64'>, fmt='Q')... Count=6, Bytes=48\n",
"[0.33s] Read offsets (6)\n",
"[0.41s] Attempting to read neighbors vector...\n",
" Reading vector (dtype=<class 'numpy.int32'>, fmt='i')... Count=320, Bytes=1280\n",
"[0.41s] Read neighbors (320)\n",
"[0.54s] Read scalar params (ep=4, max_lvl=0)\n",
"[0.54s] Checking for storage data...\n",
"[0.54s] Found storage fourcc: 49467849.\n",
"[0.54s] Converting to CSR format...\n",
"[0.54s] Conversion loop finished. \n",
"[0.54s] Running validation checks...\n",
" Checking total valid neighbor count...\n",
" OK: Total valid neighbors = 20\n",
" Checking final pointer indices...\n",
" OK: Final pointers match data size.\n",
"[0.54s] Deleting original neighbors and offsets arrays...\n",
" CSR Stats: |data|=20, |level_ptr|=10\n",
"[0.63s] Writing CSR HNSW graph data in FAISS-compatible order...\n",
" Pruning embeddings: Writing NULL storage marker.\n",
"[0.71s] Conversion complete.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:leann_backend_hnsw.hnsw_backend:✅ CSR conversion successful.\n",
"INFO:leann_backend_hnsw.hnsw_backend:INFO: Replaced original index with CSR-pruned version at 'index.index'\n"
]
}
],
"source": [ "source": [
"from leann.api import LeannBuilder\n", "from leann.api import LeannBuilder\n",
"\n", "\n",
@@ -136,81 +75,7 @@
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [],
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:leann.api:🔍 LeannSearcher.search() called:\n",
"INFO:leann.api: Query: 'programming languages'\n",
"INFO:leann.api: Top_k: 2\n",
"INFO:leann.api: Additional kwargs: {}\n",
"INFO:leann.embedding_server_manager:Port 5557 has incompatible server, trying next port...\n",
"INFO:leann.embedding_server_manager:Port 5558 has incompatible server, trying next port...\n",
"INFO:leann.embedding_server_manager:Port 5559 has incompatible server, trying next port...\n",
"INFO:leann.embedding_server_manager:Port 5560 has incompatible server, trying next port...\n",
"INFO:leann.embedding_server_manager:Port 5561 has incompatible server, trying next port...\n",
"INFO:leann.embedding_server_manager:Port 5562 has incompatible server, trying next port...\n",
"INFO:leann.embedding_server_manager:Starting embedding server on port 5563...\n",
"INFO:leann.embedding_server_manager:Command: /Users/yichuan/Desktop/code/test_leann_pip/LEANN/.venv/bin/python -m leann_backend_hnsw.hnsw_embedding_server --zmq-port 5563 --model-name facebook/contriever --passages-file /Users/yichuan/Desktop/code/test_leann_pip/LEANN/content/index.meta.json\n",
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
"To disable this warning, you can either:\n",
"\t- Avoid using `tokenizers` before the fork if possible\n",
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
"INFO:leann.embedding_server_manager:Server process started with PID: 31699\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[read_HNSW - CSR NL v4] Reading metadata & CSR indices (manual offset)...\n",
"[read_HNSW NL v4] Read levels vector, size: 5\n",
"[read_HNSW NL v4] Reading Compact Storage format indices...\n",
"[read_HNSW NL v4] Read compact_level_ptr, size: 10\n",
"[read_HNSW NL v4] Read compact_node_offsets, size: 6\n",
"[read_HNSW NL v4] Read entry_point: 4, max_level: 0\n",
"[read_HNSW NL v4] Read storage fourcc: 0x6c6c756e\n",
"[read_HNSW NL v4 FIX] Detected FileIOReader. Neighbors size field offset: 326\n",
"[read_HNSW NL v4] Reading neighbors data into memory.\n",
"[read_HNSW NL v4] Read neighbors data, size: 20\n",
"[read_HNSW NL v4] Finished reading metadata and CSR indices.\n",
"INFO: Skipping external storage loading, since is_recompute is true.\n",
"INFO: Registering backend 'hnsw'\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Traceback (most recent call last):\n",
" File \"<frozen runpy>\", line 198, in _run_module_as_main\n",
" File \"<frozen runpy>\", line 88, in _run_code\n",
" File \"/Users/yichuan/Desktop/code/test_leann_pip/LEANN/.venv/lib/python3.11/site-packages/leann_backend_hnsw/hnsw_embedding_server.py\", line 323, in <module>\n",
" create_hnsw_embedding_server(\n",
" File \"/Users/yichuan/Desktop/code/test_leann_pip/LEANN/.venv/lib/python3.11/site-packages/leann_backend_hnsw/hnsw_embedding_server.py\", line 98, in create_hnsw_embedding_server\n",
" passages = PassageManager(passage_sources)\n",
" ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
" File \"/Users/yichuan/Desktop/code/test_leann_pip/LEANN/.venv/lib/python3.11/site-packages/leann/api.py\", line 127, in __init__\n",
" raise FileNotFoundError(f\"Passage index file not found: {index_file}\")\n",
"FileNotFoundError: Passage index file not found: /Users/yichuan/Desktop/code/test_leann_pip/LEANN/index.passages.idx\n",
"ERROR:leann.embedding_server_manager:Server terminated during startup.\n"
]
},
{
"ename": "RuntimeError",
"evalue": "Failed to start embedding server on port 5563",
"output_type": "error",
"traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mRuntimeError\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[4]\u001b[39m\u001b[32m, line 4\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mleann\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mapi\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m LeannSearcher\n\u001b[32m 3\u001b[39m searcher = LeannSearcher(\u001b[33m\"\u001b[39m\u001b[33mindex\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m----> \u001b[39m\u001b[32m4\u001b[39m results = \u001b[43msearcher\u001b[49m\u001b[43m.\u001b[49m\u001b[43msearch\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mprogramming languages\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtop_k\u001b[49m\u001b[43m=\u001b[49m\u001b[32;43m2\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[32m 5\u001b[39m results\n",
"\u001b[36mFile \u001b[39m\u001b[32m~/Desktop/code/test_leann_pip/LEANN/.venv/lib/python3.11/site-packages/leann/api.py:439\u001b[39m, in \u001b[36mLeannSearcher.search\u001b[39m\u001b[34m(self, query, top_k, complexity, beam_width, prune_ratio, recompute_embeddings, pruning_strategy, expected_zmq_port, **kwargs)\u001b[39m\n\u001b[32m 437\u001b[39m start_time = time.time()\n\u001b[32m 438\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m recompute_embeddings:\n\u001b[32m--> \u001b[39m\u001b[32m439\u001b[39m zmq_port = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mbackend_impl\u001b[49m\u001b[43m.\u001b[49m\u001b[43m_ensure_server_running\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 440\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mmeta_path_str\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 441\u001b[39m \u001b[43m \u001b[49m\u001b[43mport\u001b[49m\u001b[43m=\u001b[49m\u001b[43mexpected_zmq_port\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 442\u001b[39m \u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 443\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 444\u001b[39m \u001b[38;5;28;01mdel\u001b[39;00m expected_zmq_port\n\u001b[32m 445\u001b[39m zmq_time = time.time() - start_time\n",
"\u001b[36mFile \u001b[39m\u001b[32m~/Desktop/code/test_leann_pip/LEANN/.venv/lib/python3.11/site-packages/leann/searcher_base.py:81\u001b[39m, in \u001b[36mBaseSearcher._ensure_server_running\u001b[39m\u001b[34m(self, passages_source_file, port, **kwargs)\u001b[39m\n\u001b[32m 72\u001b[39m server_started, actual_port = \u001b[38;5;28mself\u001b[39m.embedding_server_manager.start_server(\n\u001b[32m 73\u001b[39m port=port,\n\u001b[32m 74\u001b[39m model_name=\u001b[38;5;28mself\u001b[39m.embedding_model,\n\u001b[32m (...)\u001b[39m\u001b[32m 78\u001b[39m enable_warmup=kwargs.get(\u001b[33m\"\u001b[39m\u001b[33menable_warmup\u001b[39m\u001b[33m\"\u001b[39m, \u001b[38;5;28;01mFalse\u001b[39;00m),\n\u001b[32m 79\u001b[39m )\n\u001b[32m 80\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m server_started:\n\u001b[32m---> \u001b[39m\u001b[32m81\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mRuntimeError\u001b[39;00m(\n\u001b[32m 82\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mFailed to start embedding server on port \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mactual_port\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m\n\u001b[32m 83\u001b[39m )\n\u001b[32m 85\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m actual_port\n",
"\u001b[31mRuntimeError\u001b[39m: Failed to start embedding server on port 5563"
]
}
],
"source": [ "source": [
"from leann.api import LeannSearcher\n", "from leann.api import LeannSearcher\n",
"\n", "\n",

docs/CONTRIBUTING.md

@@ -0,0 +1,220 @@
# 🤝 Contributing
We welcome contributions! Leann is built by the community, for the community.
## Ways to Contribute
- 🐛 **Bug Reports**: Found an issue? Let us know!
- 💡 **Feature Requests**: Have an idea? We'd love to hear it!
- 🔧 **Code Contributions**: PRs welcome for all skill levels
- 📖 **Documentation**: Help make Leann more accessible
- 🧪 **Benchmarks**: Share your performance results
## 🚀 Development Setup
### Prerequisites
1. **Install uv** (fast Python package installer):
```bash
curl -LsSf https://astral.sh/uv/install.sh | sh
```
2. **Clone the repository**:
```bash
git clone https://github.com/LEANN-RAG/LEANN-RAG.git
cd LEANN-RAG
```
3. **Install system dependencies**:
**macOS:**
```bash
brew install llvm libomp boost protobuf zeromq pkgconf
```
**Ubuntu/Debian:**
```bash
sudo apt-get install libomp-dev libboost-all-dev protobuf-compiler \
libabsl-dev libmkl-full-dev libaio-dev libzmq3-dev
```
4. **Build from source**:
```bash
# macOS
CC=$(brew --prefix llvm)/bin/clang CXX=$(brew --prefix llvm)/bin/clang++ uv sync
# Ubuntu/Debian
uv sync
```
## 🔨 Pre-commit Hooks
We use pre-commit hooks to ensure code quality and consistency. This runs automatically before each commit.
### Setup Pre-commit
1. **Install pre-commit** (already included when you run `uv sync`):
```bash
uv pip install pre-commit
```
2. **Install the git hooks**:
```bash
pre-commit install
```
3. **Run pre-commit manually** (optional):
```bash
pre-commit run --all-files
```
### Pre-commit Checks
Our pre-commit configuration includes:
- **Trailing whitespace removal**
- **End-of-file fixing**
- **YAML validation**
- **Large file prevention**
- **Merge conflict detection**
- **Debug statement detection**
- **Code formatting with ruff**
- **Code linting with ruff**
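Pieced together from the `.pre-commit-config.yaml` diff earlier in this compare view, a minimal config covering these checks looks roughly like the following (the `pre-commit-hooks` rev is illustrative; the ruff rev matches the diff above):
```yaml
repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.5.0  # illustrative rev
    hooks:
      - id: trailing-whitespace
      - id: end-of-file-fixer
      - id: check-yaml
      - id: check-added-large-files
      - id: check-merge-conflict
      - id: debug-statements
  - repo: https://github.com/astral-sh/ruff-pre-commit
    rev: v0.2.1
    hooks:
      - id: ruff
        args: [--fix]
      - id: ruff-format
```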
## 🧪 Testing
### Running Tests
```bash
# Run all tests
uv run pytest
# Run specific test file
uv run pytest test/test_filename.py
# Run with coverage
uv run pytest --cov=leann
```
### Writing Tests
- Place tests in the `test/` directory
- Follow the naming convention `test_*.py`
- Use descriptive test names that explain what's being tested
- Include both positive and negative test cases
## 📝 Code Style
We use `ruff` for both linting and formatting to ensure consistent code style.
### Format Your Code
```bash
# Format all files
ruff format
# Check formatting without changing files
ruff format --check
```
### Lint Your Code
```bash
# Run linter with auto-fix
ruff check --fix
# Just check without fixing
ruff check
```
### Style Guidelines
- Follow PEP 8 conventions
- Use descriptive variable names
- Add type hints where appropriate
- Write docstrings for all public functions and classes
- Keep functions focused and single-purpose
## 🚦 CI/CD
Our CI pipeline runs automatically on all pull requests. It includes:
1. **Linting and Formatting**: Ensures code follows our style guidelines
2. **Multi-platform builds**: Tests on Ubuntu and macOS
3. **Python version matrix**: Tests on Python 3.9-3.13
4. **Wheel building**: Ensures packages can be built and distributed
### CI Commands
The CI uses the same commands as pre-commit to ensure consistency:
```bash
# Linting
ruff check .
# Format checking
ruff format --check .
```
Make sure your code passes these checks locally before pushing!
## 🔄 Pull Request Process
1. **Fork the repository** and create your branch from `main`:
```bash
git checkout -b feature/your-feature-name
```
2. **Make your changes**:
- Write clean, documented code
- Add tests for new functionality
- Update documentation as needed
3. **Run pre-commit checks**:
```bash
pre-commit run --all-files
```
4. **Test your changes**:
```bash
uv run pytest
```
5. **Commit with descriptive messages**:
```bash
git commit -m "feat: add new search algorithm"
```
Follow [Conventional Commits](https://www.conventionalcommits.org/):
- `feat:` for new features
- `fix:` for bug fixes
- `docs:` for documentation changes
- `test:` for test additions/changes
- `refactor:` for code refactoring
- `perf:` for performance improvements
6. **Push and create a pull request**:
- Provide a clear description of your changes
- Reference any related issues
- Include examples or screenshots if applicable
## 📚 Documentation
When adding new features or making significant changes:
1. Update relevant documentation in `/docs`
2. Add docstrings to new functions/classes
3. Update README.md if needed
4. Include usage examples
## 🤔 Getting Help
- **Discord**: Join our community for discussions
- **Issues**: Check existing issues or create a new one
- **Discussions**: For general questions and ideas
## 📄 License
By contributing, you agree that your contributions will be licensed under the same license as the project (MIT).
---
Thank you for contributing to LEANN! Every contribution, no matter how small, helps make the project better for everyone. 🌟


@@ -19,4 +19,4 @@ That's it! The workflow will automatically:
- ✅ Publish to PyPI
- ✅ Create GitHub tag and release
Check progress: https://github.com/yichuan-w/LEANN/actions
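Since the release workflow above takes a `version` input (see the `inputs.version` references), a run can also be started from the GitHub CLI. A sketch, assuming the workflow file is named `release.yml` (the actual file name is not shown in this view):
```bash
# Trigger the release workflow with a version input (workflow file name assumed)
gh workflow run release.yml -f version=0.1.15

# Follow the run from the terminal instead of the Actions page
gh run watch
```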


@@ -1,11 +0,0 @@
# 🤝 Contributing
We welcome contributions! Leann is built by the community, for the community.
## Ways to Contribute
- 🐛 **Bug Reports**: Found an issue? Let us know!
- 💡 **Feature Requests**: Have an idea? We'd love to hear it!
- 🔧 **Code Contributions**: PRs welcome for all skill levels
- 📖 **Documentation**: Help make Leann more accessible
- 🧪 **Benchmarks**: Share your performance results


@@ -7,4 +7,4 @@ You can speed up the process by using a lightweight embedding model. Add this to
```bash
--embedding-model sentence-transformers/all-MiniLM-L6-v2
```
**Model sizes:** `all-MiniLM-L6-v2` (30M parameters), `facebook/contriever` (~100M parameters), `Qwen3-0.6B` (600M parameters)


@@ -19,4 +19,4 @@
- **Simple Python API** - Get started in minutes
- **Extensible backend system** - Easy to add new algorithms
- **Comprehensive examples** - From basic usage to production deployment


@@ -0,0 +1,75 @@
# Normalized Embeddings Support in LEANN
LEANN now automatically detects normalized embedding models and sets the appropriate distance metric for optimal performance.
## What are Normalized Embeddings?
Normalized embeddings are vectors with L2 norm = 1 (unit vectors). These embeddings are optimized for cosine similarity rather than Maximum Inner Product Search (MIPS).
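Concretely, a unit vector satisfies `np.linalg.norm(v) == 1`, and for two unit vectors the inner product already equals the cosine similarity. A quick numpy check with random stand-in vectors:
```python
import numpy as np

rng = np.random.default_rng(0)
v = rng.normal(size=384)
v /= np.linalg.norm(v)  # L2-normalize: v is now a unit vector
w = rng.normal(size=384)
w /= np.linalg.norm(w)

print(np.linalg.norm(v))  # ~1.0
cosine = (v @ w) / (np.linalg.norm(v) * np.linalg.norm(w))
print(np.isclose(v @ w, cosine))  # True: inner product == cosine for unit vectors
```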
## Automatic Detection
When you create a `LeannBuilder` instance with a normalized embedding model, LEANN will:
1. **Automatically set `distance_metric="cosine"`** if not specified
2. **Show a warning** if you manually specify a different distance metric
3. **Provide optimal search performance** with the correct metric
## Supported Normalized Embedding Models
### OpenAI
All OpenAI text embedding models are normalized:
- `text-embedding-ada-002`
- `text-embedding-3-small`
- `text-embedding-3-large`
### Voyage AI
All Voyage AI embedding models are normalized:
- `voyage-2`
- `voyage-3`
- `voyage-large-2`
- `voyage-multilingual-2`
- `voyage-code-2`
### Cohere
All Cohere embedding models are normalized:
- `embed-english-v3.0`
- `embed-multilingual-v3.0`
- `embed-english-light-v3.0`
- `embed-multilingual-light-v3.0`
## Example Usage
```python
from leann.api import LeannBuilder
# Automatic detection - will use cosine distance
builder = LeannBuilder(
backend_name="hnsw",
embedding_model="text-embedding-3-small",
embedding_mode="openai"
)
# Warning: Detected normalized embeddings model 'text-embedding-3-small'...
# Automatically setting distance_metric='cosine'
# Manual override (not recommended)
builder = LeannBuilder(
backend_name="hnsw",
embedding_model="text-embedding-3-small",
embedding_mode="openai",
distance_metric="mips" # Will show warning
)
# Warning: Using 'mips' distance metric with normalized embeddings...
```
## Non-Normalized Embeddings
Models like `facebook/contriever` and other sentence-transformers models that are not normalized will continue to use MIPS by default, which is optimal for them.
## Why This Matters
Using the wrong distance metric with normalized embeddings can lead to:
- **Poor search quality** due to HNSW's early termination with narrow score ranges
- **Incorrect ranking** of search results
- **Suboptimal performance** compared to using the correct metric
For more details on why this happens, see our analysis of [OpenAI embeddings with MIPS](../examples/main_cli_example.py).
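The identity behind this analysis can be checked directly: for unit vectors, squared Euclidean distance equals `2 - 2 * <q, x>`, so exact rankings agree across metrics and the degradation described above comes from HNSW's approximate traversal rather than the math itself. A small illustrative check:
```python
import numpy as np

rng = np.random.default_rng(1)
q = rng.normal(size=64)
q /= np.linalg.norm(q)
x = rng.normal(size=64)
x /= np.linalg.norm(x)

# For unit vectors: ||q - x||^2 == 2 - 2 * <q, x>
print(np.isclose(np.sum((q - x) ** 2), 2 - 2 * (q @ x)))  # True
```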


@@ -18,4 +18,4 @@
- [ ] Integration with LangChain/LlamaIndex
- [ ] Visual similarity search
- [ ] Query rewriting, rerank and expansion


@@ -1,5 +1,5 @@
The Project Gutenberg eBook of Pride and Prejudice
This ebook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restrictions
whatsoever. You may copy it, give it away or re-use it under the terms
@@ -14557,7 +14557,7 @@ her into Derbyshire, had been the means of uniting them.
*** END OF THE PROJECT GUTENBERG EBOOK PRIDE AND PREJUDICE ***
Updated editions will replace the previous one—the old editions will
be renamed.
@@ -14662,7 +14662,7 @@ performed, viewed, copied or distributed:
at www.gutenberg.org. If you
are not located in the United States, you will have to check the laws
of the country where you are located before using this eBook.
1.E.2. If an individual Project Gutenberg™ electronic work is
derived from texts not protected by U.S. copyright law (does not
contain a notice indicating that it is posted with permission of the
@@ -14724,7 +14724,7 @@ provided that:
Gutenberg Literary Archive Foundation at the address specified in
Section 4, “Information about donations to the Project Gutenberg
Literary Archive Foundation.”
• You provide a full refund of any money paid by a user who notifies
you in writing (or by e-mail) within 30 days of receipt that s/he
does not agree to the terms of the full Project Gutenberg™
@@ -14732,15 +14732,15 @@ provided that:
copies of the works possessed in a physical medium and discontinue
all use of and all access to other copies of Project Gutenberg™
works.
• You provide, in accordance with paragraph 1.F.3, a full refund of
any money paid for a work or a replacement copy, if a defect in the
electronic work is discovered and reported to you within 90 days of
receipt of the work.
• You comply with all other terms of this agreement for free
distribution of Project Gutenberg™ works.
1.E.9. If you wish to charge a fee or distribute a Project
Gutenberg™ electronic work or group of works on different terms than
@@ -14903,5 +14903,3 @@ This website includes information about Project Gutenberg™,
including how to make donations to the Project Gutenberg Literary
Archive Foundation, how to help produce our new eBooks, and how to
subscribe to our email newsletter to hear about new eBooks.


@@ -27,7 +27,10 @@ def load_sample_documents():
"title": "Intro to Python", "title": "Intro to Python",
"content": "Python is a high-level, interpreted language known for simplicity.", "content": "Python is a high-level, interpreted language known for simplicity.",
}, },
{"title": "ML Basics", "content": "Machine learning builds systems that learn from data."}, {
"title": "ML Basics",
"content": "Machine learning builds systems that learn from data.",
},
{
"title": "Data Structures",
"content": "Data structures like arrays, lists, and graphs organize data.",


@@ -21,7 +21,9 @@ DEFAULT_CHROME_PROFILE = os.path.expanduser("~/Library/Application Support/Googl
def create_leann_index_from_multiple_chrome_profiles(
profile_dirs: list[Path], index_path: str = "chrome_history_index.leann", max_count: int = -1
profile_dirs: list[Path],
index_path: str = "chrome_history_index.leann",
max_count: int = -1,
):
"""
Create LEANN index from multiple Chrome profile data sources.


@@ -474,7 +474,8 @@ Messages ({len(messages)} messages, {message_group["total_length"]} chars):
message_group, contact_name
)
doc = Document(
text=doc_content, metadata={"contact_name": contact_name}
text=doc_content,
metadata={"contact_name": contact_name},
)
docs.append(doc)
count += 1


@@ -315,7 +315,11 @@ async def main():
# Create or load the LEANN index from all sources
index_path = create_leann_index_from_multiple_sources(
messages_dirs, INDEX_PATH, args.max_emails, args.include_html, args.embedding_model
messages_dirs,
INDEX_PATH,
args.max_emails,
args.include_html,
args.embedding_model,
)
if index_path:


@@ -92,7 +92,10 @@ def main():
help="Directory to store the index (default: mail_index_embedded)", help="Directory to store the index (default: mail_index_embedded)",
) )
parser.add_argument( parser.add_argument(
"--max-emails", type=int, default=10000, help="Maximum number of emails to process" "--max-emails",
type=int,
default=10000,
help="Maximum number of emails to process",
)
parser.add_argument(
"--include-html",
@@ -112,7 +115,10 @@ def main():
else:
print("Creating new index...")
index = create_and_save_index(
mail_path, save_dir, max_count=args.max_emails, include_html=args.include_html
mail_path,
save_dir,
max_count=args.max_emails,
include_html=args.include_html,
)
if index:
queries = [


@@ -30,17 +30,22 @@ async def main(args):
all_texts = []
for doc in documents:
nodes = node_parser.get_nodes_from_documents([doc])
for node in nodes:
all_texts.append(node.get_content())
if nodes:
all_texts.extend(node.get_content() for node in nodes)
print("--- Index directory not found, building new index ---")
print("\n[PHASE 1] Building Leann index...")
# LeannBuilder now automatically detects normalized embeddings and sets appropriate distance metric
print(f"Using {args.embedding_model} with {args.embedding_mode} mode")
# Use HNSW backend for better macOS compatibility
builder = LeannBuilder(
backend_name="hnsw",
embedding_model="facebook/contriever",
embedding_model=args.embedding_model,
embedding_mode=args.embedding_mode,
# distance_metric is automatically set based on embedding model
graph_degree=32,
complexity=64,
is_compact=True,
@@ -89,6 +94,19 @@ if __name__ == "__main__":
default="Qwen/Qwen3-0.6B", default="Qwen/Qwen3-0.6B",
help="The model name to use (e.g., 'llama3:8b' for ollama, 'deepseek-ai/deepseek-llm-7b-chat' for hf, 'gpt-4o' for openai).", help="The model name to use (e.g., 'llama3:8b' for ollama, 'deepseek-ai/deepseek-llm-7b-chat' for hf, 'gpt-4o' for openai).",
) )
parser.add_argument(
"--embedding-model",
type=str,
default="facebook/contriever",
help="The embedding model to use (e.g., 'facebook/contriever', 'text-embedding-3-small').",
)
parser.add_argument(
"--embedding-mode",
type=str,
default="sentence-transformers",
choices=["sentence-transformers", "openai", "mlx"],
help="The embedding backend mode.",
)
parser.add_argument(
"--host",
type=str,


@@ -347,7 +347,9 @@ def demo_aggregation():
print(f"\n{'=' * 20} {method.upper()} AGGREGATION {'=' * 20}") print(f"\n{'=' * 20} {method.upper()} AGGREGATION {'=' * 20}")
aggregator = MultiVectorAggregator( aggregator = MultiVectorAggregator(
aggregation_method=method, spatial_clustering=True, cluster_distance_threshold=100.0 aggregation_method=method,
spatial_clustering=True,
cluster_distance_threshold=100.0,
)
aggregated = aggregator.aggregate_results(mock_results, top_k=5)


@@ -1 +0,0 @@


@@ -163,18 +163,44 @@ class DiskannSearcher(BaseSearcher):
self.num_threads = kwargs.get("num_threads", 8)
fake_zmq_port = 6666
full_index_prefix = str(self.index_dir / self.index_path.stem)
self._index = diskannpy.StaticDiskFloatIndex(
metric_enum,
full_index_prefix,
self.num_threads,
kwargs.get("num_nodes_to_cache", 0),
1,
fake_zmq_port, # Initial port, can be updated at runtime
"",
"",
)
# For DiskANN, we need to reinitialize the index when zmq_port changes
# Store the initialization parameters for later use
self._init_params = {
"metric_enum": metric_enum,
"full_index_prefix": full_index_prefix,
"num_threads": self.num_threads,
"num_nodes_to_cache": kwargs.get("num_nodes_to_cache", 0),
"cache_mechanism": 1,
"pq_prefix": "",
"partition_prefix": "",
}
self._diskannpy = diskannpy
self._current_zmq_port = None
self._index = None
logger.debug("DiskANN searcher initialized (index will be loaded on first search)")
def _ensure_index_loaded(self, zmq_port: int):
"""Ensure the index is loaded with the correct zmq_port."""
if self._index is None or self._current_zmq_port != zmq_port:
# Need to (re)load the index with the correct zmq_port
with suppress_cpp_output_if_needed():
if self._index is not None:
logger.debug(f"Reloading DiskANN index with new zmq_port: {zmq_port}")
else:
logger.debug(f"Loading DiskANN index with zmq_port: {zmq_port}")
self._index = self._diskannpy.StaticDiskFloatIndex(
self._init_params["metric_enum"],
self._init_params["full_index_prefix"],
self._init_params["num_threads"],
self._init_params["num_nodes_to_cache"],
self._init_params["cache_mechanism"],
zmq_port,
self._init_params["pq_prefix"],
self._init_params["partition_prefix"],
)
self._current_zmq_port = zmq_port
def search( def search(
self, self,
@@ -212,14 +238,15 @@ class DiskannSearcher(BaseSearcher):
Returns: Returns:
Dict with 'labels' (list of lists) and 'distances' (ndarray) Dict with 'labels' (list of lists) and 'distances' (ndarray)
""" """
# Handle zmq_port compatibility: DiskANN can now update port at runtime # Handle zmq_port compatibility: Ensure index is loaded with correct port
if recompute_embeddings: if recompute_embeddings:
if zmq_port is None: if zmq_port is None:
raise ValueError("zmq_port must be provided if recompute_embeddings is True") raise ValueError("zmq_port must be provided if recompute_embeddings is True")
current_port = self._index.get_zmq_port() self._ensure_index_loaded(zmq_port)
if zmq_port != current_port: else:
logger.debug(f"Updating DiskANN zmq_port from {current_port} to {zmq_port}") # If not recomputing, we still need an index, use a default port
self._index.set_zmq_port(zmq_port) if self._index is None:
self._ensure_index_loaded(6666) # Default port when not recomputing
# DiskANN doesn't support "proportional" strategy # DiskANN doesn't support "proportional" strategy
if pruning_strategy == "proportional": if pruning_strategy == "proportional":
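The fix replaces an eagerly constructed index (built with a fake port) by lazy construction keyed on the port actually requested. The pattern in isolation, with hypothetical names (not the real LEANN classes):

```python
class LazyIndex:
    """Defer expensive construction until first use; rebuild only when a
    construction-time parameter (here, the port) actually changes."""

    def __init__(self, **init_params):
        self._init_params = init_params
        self._index = None
        self._current_port = None

    def _ensure_loaded(self, port: int) -> None:
        if self._index is None or self._current_port != port:
            # Stand-in for the heavy constructor (e.g. mmapping index files).
            self._index = {"port": port, **self._init_params}
            self._current_port = port

    def search(self, query: str, port: int):
        self._ensure_loaded(port)
        return self._index  # real code would query the loaded index here
```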

View File

@@ -4,8 +4,8 @@ build-backend = "scikit_build_core.build"
 [project]
 name = "leann-backend-diskann"
-version = "0.1.14"
-dependencies = ["leann-core==0.1.14", "numpy", "protobuf>=3.19.0"]
+version = "0.1.15"
+dependencies = ["leann-core==0.1.15", "numpy", "protobuf>=3.19.0"]

 [tool.scikit-build]
 # Key: simplified CMake path
@@ -16,4 +16,4 @@ wheel.packages = ["leann_backend_diskann"]
 editable.mode = "redirect"
 cmake.build-type = "Release"
 build.verbose = true
 build.tool-args = ["-j8"]

View File

@@ -2,12 +2,12 @@ syntax = "proto3";
 package protoembedding;

 message NodeEmbeddingRequest {
     repeated uint32 node_ids = 1;
 }

 message NodeEmbeddingResponse {
     bytes embeddings_data = 1;       // All embedded binary datas
     repeated int32 dimensions = 2;   // Shape [batch_size, embedding_dim]
     repeated uint32 missing_ids = 3; // Missing node ids
 }
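Given this message shape, a client can reassemble the embedding matrix roughly as follows. The generated module name `embedding_pb2` and the flat float32 payload layout are assumptions, not confirmed by the diff:

```python
import numpy as np

# Assumed name of the module protoc generates from this .proto file.
from embedding_pb2 import NodeEmbeddingResponse

def decode_embeddings(payload: bytes) -> np.ndarray:
    resp = NodeEmbeddingResponse()
    resp.ParseFromString(payload)
    batch_size, dim = resp.dimensions  # Shape [batch_size, embedding_dim]
    # Assumption: embeddings_data is a contiguous float32 buffer.
    return np.frombuffer(resp.embeddings_data, dtype=np.float32).reshape(batch_size, dim)
```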

View File

@@ -52,4 +52,4 @@ set(FAISS_BUILD_AVX512 OFF CACHE BOOL "" FORCE)
 # IMPORTANT: Disable building AVX versions to speed up compilation
 set(FAISS_BUILD_AVX_VERSIONS OFF CACHE BOOL "" FORCE)

 add_subdirectory(third_party/faiss)

View File

@@ -72,7 +72,11 @@ def read_vector_raw(f, element_fmt_char):
 def read_numpy_vector(f, np_dtype, struct_fmt_char):
     """Reads a vector into a NumPy array."""
     count = -1  # Initialize count for robust error handling
-    print(f"  Reading vector (dtype={np_dtype}, fmt='{struct_fmt_char}')... ", end="", flush=True)
+    print(
+        f"  Reading vector (dtype={np_dtype}, fmt='{struct_fmt_char}')... ",
+        end="",
+        flush=True,
+    )
     try:
         count, data_bytes = read_vector_raw(f, struct_fmt_char)
         print(f"Count={count}, Bytes={len(data_bytes)}")
@@ -647,7 +651,10 @@ def convert_hnsw_graph_to_csr(input_filename, output_filename, prune_embeddings=
         print(f"Error: Input file not found: {input_filename}", file=sys.stderr)
         return False
     except MemoryError as e:
-        print(f"\nFatal MemoryError during conversion: {e}. Insufficient RAM.", file=sys.stderr)
+        print(
+            f"\nFatal MemoryError during conversion: {e}. Insufficient RAM.",
+            file=sys.stderr,
+        )
         # Clean up potentially partially written output file?
         try:
             os.remove(output_filename)
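For orientation, a simplified sketch of what a count-prefixed vector reader looks like; the actual on-disk layout behind `read_vector_raw` is not shown in this diff, so the 8-byte count here is purely an assumption:

```python
import struct
import numpy as np

def read_count_prefixed_vector(f, np_dtype, struct_fmt_char):
    # Assumed layout: little-endian uint64 element count, then `count`
    # raw elements of the scalar type described by struct_fmt_char.
    (count,) = struct.unpack("<Q", f.read(8))
    data_bytes = f.read(count * struct.calcsize(struct_fmt_char))
    return np.frombuffer(data_bytes, dtype=np_dtype, count=count)
```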

View File

@@ -28,6 +28,12 @@ def get_metric_map():
     }


+def normalize_l2(data: np.ndarray) -> np.ndarray:
+    norms = np.linalg.norm(data, axis=1, keepdims=True)
+    norms[norms == 0] = 1  # Avoid division by zero
+    return data / norms
+
+
 @register_backend("hnsw")
 class HNSWBackend(LeannBackendFactoryInterface):
     @staticmethod
@@ -76,7 +82,7 @@ class HNSWBuilder(LeannBackendBuilderInterface):
         index.hnsw.efConstruction = self.efConstruction

         if self.distance_metric.lower() == "cosine":
-            faiss.normalize_L2(data)
+            data = normalize_l2(data)

         index.add(data.shape[0], faiss.swig_ptr(data))
         index_file = index_dir / f"{index_prefix}.index"
@@ -186,7 +192,7 @@ class HNSWSearcher(BaseSearcher):
         if query.dtype != np.float32:
             query = query.astype(np.float32)
         if self.distance_metric == "cosine":
-            faiss.normalize_L2(query)
+            query = normalize_l2(query)

         params = faiss.SearchParametersHNSW()
         if zmq_port is not None:
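The manual `normalize_l2` is a drop-in replacement here because, for unit-length vectors, inner product equals cosine similarity, so an inner-product index over normalized data ranks by cosine. A quick self-contained check of that identity using the same function:

```python
import numpy as np

def normalize_l2(data: np.ndarray) -> np.ndarray:
    norms = np.linalg.norm(data, axis=1, keepdims=True)
    norms[norms == 0] = 1  # Avoid division by zero
    return data / norms

a = np.random.rand(4, 8).astype(np.float32)
b = np.random.rand(4, 8).astype(np.float32)

# Inner product of the normalized vectors ...
ip = np.sum(normalize_l2(a) * normalize_l2(b), axis=1)
# ... equals cosine similarity of the originals.
cos = np.sum(a * b, axis=1) / (np.linalg.norm(a, axis=1) * np.linalg.norm(b, axis=1))
assert np.allclose(ip, cos, atol=1e-6)
```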

View File

@@ -6,10 +6,10 @@ build-backend = "scikit_build_core.build"
 [project]
 name = "leann-backend-hnsw"
-version = "0.1.14"
+version = "0.1.15"
 description = "Custom-built HNSW (Faiss) backend for the Leann toolkit."
 dependencies = [
-    "leann-core==0.1.14",
+    "leann-core==0.1.15",
     "numpy",
     "pyzmq>=23.0.0",
     "msgpack>=1.0.0",
@@ -24,4 +24,4 @@ build.tool-args = ["-j8"]
 # CMake definitions to optimize compilation
 [tool.scikit-build.cmake.define]
 CMAKE_BUILD_PARALLEL_LEVEL = "8"

View File

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "leann-core"
-version = "0.1.14"
+version = "0.1.15"
 description = "Core API and plugin system for LEANN"
 readme = "README.md"
 requires-python = ">=3.9"
@@ -46,4 +46,4 @@ colab = [
 leann = "leann.cli:main"

 [tool.setuptools.packages.find]
 where = ["src"]

View File

@@ -7,6 +7,7 @@ import json
 import logging
 import pickle
 import time
+import warnings
 from dataclasses import dataclass, field
 from pathlib import Path
 from typing import Any, Literal
@@ -163,6 +164,76 @@ class LeannBuilder:
         self.embedding_model = embedding_model
         self.dimensions = dimensions
         self.embedding_mode = embedding_mode
+
+        # Check if we need to use cosine distance for normalized embeddings
+        normalized_embeddings_models = {
+            # OpenAI models
+            ("openai", "text-embedding-ada-002"),
+            ("openai", "text-embedding-3-small"),
+            ("openai", "text-embedding-3-large"),
+            # Voyage AI models
+            ("voyage", "voyage-2"),
+            ("voyage", "voyage-3"),
+            ("voyage", "voyage-large-2"),
+            ("voyage", "voyage-multilingual-2"),
+            ("voyage", "voyage-code-2"),
+            # Cohere models
+            ("cohere", "embed-english-v3.0"),
+            ("cohere", "embed-multilingual-v3.0"),
+            ("cohere", "embed-english-light-v3.0"),
+            ("cohere", "embed-multilingual-light-v3.0"),
+        }
+
+        # Also check for patterns in model names
+        is_normalized = False
+        current_model_lower = embedding_model.lower()
+        current_mode_lower = embedding_mode.lower()
+
+        # Check exact matches
+        for mode, model in normalized_embeddings_models:
+            if (current_mode_lower == mode and current_model_lower == model) or (
+                mode in current_mode_lower and model in current_model_lower
+            ):
+                is_normalized = True
+                break
+
+        # Check patterns
+        if not is_normalized:
+            # OpenAI patterns
+            if "openai" in current_mode_lower or "openai" in current_model_lower:
+                if any(
+                    pattern in current_model_lower
+                    for pattern in ["text-embedding", "ada", "3-small", "3-large"]
+                ):
+                    is_normalized = True
+            # Voyage patterns
+            elif "voyage" in current_mode_lower or "voyage" in current_model_lower:
+                is_normalized = True
+            # Cohere patterns
+            elif "cohere" in current_mode_lower or "cohere" in current_model_lower:
+                if "embed" in current_model_lower:
+                    is_normalized = True
+
+        # Handle distance metric
+        if is_normalized and "distance_metric" not in backend_kwargs:
+            backend_kwargs["distance_metric"] = "cosine"
+            warnings.warn(
+                f"Detected normalized embeddings model '{embedding_model}' with mode '{embedding_mode}'. "
+                f"Automatically setting distance_metric='cosine' for optimal performance. "
+                f"Normalized embeddings (L2 norm = 1) should use cosine similarity instead of MIPS.",
+                UserWarning,
+                stacklevel=2,
+            )
+        elif is_normalized and backend_kwargs.get("distance_metric", "").lower() != "cosine":
+            current_metric = backend_kwargs.get("distance_metric", "mips")
+            warnings.warn(
+                f"Warning: Using '{current_metric}' distance metric with normalized embeddings model "
+                f"'{embedding_model}' may lead to suboptimal search results. "
+                f"Consider using 'cosine' distance metric for better performance.",
+                UserWarning,
+                stacklevel=2,
+            )
+
         self.backend_kwargs = backend_kwargs
         self.chunks: list[dict[str, Any]] = []
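If the detection behaves as intended, constructing a builder with a normalized-embedding model and no explicit metric should warn and inject cosine. A sketch of that expectation (assumes `backend_kwargs` remains inspectable on the instance, as the assignment above suggests):

```python
import warnings
from leann import LeannBuilder

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    builder = LeannBuilder(
        backend_name="hnsw",
        embedding_model="text-embedding-3-small",
        embedding_mode="openai",
    )

# The auto-detection should have injected the metric and warned about it.
assert builder.backend_kwargs.get("distance_metric") == "cosine"
assert any(issubclass(w.category, UserWarning) for w in caught)
```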

View File

@@ -245,7 +245,11 @@ def search_hf_models_fuzzy(query: str, limit: int = 10) -> list[str]:
         # HF Hub's search is already fuzzy! It handles typos and partial matches
         models = list_models(
-            search=query, filter="text-generation", sort="downloads", direction=-1, limit=limit
+            search=query,
+            filter="text-generation",
+            sort="downloads",
+            direction=-1,
+            limit=limit,
         )

         model_names = [model.id if hasattr(model, "id") else str(model) for model in models]
@@ -582,7 +586,11 @@ class HFChat(LLMInterface):
         # Tokenize input
         inputs = self.tokenizer(
-            formatted_prompt, return_tensors="pt", padding=True, truncation=True, max_length=2048
+            formatted_prompt,
+            return_tensors="pt",
+            padding=True,
+            truncation=True,
+            max_length=2048,
         )

         # Move inputs to device

View File

@@ -16,25 +16,24 @@ uv pip install leann[diskann]
 ```python
 from leann import LeannBuilder, LeannSearcher, LeannChat
+from pathlib import Path
+
+INDEX_PATH = str(Path("./").resolve() / "demo.leann")

 # Build an index
 builder = LeannBuilder(backend_name="hnsw")
 builder.add_text("LEANN saves 97% storage compared to traditional vector databases.")
-builder.build_index("my_index.leann")
+builder.add_text("Tung Tung Tung Sahur called—they need their bananacrocodile hybrid back")
+builder.build_index(INDEX_PATH)

 # Search
-searcher = LeannSearcher("my_index.leann")
-results = searcher.search("storage savings", top_k=3)
+searcher = LeannSearcher(INDEX_PATH)
+results = searcher.search("fantastical AI-generated creatures", top_k=1)

 # Chat with your data
-chat = LeannChat("my_index.leann", llm_config={"type": "ollama", "model": "llama3.2:1b"})
-response = chat.ask("How much storage does LEANN save?")
+chat = LeannChat(INDEX_PATH, llm_config={"type": "hf", "model": "Qwen/Qwen3-0.6B"})
+response = chat.ask("How much storage does LEANN save?", top_k=1)
 ```

-## Documentation
-For full documentation, visit [https://leann.readthedocs.io](https://leann.readthedocs.io)
-
 ## License
 MIT License

View File

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "leann"
-version = "0.1.14"
+version = "0.1.15"
 description = "LEANN - The smallest vector index in the world. RAG Everything with LEANN!"
 readme = "README.md"
 requires-python = ">=3.9"
@@ -36,7 +36,5 @@ diskann = [
 ]

 [project.urls]
-Homepage = "https://github.com/yourusername/leann"
-Documentation = "https://leann.readthedocs.io"
-Repository = "https://github.com/yourusername/leann"
-Issues = "https://github.com/yourusername/leann/issues"
+Repository = "https://github.com/yichuan-w/LEANN"
+Issues = "https://github.com/yichuan-w/LEANN/issues"

View File

@@ -1,6 +1,6 @@
 import json
 import sqlite3
-import xml.etree.ElementTree as ET
+import xml.etree.ElementTree as ElementTree
 from pathlib import Path
 from typing import Annotated
@@ -26,7 +26,7 @@ def get_safe_path(s: str) -> str:
 def process_history(history: str):
     if history.startswith("<?xml") or history.startswith("<msg>"):
         try:
-            root = ET.fromstring(history)
+            root = ElementTree.fromstring(history)
             title = root.find(".//title").text if root.find(".//title") is not None else None
             quoted = (
                 root.find(".//refermsg/content").text
@@ -52,7 +52,8 @@ def get_message(history: dict | str):
 def export_chathistory(user_id: str):
     res = requests.get(
-        "http://localhost:48065/wechat/chatlog", params={"userId": user_id, "count": 100000}
+        "http://localhost:48065/wechat/chatlog",
+        params={"userId": user_id, "count": 100000},
     ).json()
     for i in range(len(res["chatLogs"])):
         res["chatLogs"][i]["content"] = process_history(res["chatLogs"][i]["content"])
@@ -116,7 +117,8 @@ def export_sqlite(
     all_users = requests.get("http://localhost:48065/wechat/allcontacts").json()
     for user in tqdm(all_users):
         cursor.execute(
-            "INSERT OR IGNORE INTO users (id, name) VALUES (?, ?)", (user["arg"], user["title"])
+            "INSERT OR IGNORE INTO users (id, name) VALUES (?, ?)",
+            (user["arg"], user["title"]),
         )
         usr_chatlog = export_chathistory(user["arg"])
         for msg in usr_chatlog:
for msg in usr_chatlog: for msg in usr_chatlog:

View File

@@ -53,6 +53,7 @@ dev = [
     "ruff>=0.1.0",
     "matplotlib",
     "huggingface-hub>=0.20.0",
+    "pre-commit>=3.5.0",
 ]

 diskann = [

View File

@@ -19,16 +19,16 @@ uv pip install build twine delocate auditwheel scikit-build-core cmake pybind11
 build_package() {
     local package_dir=$1
     local package_name=$(basename $package_dir)

     echo "Building $package_name..."
     cd $package_dir

     # Clean previous builds
     rm -rf dist/ build/ _skbuild/

     # Build directly with pip wheel (avoids sdist issues)
     pip wheel . --no-deps -w dist

     # Repair wheel for binary packages
     if [[ "$package_name" != "leann-core" ]] && [[ "$package_name" != "leann" ]]; then
         if [[ "$OSTYPE" == "darwin"* ]]; then
@@ -57,7 +57,7 @@ build_package() {
            fi
        fi
    fi

     echo "Built wheels in $package_dir/dist/"
     ls -la dist/
     cd - > /dev/null
@@ -84,4 +84,4 @@ else
 fi

 echo -e "\nBuild complete! Test with:"
 echo "uv pip install packages/*/dist/*.whl"

View File

@@ -28,4 +28,4 @@ else
 fi

 echo "✅ Version updated to $NEW_VERSION"
 echo "✅ Dependencies updated to use leann-core==$NEW_VERSION"

View File

@@ -15,4 +15,4 @@ VERSION=$1
 git add . && git commit -m "chore: bump version to $VERSION" && git push

 # Create release (triggers CI)
 gh release create v$VERSION --generate-notes

View File

@@ -27,4 +27,4 @@ else
     else
         echo "Cancelled"
     fi
 fi

View File

@@ -58,7 +58,8 @@ class GraphWrapper:
                 self.graph = torch.cuda.CUDAGraph()
                 with torch.cuda.graph(self.graph):
                     self.static_output = self.model(
-                        input_ids=self.static_input, attention_mask=self.static_attention_mask
+                        input_ids=self.static_input,
+                        attention_mask=self.static_attention_mask,
                     )
                 self.use_cuda_graph = True
             else:
@@ -82,7 +83,10 @@ class GraphWrapper:
     def _warmup(self, num_warmup: int = 3):
         with torch.no_grad():
             for _ in range(num_warmup):
-                self.model(input_ids=self.static_input, attention_mask=self.static_attention_mask)
+                self.model(
+                    input_ids=self.static_input,
+                    attention_mask=self.static_attention_mask,
+                )

     def __call__(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
         if self.use_cuda_graph:
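For readers unfamiliar with CUDA graphs: capture records the kernel sequence against fixed buffers, and replay requires copying fresh inputs into those same buffers. A self-contained sketch of the capture/replay contract (toy model; not the benchmark's actual wrapper):

```python
import torch

model = torch.nn.Linear(64, 64).cuda().eval()
static_input = torch.zeros(8, 64, device="cuda")

# Warm up on a side stream so lazy initialization is not captured.
s = torch.cuda.Stream()
s.wait_stream(torch.cuda.current_stream())
with torch.cuda.stream(s), torch.no_grad():
    for _ in range(3):
        model(static_input)
torch.cuda.current_stream().wait_stream(s)

graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(graph):
    static_output = model(static_input)

def run(x: torch.Tensor) -> torch.Tensor:
    static_input.copy_(x)  # replay reuses the captured buffers
    graph.replay()
    return static_output.clone()
```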
@@ -261,7 +265,10 @@ class Benchmark:
                     # print size
                     print(f"in_features: {in_features}, out_features: {out_features}")
                     new_module = bnb.nn.Linear8bitLt(
-                        in_features, out_features, bias=bias, has_fp16_weights=False
+                        in_features,
+                        out_features,
+                        bias=bias,
+                        has_fp16_weights=False,
                     )

                     # Copy weights and bias
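The `Linear8bitLt` swap here follows the usual bitsandbytes recipe; as a standalone sketch (assumes bitsandbytes is installed and a CUDA device is available; `has_fp16_weights=False` selects the int8 inference path):

```python
import torch
import bitsandbytes as bnb

def to_8bit(linear: torch.nn.Linear) -> bnb.nn.Linear8bitLt:
    new_module = bnb.nn.Linear8bitLt(
        linear.in_features,
        linear.out_features,
        bias=linear.bias is not None,
        has_fp16_weights=False,
    )
    # Copy weights and bias from the fp32/fp16 module.
    new_module.load_state_dict(linear.state_dict())
    return new_module.cuda()  # weights are quantized when moved to the GPU
```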
@@ -350,8 +357,6 @@ class Benchmark:
         # Try xformers if available (only on CUDA)
         if torch.cuda.is_available():
             try:
-                from xformers.ops import memory_efficient_attention  # noqa: F401
-
                 if hasattr(model, "enable_xformers_memory_efficient_attention"):
                     model.enable_xformers_memory_efficient_attention()
                     print("- Enabled xformers memory efficient attention")
@@ -427,7 +432,11 @@ class Benchmark:
             else "cpu"
         )
         return torch.randint(
-            0, 1000, (batch_size, self.config.seq_length), device=device, dtype=torch.long
+            0,
+            1000,
+            (batch_size, self.config.seq_length),
+            device=device,
+            dtype=torch.long,
        )

     def _run_inference(

View File

@@ -7,7 +7,7 @@ This directory contains comprehensive sanity checks for the Leann system, ensuri
 ### `test_distance_functions.py`
 Tests all supported distance functions across the DiskANN backend:
 - **MIPS** (Maximum Inner Product Search)
 - **L2** (Euclidean Distance)
 - **Cosine** (Cosine Similarity)

 ```bash
@@ -27,7 +27,7 @@ uv run python tests/sanity_checks/test_l2_verification.py
 ### `test_sanity_check.py`
 Comprehensive end-to-end verification including:
 - Distance function testing
 - Embedding model compatibility
 - Search result correctness validation
 - Backend integration testing
@@ -64,7 +64,7 @@ When all tests pass, you should see:
 ```
 📊 Test results summary:
    mips   : ✅ passed
    l2     : ✅ passed
    cosine : ✅ passed

 🎉 All tests finished!
@@ -98,7 +98,7 @@ pkill -f "embedding_server"
 ### Typical Timing (3 documents, consumer hardware):
 - **Index Building**: 2-5 seconds per distance function
 - **Search Query**: 50-200ms
 - **Recompute Mode**: 5-15 seconds (higher accuracy)

 ### Memory Usage:
@@ -117,4 +117,4 @@ These tests are designed to be run in automated environments:
 uv run python tests/sanity_checks/test_l2_verification.py
 ```
 The tests are deterministic and should produce consistent results across different platforms.

View File

@@ -115,7 +115,13 @@ def main():
     # --- Plotting ---
     print("\n--- Generating Plot ---")
     plt.figure(figsize=(10, 6))
-    plt.plot(BATCH_SIZES, results_torch, marker="o", linestyle="-", label=f"PyTorch ({device})")
+    plt.plot(
+        BATCH_SIZES,
+        results_torch,
+        marker="o",
+        linestyle="-",
+        label=f"PyTorch ({device})",
+    )
     plt.plot(BATCH_SIZES, results_mlx, marker="s", linestyle="-", label="MLX")
     plt.title(f"Embedding Performance: MLX vs PyTorch\nModel: {MODEL_NAME_TORCH}")

View File

@@ -170,7 +170,11 @@ class Benchmark:
     def _create_random_batch(self, batch_size: int) -> torch.Tensor:
         return torch.randint(
-            0, 1000, (batch_size, self.config.seq_length), device=self.device, dtype=torch.long
+            0,
+            1000,
+            (batch_size, self.config.seq_length),
+            device=self.device,
+            dtype=torch.long,
         )

     def _run_inference(self, input_ids: torch.Tensor) -> float:
@@ -256,7 +260,11 @@ def run_mlx_benchmark():
     """Run MLX-specific benchmark"""
     if not MLX_AVAILABLE:
         print("MLX not available, skipping MLX benchmark")
-        return {"max_throughput": 0.0, "avg_throughput": 0.0, "error": "MLX not available"}
+        return {
+            "max_throughput": 0.0,
+            "avg_throughput": 0.0,
+            "error": "MLX not available",
+        }

     config = BenchmarkConfig(model_path="mlx-community/all-MiniLM-L6-v2-4bit", use_mlx=True)
@@ -265,7 +273,11 @@ def run_mlx_benchmark():
     results = benchmark.run()

     if not results:
-        return {"max_throughput": 0.0, "avg_throughput": 0.0, "error": "No valid results"}
+        return {
+            "max_throughput": 0.0,
+            "avg_throughput": 0.0,
+            "error": "No valid results",
+        }

     max_throughput = max(results[batch_size]["throughput"] for batch_size in results)
     avg_throughput = np.mean([results[batch_size]["throughput"] for batch_size in results])

uv.lock generated
View File

@@ -1847,7 +1847,7 @@ wheels = [

 [[package]]
 name = "leann-backend-diskann"
-version = "0.1.13"
+version = "0.1.14"
 source = { editable = "packages/leann-backend-diskann" }
 dependencies = [
     { name = "leann-core" },
@@ -1858,14 +1858,14 @@ dependencies = [

 [package.metadata]
 requires-dist = [
-    { name = "leann-core", specifier = "==0.1.13" },
+    { name = "leann-core", specifier = "==0.1.14" },
     { name = "numpy" },
     { name = "protobuf", specifier = ">=3.19.0" },
 ]

 [[package]]
 name = "leann-backend-hnsw"
-version = "0.1.13"
+version = "0.1.14"
 source = { editable = "packages/leann-backend-hnsw" }
 dependencies = [
     { name = "leann-core" },
@@ -1877,7 +1877,7 @@ dependencies = [

 [package.metadata]
 requires-dist = [
-    { name = "leann-core", specifier = "==0.1.13" },
+    { name = "leann-core", specifier = "==0.1.14" },
     { name = "msgpack", specifier = ">=1.0.0" },
     { name = "numpy" },
     { name = "pyzmq", specifier = ">=23.0.0" },
@@ -1885,7 +1885,7 @@ requires-dist = [

 [[package]]
 name = "leann-core"
-version = "0.1.13"
+version = "0.1.14"
 source = { editable = "packages/leann-core" }
 dependencies = [
     { name = "accelerate" },