Compare commits

..

24 Commits

Author SHA1 Message Date
Andy Lee
3de0a94efc docs: align cli args and README 2025-08-15 12:03:11 -07:00
Andy Lee
58c12e3eed docs: leann help 2025-08-15 11:51:38 -07:00
Andy Lee
92739c7899 docs: boolean flags 2025-08-15 11:50:46 -07:00
Andy Lee
6709afe38b fix: hang on warnings 2025-08-15 01:01:51 -07:00
Andy Lee
ded0701504 core: auto-cleanup for LeannSearcher/LeannChat (__enter__/__exit__/__del__); ensure server terminate/kill robustness; benchmarks: use searcher.cleanup(); docs: suggest uv run 2025-08-14 14:29:57 -07:00
Andy Lee
e3518a31ed docs: diskann recompute 2025-08-14 14:25:50 -07:00
Andy Lee
d5f6ca61ed benchmarks: unify HNSW & DiskANN into one clean script; isolate groups, fixed ports, warm-up, param complexity 2025-08-14 13:47:53 -07:00
Andy Lee
b13b52e78c benchmarks: fix and extend HNSW+DiskANN recompute vs no-recompute; docs: add fresh numbers and DiskANN notes 2025-08-14 12:18:07 -07:00
Andy Lee
79ca32e87b docs: a real example on recompute 2025-08-14 11:56:10 -07:00
Andy Lee
16f4572fe7 cli: use argparse.BooleanOptionalAction for paired flags (--recompute/--compact) across build/search/ask 2025-08-14 11:22:47 -07:00
Andy Lee
2bd557d1cf hnsw: move pruned/no-recompute assertion into backend; api: drop global assertion; docs: will adjust after benchmarking 2025-08-14 11:08:34 -07:00
Andy Lee
3e162fb177 chore: remove 2025-08-14 01:24:21 -07:00
Andy Lee
b988f0ab5b cli: unify flags to --recompute/--no-recompute for build/search/ask; docs: update references 2025-08-14 01:20:14 -07:00
Andy Lee
43cb500ed8 merge: finalize compat resolution (delegate to PassageManager; keep relative hints in meta); resolve conflicts 2025-08-14 01:09:39 -07:00
Andy Lee
0361725323 reader: non-destructive portability (relative hints + fallback); fix comments; sky: refine yaml 2025-08-14 01:05:01 -07:00
Andy Lee
3f81861cba feat: auto compact for hnsw when recompute 2025-08-14 00:09:57 -07:00
Andy Lee
fa2a775867 docs+sky: simplify SkyPilot flow (auto-build on launch, rsync copy-back); clarify HNSW auto non-compact when no-recompute 2025-08-13 14:30:14 -07:00
Andy Lee
737dfc960c hnsw: auto-disable compact when --no-recompute is used; docs: expand SkyPilot with -e overrides and copy-back example 2025-08-13 14:25:16 -07:00
Andy Lee
c994635af6 sky: expand leann-build.yaml with configurable params and flags (backend, recompute, compact, embedding options) 2025-08-13 14:18:48 -07:00
Andy Lee
23b80647c5 docs: dedupe recomputation guidance; keep single Low-resource setups section 2025-08-13 14:10:10 -07:00
Andy Lee
50121972ee cli: add --no-recompute and --no-recompute-embeddings flags; docs: clarify HNSW requires --no-compact when disabling recompute 2025-08-13 14:09:05 -07:00
Andy Lee
07e5f10204 docs: consolidate low-resource guidance into config guide; README points to it 2025-08-13 14:08:23 -07:00
Andy Lee
58711bff7e docs: add low-resource note in README; point to config guide; suggest OpenAI embeddings, SkyPilot remote build, and --no-recompute 2025-08-13 14:06:22 -07:00
Andy Lee
a69464eb16 docs: add SkyPilot template and instructions for running embeddings/index build on cloud GPU 2025-08-13 14:01:32 -07:00
25 changed files with 3924 additions and 4689 deletions

1
.gitattributes vendored Normal file
View File

@@ -0,0 +1 @@
paper_plot/data/big_graph_degree_data.npz filter=lfs diff=lfs merge=lfs -text

View File

@@ -87,7 +87,7 @@ jobs:
runs-on: ${{ matrix.os }} runs-on: ${{ matrix.os }}
steps: steps:
- uses: actions/checkout@v5 - uses: actions/checkout@v4
with: with:
ref: ${{ inputs.ref }} ref: ${{ inputs.ref }}
submodules: recursive submodules: recursive
@@ -98,23 +98,21 @@ jobs:
python-version: ${{ matrix.python }} python-version: ${{ matrix.python }}
- name: Install uv - name: Install uv
uses: astral-sh/setup-uv@v6 uses: astral-sh/setup-uv@v4
- name: Install system dependencies (Ubuntu) - name: Install system dependencies (Ubuntu)
if: runner.os == 'Linux' if: runner.os == 'Linux'
run: | run: |
sudo apt-get update sudo apt-get update
sudo apt-get install -y libomp-dev libboost-all-dev protobuf-compiler libzmq3-dev \ sudo apt-get install -y libomp-dev libboost-all-dev protobuf-compiler libzmq3-dev \
pkg-config libabsl-dev libaio-dev libprotobuf-dev \ pkg-config libopenblas-dev patchelf libabsl-dev libaio-dev libprotobuf-dev
patchelf
# Install Intel MKL for DiskANN # Install Intel MKL for DiskANN
wget -q https://registrationcenter-download.intel.com/akdlm/IRC_NAS/79153e0f-74d7-45af-b8c2-258941adf58a/intel-onemkl-2025.0.0.940.sh wget -q https://registrationcenter-download.intel.com/akdlm/IRC_NAS/79153e0f-74d7-45af-b8c2-258941adf58a/intel-onemkl-2025.0.0.940.sh
sudo sh intel-onemkl-2025.0.0.940.sh -a --components intel.oneapi.lin.mkl.devel --action install --eula accept -s sudo sh intel-onemkl-2025.0.0.940.sh -a --components intel.oneapi.lin.mkl.devel --action install --eula accept -s
source /opt/intel/oneapi/setvars.sh source /opt/intel/oneapi/setvars.sh
echo "MKLROOT=/opt/intel/oneapi/mkl/latest" >> $GITHUB_ENV echo "MKLROOT=/opt/intel/oneapi/mkl/latest" >> $GITHUB_ENV
echo "LD_LIBRARY_PATH=/opt/intel/oneapi/compiler/latest/linux/compiler/lib/intel64_lin" >> $GITHUB_ENV echo "LD_LIBRARY_PATH=/opt/intel/oneapi/mkl/latest/lib/intel64:$LD_LIBRARY_PATH" >> $GITHUB_ENV
echo "LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/intel/oneapi/mkl/latest/lib/intel64" >> $GITHUB_ENV
- name: Install system dependencies (macOS) - name: Install system dependencies (macOS)
if: runner.os == 'macOS' if: runner.os == 'macOS'
@@ -306,53 +304,3 @@ jobs:
with: with:
name: packages-${{ matrix.os }}-py${{ matrix.python }} name: packages-${{ matrix.os }}-py${{ matrix.python }}
path: packages/*/dist/ path: packages/*/dist/
arch-smoke:
name: Arch Linux smoke test (install & import)
needs: build
runs-on: ubuntu-latest
container:
image: archlinux:latest
steps:
- name: Prepare system
run: |
pacman -Syu --noconfirm
pacman -S --noconfirm python python-pip gcc git zlib openssl
- name: Download ALL wheel artifacts from this run
uses: actions/download-artifact@v5
with:
# Don't specify name, download all artifacts
path: ./wheels
- name: Install uv
uses: astral-sh/setup-uv@v6
- name: Create virtual environment and install wheels
run: |
uv venv
source .venv/bin/activate || source .venv/Scripts/activate
uv pip install --find-links wheels leann-core
uv pip install --find-links wheels leann-backend-hnsw
uv pip install --find-links wheels leann-backend-diskann
uv pip install --find-links wheels leann
- name: Import & tiny runtime check
env:
OMP_NUM_THREADS: 1
MKL_NUM_THREADS: 1
run: |
source .venv/bin/activate || source .venv/Scripts/activate
python - <<'PY'
import leann
import leann_backend_hnsw as h
import leann_backend_diskann as d
from leann import LeannBuilder, LeannSearcher
b = LeannBuilder(backend_name="hnsw")
b.add_text("hello arch")
b.build_index("arch_demo.leann")
s = LeannSearcher("arch_demo.leann")
print("search:", s.search("hello", top_k=1))
PY

View File

@@ -14,6 +14,6 @@ jobs:
- uses: actions/checkout@v4 - uses: actions/checkout@v4
- uses: lycheeverse/lychee-action@v2 - uses: lycheeverse/lychee-action@v2
with: with:
args: --no-progress --insecure --user-agent 'curl/7.68.0' README.md docs/ apps/ examples/ benchmarks/ args: --no-progress --insecure README.md docs/ apps/ examples/ benchmarks/
env: env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

1
.gitignore vendored
View File

@@ -18,7 +18,6 @@ demo/experiment_results/**/*.json
*.eml *.eml
*.emlx *.emlx
*.json *.json
!.vscode/*.json
*.sh *.sh
*.txt *.txt
!CMakeLists.txt !CMakeLists.txt

View File

@@ -1,5 +0,0 @@
{
"recommendations": [
"charliermarsh.ruff",
]
}

22
.vscode/settings.json vendored
View File

@@ -1,22 +0,0 @@
{
"python.defaultInterpreterPath": ".venv/bin/python",
"python.terminal.activateEnvironment": true,
"[python]": {
"editor.defaultFormatter": "charliermarsh.ruff",
"editor.formatOnSave": true,
"editor.codeActionsOnSave": {
"source.organizeImports": "explicit",
"source.fixAll": "explicit"
},
"editor.insertSpaces": true,
"editor.tabSize": 4
},
"ruff.enable": true,
"files.watcherExclude": {
"**/.venv/**": true,
"**/__pycache__/**": true,
"**/*.egg-info/**": true,
"**/build/**": true,
"**/dist/**": true
}
}

104
README.md
View File

@@ -5,7 +5,7 @@
<p align="center"> <p align="center">
<img src="https://img.shields.io/badge/Python-3.9%20%7C%203.10%20%7C%203.11%20%7C%203.12%20%7C%203.13-blue.svg" alt="Python Versions"> <img src="https://img.shields.io/badge/Python-3.9%20%7C%203.10%20%7C%203.11%20%7C%203.12%20%7C%203.13-blue.svg" alt="Python Versions">
<img src="https://github.com/yichuan-w/LEANN/actions/workflows/build-and-publish.yml/badge.svg" alt="CI Status"> <img src="https://github.com/yichuan-w/LEANN/actions/workflows/build-and-publish.yml/badge.svg" alt="CI Status">
<img src="https://img.shields.io/badge/Platform-Ubuntu%20%26%20Arch%20%26%20WSL%20%7C%20macOS%20(ARM64%2FIntel)-lightgrey" alt="Platform"> <img src="https://img.shields.io/badge/Platform-Ubuntu%20%7C%20macOS%20(ARM64%2FIntel)-lightgrey" alt="Platform">
<img src="https://img.shields.io/badge/License-MIT-green.svg" alt="MIT License"> <img src="https://img.shields.io/badge/License-MIT-green.svg" alt="MIT License">
<img src="https://img.shields.io/badge/MCP-Native%20Integration-blue" alt="MCP Integration"> <img src="https://img.shields.io/badge/MCP-Native%20Integration-blue" alt="MCP Integration">
</p> </p>
@@ -31,7 +31,7 @@ LEANN achieves this through *graph-based selective recomputation* with *high-deg
<img src="assets/effects.png" alt="LEANN vs Traditional Vector DB Storage Comparison" width="70%"> <img src="assets/effects.png" alt="LEANN vs Traditional Vector DB Storage Comparison" width="70%">
</p> </p>
> **The numbers speak for themselves:** Index 60 million text chunks in just 6GB instead of 201GB. From emails to browser history, everything fits on your laptop. [See detailed benchmarks for different applications below ↓](#-storage-comparison) > **The numbers speak for themselves:** Index 60 million text chunks in just 6GB instead of 201GB. From emails to browser history, everything fits on your laptop. [See detailed benchmarks for different applications below ↓](#storage-comparison)
🔒 **Privacy:** Your data never leaves your laptop. No OpenAI, no cloud, no "terms of service". 🔒 **Privacy:** Your data never leaves your laptop. No OpenAI, no cloud, no "terms of service".
@@ -70,8 +70,8 @@ uv venv
source .venv/bin/activate source .venv/bin/activate
uv pip install leann uv pip install leann
``` ```
<!--
> Low-resource? See “Low-resource setups” in the [Configuration Guide](docs/configuration-guide.md#low-resource-setups). --> > Low-resource? See “Low-resource setups” in the [Configuration Guide](docs/configuration-guide.md#low-resource-setups).
<details> <details>
<summary> <summary>
@@ -87,60 +87,15 @@ git submodule update --init --recursive
``` ```
**macOS:** **macOS:**
Note: DiskANN requires MacOS 13.3 or later.
```bash ```bash
brew install libomp boost protobuf zeromq pkgconf brew install llvm libomp boost protobuf zeromq pkgconf
uv sync --extra diskann CC=$(brew --prefix llvm)/bin/clang CXX=$(brew --prefix llvm)/bin/clang++ uv sync
``` ```
**Linux (Ubuntu/Debian):** **Linux:**
Note: On Ubuntu 20.04, you may need to build a newer Abseil and pin Protobuf (e.g., v3.20.x) for building DiskANN. See [Issue #30](https://github.com/yichuan-w/LEANN/issues/30) for a step-by-step note.
You can manually install [Intel oneAPI MKL](https://www.intel.com/content/www/us/en/developer/tools/oneapi/onemkl.html) instead of `libmkl-full-dev` for DiskANN. You can also use `libopenblas-dev` for building HNSW only, by removing `--extra diskann` in the command below.
```bash ```bash
sudo apt-get update && sudo apt-get install -y \ sudo apt-get install libomp-dev libboost-all-dev protobuf-compiler libabsl-dev libmkl-full-dev libaio-dev libzmq3-dev
libomp-dev libboost-all-dev protobuf-compiler libzmq3-dev \ uv sync
pkg-config libabsl-dev libaio-dev libprotobuf-dev \
libmkl-full-dev
uv sync --extra diskann
```
**Linux (Arch Linux):**
```bash
sudo pacman -Syu && sudo pacman -S --needed base-devel cmake pkgconf git gcc \
boost boost-libs protobuf abseil-cpp libaio zeromq
# For MKL in DiskANN
sudo pacman -S --needed base-devel git
git clone https://aur.archlinux.org/paru-bin.git
cd paru-bin && makepkg -si
paru -S intel-oneapi-mkl intel-oneapi-compiler
source /opt/intel/oneapi/setvars.sh
uv sync --extra diskann
```
**Linux (RHEL / CentOS Stream / Oracle / Rocky / AlmaLinux):**
See [Issue #50](https://github.com/yichuan-w/LEANN/issues/50) for more details.
```bash
sudo dnf groupinstall -y "Development Tools"
sudo dnf install -y libomp-devel boost-devel protobuf-compiler protobuf-devel \
abseil-cpp-devel libaio-devel zeromq-devel pkgconf-pkg-config
# For MKL in DiskANN
sudo dnf install -y intel-oneapi-mkl intel-oneapi-mkl-devel \
intel-oneapi-openmp || sudo dnf install -y intel-oneapi-compiler
source /opt/intel/oneapi/setvars.sh
uv sync --extra diskann
``` ```
</details> </details>
@@ -471,21 +426,21 @@ Once the index is built, you can ask questions like:
**The future of code assistance is here.** Transform your development workflow with LEANN's native MCP integration for Claude Code. Index your entire codebase and get intelligent code assistance directly in your IDE. **The future of code assistance is here.** Transform your development workflow with LEANN's native MCP integration for Claude Code. Index your entire codebase and get intelligent code assistance directly in your IDE.
**Key features:** **Key features:**
- 🔍 **Semantic code search** across your entire project, fully local index and lightweight - 🔍 **Semantic code search** across your entire project
- 📚 **Context-aware assistance** for debugging and development - 📚 **Context-aware assistance** for debugging and development
- 🚀 **Zero-config setup** with automatic language detection - 🚀 **Zero-config setup** with automatic language detection
```bash ```bash
# Install LEANN globally for MCP integration # Install LEANN globally for MCP integration
uv tool install leann-core --with leann uv tool install leann-core
claude mcp add --scope user leann-server -- leann_mcp
# Setup is automatic - just start using Claude Code! # Setup is automatic - just start using Claude Code!
``` ```
Try our fully agentic pipeline with auto query rewriting, semantic search planning, and more: Try our fully agentic pipeline with auto query rewriting, semantic search planning, and more:
![LEANN MCP Integration](assets/mcp_leann.png) ![LEANN MCP Integration](assets/mcp_leann.png)
**🔥 Ready to supercharge your coding?** [Complete Setup Guide →](packages/leann-mcp/README.md) **Ready to supercharge your coding?** [Complete Setup Guide →](packages/leann-mcp/README.md)
## 🖥️ Command Line Interface ## 🖥️ Command Line Interface
@@ -502,8 +457,7 @@ leann --help
**To make it globally available:** **To make it globally available:**
```bash ```bash
# Install the LEANN CLI globally using uv tool # Install the LEANN CLI globally using uv tool
uv tool install leann-core --with leann uv tool install leann-core
# Now you can use leann from anywhere without activating venv # Now you can use leann from anywhere without activating venv
leann --help leann --help
@@ -527,9 +481,6 @@ leann ask my-docs --interactive
# List all your indexes # List all your indexes
leann list leann list
# Remove an index
leann remove my-docs
``` ```
**Key CLI features:** **Key CLI features:**
@@ -542,7 +493,7 @@ leann remove my-docs
<details> <details>
<summary><strong>📋 Click to expand: Complete CLI Reference</strong></summary> <summary><strong>📋 Click to expand: Complete CLI Reference</strong></summary>
You can use `leann --help`, or `leann build --help`, `leann search --help`, `leann ask --help`, `leann list --help`, `leann remove --help` to get the complete CLI reference. You can use `leann --help`, or `leann build --help`, `leann search --help`, `leann ask --help` to get the complete CLI reference.
**Build Command:** **Build Command:**
```bash ```bash
@@ -580,31 +531,6 @@ Options:
--top-k N Retrieval count (default: 20) --top-k N Retrieval count (default: 20)
``` ```
**List Command:**
```bash
leann list
# Lists all indexes across all projects with status indicators:
# ✅ - Index is complete and ready to use
# ❌ - Index is incomplete or corrupted
# 📁 - CLI-created index (in .leann/indexes/)
# 📄 - App-created index (*.leann.meta.json files)
```
**Remove Command:**
```bash
leann remove INDEX_NAME [OPTIONS]
Options:
--force, -f Force removal without confirmation
# Smart removal: automatically finds and safely removes indexes
# - Shows all matching indexes across projects
# - Requires confirmation for cross-project removal
# - Interactive selection when multiple matches found
# - Supports both CLI and app-created indexes
```
</details> </details>
## 🏗️ Architecture & How It Works ## 🏗️ Architecture & How It Works

View File

@@ -10,7 +10,6 @@ from typing import Any
import dotenv import dotenv
from leann.api import LeannBuilder, LeannChat from leann.api import LeannBuilder, LeannChat
from leann.registry import register_project_directory
from llama_index.core.node_parser import SentenceSplitter from llama_index.core.node_parser import SentenceSplitter
dotenv.load_dotenv() dotenv.load_dotenv()
@@ -215,11 +214,6 @@ class BaseRAGExample(ABC):
builder.build_index(index_path) builder.build_index(index_path)
print(f"Index saved to: {index_path}") print(f"Index saved to: {index_path}")
# Register project directory so leann list can discover this index
# The index is saved as args.index_dir/index_name.leann
# We want to register the current working directory where the app is run
register_project_directory(Path.cwd())
return index_path return index_path
async def run_interactive_chat(self, args, index_path: str): async def run_interactive_chat(self, args, index_path: str):

View File

@@ -183,9 +183,6 @@ class Benchmark:
start_time = time.time() start_time = time.time()
with torch.no_grad(): with torch.no_grad():
self.model(input_ids=input_ids, attention_mask=attention_mask) self.model(input_ids=input_ids, attention_mask=attention_mask)
# mps sync
if torch.backends.mps.is_available():
torch.mps.synchronize()
end_time = time.time() end_time = time.time()
return end_time - start_time return end_time - start_time

View File

@@ -4,8 +4,8 @@ build-backend = "scikit_build_core.build"
[project] [project]
name = "leann-backend-diskann" name = "leann-backend-diskann"
version = "0.3.1" version = "0.2.9"
dependencies = ["leann-core==0.3.1", "numpy", "protobuf>=3.19.0"] dependencies = ["leann-core==0.2.9", "numpy", "protobuf>=3.19.0"]
[tool.scikit-build] [tool.scikit-build]
# Key: simplified CMake path # Key: simplified CMake path

View File

@@ -6,10 +6,10 @@ build-backend = "scikit_build_core.build"
[project] [project]
name = "leann-backend-hnsw" name = "leann-backend-hnsw"
version = "0.3.1" version = "0.2.9"
description = "Custom-built HNSW (Faiss) backend for the Leann toolkit." description = "Custom-built HNSW (Faiss) backend for the Leann toolkit."
dependencies = [ dependencies = [
"leann-core==0.3.1", "leann-core==0.2.9",
"numpy", "numpy",
"pyzmq>=23.0.0", "pyzmq>=23.0.0",
"msgpack>=1.0.0", "msgpack>=1.0.0",

View File

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
[project] [project]
name = "leann-core" name = "leann-core"
version = "0.3.1" version = "0.2.9"
description = "Core API and plugin system for LEANN" description = "Core API and plugin system for LEANN"
readme = "README.md" readme = "README.md"
requires-python = ">=3.9" requires-python = ">=3.9"

View File

@@ -46,7 +46,6 @@ def compute_embeddings(
- "sentence-transformers": Use sentence-transformers library (default) - "sentence-transformers": Use sentence-transformers library (default)
- "mlx": Use MLX backend for Apple Silicon - "mlx": Use MLX backend for Apple Silicon
- "openai": Use OpenAI embedding API - "openai": Use OpenAI embedding API
- "gemini": Use Google Gemini embedding API
use_server: Whether to use embedding server (True for search, False for build) use_server: Whether to use embedding server (True for search, False for build)
Returns: Returns:
@@ -307,23 +306,6 @@ class LeannBuilder:
def build_index(self, index_path: str): def build_index(self, index_path: str):
if not self.chunks: if not self.chunks:
raise ValueError("No chunks added.") raise ValueError("No chunks added.")
# Filter out invalid/empty text chunks early to keep passage and embedding counts aligned
valid_chunks: list[dict[str, Any]] = []
skipped = 0
for chunk in self.chunks:
text = chunk.get("text", "")
if isinstance(text, str) and text.strip():
valid_chunks.append(chunk)
else:
skipped += 1
if skipped > 0:
print(
f"Warning: Skipping {skipped} empty/invalid text chunk(s). Processing {len(valid_chunks)} valid chunks"
)
self.chunks = valid_chunks
if not self.chunks:
raise ValueError("All provided chunks are empty or invalid. Nothing to index.")
if self.dimensions is None: if self.dimensions is None:
self.dimensions = len( self.dimensions = len(
compute_embeddings( compute_embeddings(
@@ -614,7 +596,7 @@ class LeannSearcher:
zmq_port=zmq_port, zmq_port=zmq_port,
) )
# logger.info(f" Generated embedding shape: {query_embedding.shape}") # logger.info(f" Generated embedding shape: {query_embedding.shape}")
# time.time() - start_time time.time() - start_time
# logger.info(f" Embedding time: {embedding_time} seconds") # logger.info(f" Embedding time: {embedding_time} seconds")
start_time = time.time() start_time = time.time()
@@ -680,9 +662,8 @@ class LeannSearcher:
This method should be called after you're done using the searcher, This method should be called after you're done using the searcher,
especially in test environments or batch processing scenarios. especially in test environments or batch processing scenarios.
""" """
backend = getattr(self.backend_impl, "embedding_server_manager", None) if hasattr(self.backend_impl, "embedding_server_manager"):
if backend is not None: self.backend_impl.embedding_server_manager.stop_server()
backend.stop_server()
# Enable automatic cleanup patterns # Enable automatic cleanup patterns
def __enter__(self): def __enter__(self):

View File

@@ -680,60 +680,6 @@ class HFChat(LLMInterface):
return response.strip() return response.strip()
class GeminiChat(LLMInterface):
"""LLM interface for Google Gemini models."""
def __init__(self, model: str = "gemini-2.5-flash", api_key: Optional[str] = None):
self.model = model
self.api_key = api_key or os.getenv("GEMINI_API_KEY")
if not self.api_key:
raise ValueError(
"Gemini API key is required. Set GEMINI_API_KEY environment variable or pass api_key parameter."
)
logger.info(f"Initializing Gemini Chat with model='{model}'")
try:
import google.genai as genai
self.client = genai.Client(api_key=self.api_key)
except ImportError:
raise ImportError(
"The 'google-genai' library is required for Gemini models. Please install it with 'uv pip install google-genai'."
)
def ask(self, prompt: str, **kwargs) -> str:
logger.info(f"Sending request to Gemini with model {self.model}")
try:
from google.genai.types import GenerateContentConfig
generation_config = GenerateContentConfig(
temperature=kwargs.get("temperature", 0.7),
max_output_tokens=kwargs.get("max_tokens", 1000),
)
# Handle top_p parameter
if "top_p" in kwargs:
generation_config.top_p = kwargs["top_p"]
response = self.client.models.generate_content(
model=self.model,
contents=prompt,
config=generation_config,
)
# Handle potential None response text
response_text = response.text
if response_text is None:
logger.warning("Gemini returned None response text")
return ""
return response_text.strip()
except Exception as e:
logger.error(f"Error communicating with Gemini: {e}")
return f"Error: Could not get a response from Gemini. Details: {e}"
class OpenAIChat(LLMInterface): class OpenAIChat(LLMInterface):
"""LLM interface for OpenAI models.""" """LLM interface for OpenAI models."""
@@ -847,8 +793,6 @@ def get_llm(llm_config: Optional[dict[str, Any]] = None) -> LLMInterface:
return HFChat(model_name=model or "deepseek-ai/deepseek-llm-7b-chat") return HFChat(model_name=model or "deepseek-ai/deepseek-llm-7b-chat")
elif llm_type == "openai": elif llm_type == "openai":
return OpenAIChat(model=model or "gpt-4o", api_key=llm_config.get("api_key")) return OpenAIChat(model=model or "gpt-4o", api_key=llm_config.get("api_key"))
elif llm_type == "gemini":
return GeminiChat(model=model or "gemini-2.5-flash", api_key=llm_config.get("api_key"))
elif llm_type == "simulated": elif llm_type == "simulated":
return SimulatedChat() return SimulatedChat()
else: else:

View File

@@ -1,14 +1,13 @@
import argparse import argparse
import asyncio import asyncio
from pathlib import Path from pathlib import Path
from typing import Optional, Union from typing import Union
from llama_index.core import SimpleDirectoryReader from llama_index.core import SimpleDirectoryReader
from llama_index.core.node_parser import SentenceSplitter from llama_index.core.node_parser import SentenceSplitter
from tqdm import tqdm from tqdm import tqdm
from .api import LeannBuilder, LeannChat, LeannSearcher from .api import LeannBuilder, LeannChat, LeannSearcher
from .registry import register_project_directory
def extract_pdf_text_with_pymupdf(file_path: str) -> str: def extract_pdf_text_with_pymupdf(file_path: str) -> str:
@@ -85,7 +84,6 @@ Examples:
leann search my-docs "query" # Search in my-docs index leann search my-docs "query" # Search in my-docs index
leann ask my-docs "question" # Ask my-docs index leann ask my-docs "question" # Ask my-docs index
leann list # List all stored indexes leann list # List all stored indexes
leann remove my-docs # Remove an index (local first, then global)
""", """,
) )
@@ -150,36 +148,6 @@ Examples:
type=str, type=str,
help="Comma-separated list of file extensions to include (e.g., '.txt,.pdf,.pptx'). If not specified, uses default supported types.", help="Comma-separated list of file extensions to include (e.g., '.txt,.pdf,.pptx'). If not specified, uses default supported types.",
) )
build_parser.add_argument(
"--include-hidden",
action=argparse.BooleanOptionalAction,
default=False,
help="Include hidden files and directories (paths starting with '.') during indexing (default: false)",
)
build_parser.add_argument(
"--doc-chunk-size",
type=int,
default=256,
help="Document chunk size in tokens/characters (default: 256)",
)
build_parser.add_argument(
"--doc-chunk-overlap",
type=int,
default=128,
help="Document chunk overlap (default: 128)",
)
build_parser.add_argument(
"--code-chunk-size",
type=int,
default=512,
help="Code chunk size in tokens/lines (default: 512)",
)
build_parser.add_argument(
"--code-chunk-overlap",
type=int,
default=50,
help="Code chunk overlap (default: 50)",
)
# Search command # Search command
search_parser = subparsers.add_parser("search", help="Search documents") search_parser = subparsers.add_parser("search", help="Search documents")
@@ -206,11 +174,6 @@ Examples:
default="global", default="global",
help="Pruning strategy (default: global)", help="Pruning strategy (default: global)",
) )
search_parser.add_argument(
"--non-interactive",
action="store_true",
help="Non-interactive mode: automatically select index without prompting",
)
# Ask command # Ask command
ask_parser = subparsers.add_parser("ask", help="Ask questions") ask_parser = subparsers.add_parser("ask", help="Ask questions")
@@ -258,18 +221,35 @@ Examples:
# List command # List command
subparsers.add_parser("list", help="List all indexes") subparsers.add_parser("list", help="List all indexes")
# Remove command
remove_parser = subparsers.add_parser("remove", help="Remove an index")
remove_parser.add_argument("index_name", help="Index name to remove")
remove_parser.add_argument(
"--force", "-f", action="store_true", help="Force removal without confirmation"
)
return parser return parser
def register_project_dir(self): def register_project_dir(self):
"""Register current project directory in global registry""" """Register current project directory in global registry"""
register_project_directory() global_registry = Path.home() / ".leann" / "projects.json"
global_registry.parent.mkdir(exist_ok=True)
current_dir = str(Path.cwd())
# Load existing registry
projects = []
if global_registry.exists():
try:
import json
with open(global_registry) as f:
projects = json.load(f)
except Exception:
projects = []
# Add current directory if not already present
if current_dir not in projects:
projects.append(current_dir)
# Save registry
import json
with open(global_registry, "w") as f:
json.dump(projects, f, indent=2)
def _build_gitignore_parser(self, docs_dir: str): def _build_gitignore_parser(self, docs_dir: str):
"""Build gitignore parser using gitignore-parser library.""" """Build gitignore parser using gitignore-parser library."""
@@ -329,6 +309,8 @@ Examples:
return False return False
def list_indexes(self): def list_indexes(self):
print("Stored LEANN indexes:")
# Get all project directories with .leann # Get all project directories with .leann
global_registry = Path.home() / ".leann" / "projects.json" global_registry = Path.home() / ".leann" / "projects.json"
all_projects = [] all_projects = []
@@ -354,485 +336,58 @@ Examples:
if (current_path / ".leann" / "indexes").exists() and current_path not in valid_projects: if (current_path / ".leann" / "indexes").exists() and current_path not in valid_projects:
valid_projects.append(current_path) valid_projects.append(current_path)
# Separate current and other projects if not valid_projects:
other_projects = [] print(
"No indexes found. Use 'leann build <name> --docs <dir> [<dir2> ...]' to create one."
for project_path in valid_projects: )
if project_path != current_path: return
other_projects.append(project_path)
print("📚 LEANN Indexes")
print("=" * 50)
total_indexes = 0 total_indexes = 0
current_indexes_count = 0 current_dir = Path.cwd()
# Show current project first (most important) for project_path in valid_projects:
print("\n🏠 Current Project") indexes_dir = project_path / ".leann" / "indexes"
print(f" {current_path}") if not indexes_dir.exists():
print(" " + "" * 45)
current_indexes = self._discover_indexes_in_project(current_path)
if current_indexes:
for idx in current_indexes:
total_indexes += 1
current_indexes_count += 1
type_icon = "📁" if idx["type"] == "cli" else "📄"
print(f" {current_indexes_count}. {type_icon} {idx['name']} {idx['status']}")
if idx["size_mb"] > 0:
print(f" 📦 Size: {idx['size_mb']:.1f} MB")
else:
print(" 📭 No indexes in current project")
# Show other projects (reference information)
if other_projects:
print("\n\n🗂️ Other Projects")
print(" " + "" * 45)
for project_path in other_projects:
project_indexes = self._discover_indexes_in_project(project_path)
if not project_indexes:
continue
print(f"\n 📂 {project_path.name}")
print(f" {project_path}")
for idx in project_indexes:
total_indexes += 1
type_icon = "📁" if idx["type"] == "cli" else "📄"
print(f"{type_icon} {idx['name']} {idx['status']}")
if idx["size_mb"] > 0:
print(f" 📦 {idx['size_mb']:.1f} MB")
# Summary and usage info
print("\n" + "=" * 50)
if total_indexes == 0:
print("💡 Get started:")
print(" leann build my-docs --docs ./documents")
else:
# Count only projects that have at least one discoverable index
projects_count = sum(
1 for p in valid_projects if len(self._discover_indexes_in_project(p)) > 0
)
print(f"📊 Total: {total_indexes} indexes across {projects_count} projects")
if current_indexes_count > 0:
print("\n💫 Quick start (current project):")
# Get first index from current project for example
current_indexes_dir = current_path / ".leann" / "indexes"
if current_indexes_dir.exists():
current_index_dirs = [d for d in current_indexes_dir.iterdir() if d.is_dir()]
if current_index_dirs:
example_name = current_index_dirs[0].name
print(f' leann search {example_name} "your query"')
print(f" leann ask {example_name} --interactive")
else:
print("\n💡 Create your first index:")
print(" leann build my-docs --docs ./documents")
def _discover_indexes_in_project(self, project_path: Path):
"""Discover all indexes in a project directory (both CLI and apps formats)"""
indexes = []
# 1. CLI format: .leann/indexes/index_name/
cli_indexes_dir = project_path / ".leann" / "indexes"
if cli_indexes_dir.exists():
for index_dir in cli_indexes_dir.iterdir():
if index_dir.is_dir():
meta_file = index_dir / "documents.leann.meta.json"
status = "" if meta_file.exists() else ""
size_mb = 0
if meta_file.exists():
try:
size_mb = sum(
f.stat().st_size for f in index_dir.iterdir() if f.is_file()
) / (1024 * 1024)
except (OSError, PermissionError):
pass
indexes.append(
{
"name": index_dir.name,
"type": "cli",
"status": status,
"size_mb": size_mb,
"path": index_dir,
}
)
# 2. Apps format: *.leann.meta.json files anywhere in the project
cli_indexes_dir = project_path / ".leann" / "indexes"
for meta_file in project_path.rglob("*.leann.meta.json"):
if meta_file.is_file():
# Skip CLI-built indexes (which store meta under .leann/indexes/<name>/)
try:
if cli_indexes_dir.exists() and cli_indexes_dir in meta_file.parents:
continue
except Exception:
pass
# Use the parent directory name as the app index display name
display_name = meta_file.parent.name
# Extract file base used to store files
file_base = meta_file.name.replace(".leann.meta.json", "")
# Apps indexes are considered complete if the .leann.meta.json file exists
status = ""
# Calculate total size of all related files (use file base)
size_mb = 0
try:
index_dir = meta_file.parent
for related_file in index_dir.glob(f"{file_base}.leann*"):
size_mb += related_file.stat().st_size / (1024 * 1024)
except (OSError, PermissionError):
pass
indexes.append(
{
"name": display_name,
"type": "app",
"status": status,
"size_mb": size_mb,
"path": meta_file,
}
)
return indexes
def remove_index(self, index_name: str, force: bool = False):
    """Remove the index called ``index_name``, listing every match first.

    All known projects are searched so the user always sees every index that
    shares the name before anything is deleted.

    Args:
        index_name: Name of the index to remove.
        force: Forwarded unchanged to the removal helpers.

    Returns:
        ``False`` when no index matches; otherwise whatever the single-match
        or multi-match removal helper returns.
    """
    # Always do a comprehensive search for safety
    print(f"🔍 Searching for all indexes named '{index_name}'...")
    found = self._find_all_matching_indexes(index_name)

    if not found:
        print(f"❌ Index '{index_name}' not found in any project.")
        return False

    # More than one hit: let the multi-match helper handle the choice.
    if len(found) != 1:
        return self._remove_from_multiple_matches(found, index_name, force)

    return self._remove_single_match(found[0], index_name, force)
def _find_all_matching_indexes(self, index_name: str):
"""Find all indexes with the given name across all projects"""
matches = []
# Get all registered projects
global_registry = Path.home() / ".leann" / "projects.json"
all_projects = []
if global_registry.exists():
try:
import json
with open(global_registry) as f:
all_projects = json.load(f)
except Exception:
pass
# Always include current project
current_path = Path.cwd()
if str(current_path) not in all_projects:
all_projects.append(str(current_path))
# Search across all projects
for project_dir in all_projects:
project_path = Path(project_dir)
if not project_path.exists():
continue continue
# 1) CLI-format index under .leann/indexes/<name> index_dirs = [d for d in indexes_dir.iterdir() if d.is_dir()]
index_dir = project_path / ".leann" / "indexes" / index_name if not index_dirs:
if index_dir.exists(): continue
is_current = project_path == current_path
matches.append(
{
"project_path": project_path,
"index_dir": index_dir,
"is_current": is_current,
"kind": "cli",
}
)
# 2) App-format indexes # Show project header
# We support two ways of addressing apps: if project_path == current_dir:
# a) by the file base (e.g., `pdf_documents`) print(f"\n📁 Current project ({project_path}):")
# b) by the parent directory name (e.g., `new_txt`)
seen_app_meta = set()
# 2a) by file base
for meta_file in project_path.rglob(f"{index_name}.leann.meta.json"):
if meta_file.is_file():
# Skip CLI-built indexes' meta under .leann/indexes
try:
cli_indexes_dir = project_path / ".leann" / "indexes"
if cli_indexes_dir.exists() and cli_indexes_dir in meta_file.parents:
continue
except Exception:
pass
is_current = project_path == current_path
key = (str(project_path), str(meta_file))
if key in seen_app_meta:
continue
seen_app_meta.add(key)
matches.append(
{
"project_path": project_path,
"files_dir": meta_file.parent,
"meta_file": meta_file,
"is_current": is_current,
"kind": "app",
"display_name": meta_file.parent.name,
"file_base": meta_file.name.replace(".leann.meta.json", ""),
}
)
# 2b) by parent directory name
for meta_file in project_path.rglob("*.leann.meta.json"):
if meta_file.is_file() and meta_file.parent.name == index_name:
# Skip CLI-built indexes' meta under .leann/indexes
try:
cli_indexes_dir = project_path / ".leann" / "indexes"
if cli_indexes_dir.exists() and cli_indexes_dir in meta_file.parents:
continue
except Exception:
pass
is_current = project_path == current_path
key = (str(project_path), str(meta_file))
if key in seen_app_meta:
continue
seen_app_meta.add(key)
matches.append(
{
"project_path": project_path,
"files_dir": meta_file.parent,
"meta_file": meta_file,
"is_current": is_current,
"kind": "app",
"display_name": meta_file.parent.name,
"file_base": meta_file.name.replace(".leann.meta.json", ""),
}
)
# Sort: current project first, then by project name
matches.sort(key=lambda x: (not x["is_current"], x["project_path"].name))
return matches
def _remove_single_match(self, match, index_name: str, force: bool):
"""Handle removal when only one match is found"""
project_path = match["project_path"]
is_current = match["is_current"]
kind = match.get("kind", "cli")
if is_current:
location_info = "current project"
emoji = "🏠"
else:
location_info = f"other project '{project_path.name}'"
emoji = "📂"
print(f"✅ Found 1 index named '{index_name}':")
print(f" {emoji} Location: {location_info}")
if kind == "cli":
print(f" 📍 Path: {project_path / '.leann' / 'indexes' / index_name}")
else:
print(f" 📍 Meta: {match['meta_file']}")
if not force:
if not is_current:
print("\n⚠️ CROSS-PROJECT REMOVAL!")
print(" This will delete the index from another project.")
response = input(f" ❓ Confirm removal from {location_info}? (y/N): ").strip().lower()
if response not in ["y", "yes"]:
print(" ❌ Removal cancelled.")
return False
if kind == "cli":
return self._delete_index_directory(
match["index_dir"],
index_name,
project_path if not is_current else None,
is_app=False,
)
else:
return self._delete_index_directory(
match["files_dir"],
match.get("display_name", index_name),
project_path if not is_current else None,
is_app=True,
meta_file=match.get("meta_file"),
app_file_base=match.get("file_base"),
)
def _remove_from_multiple_matches(self, matches, index_name: str, force: bool):
"""Handle removal when multiple matches are found"""
print(f"⚠️ Found {len(matches)} indexes named '{index_name}':")
print(" " + "" * 50)
for i, match in enumerate(matches, 1):
project_path = match["project_path"]
is_current = match["is_current"]
kind = match.get("kind", "cli")
if is_current:
print(f" {i}. 🏠 Current project ({'CLI' if kind == 'cli' else 'APP'})")
else: else:
print(f" {i}. 📂 {project_path.name} ({'CLI' if kind == 'cli' else 'APP'})") print(f"\n📂 {project_path}:")
# Show path details for index_dir in index_dirs:
if kind == "cli": total_indexes += 1
print(f" 📍 {project_path / '.leann' / 'indexes' / index_name}") index_name = index_dir.name
else: meta_file = index_dir / "documents.leann.meta.json"
print(f" 📍 {match['meta_file']}") status = "" if meta_file.exists() else ""
# Show size info print(f" {total_indexes}. {index_name} [{status}]")
try: if status == "":
if kind == "cli": size_mb = sum(f.stat().st_size for f in index_dir.iterdir() if f.is_file()) / (
size_mb = sum( 1024 * 1024
f.stat().st_size for f in match["index_dir"].iterdir() if f.is_file()
) / (1024 * 1024)
else:
file_base = match.get("file_base")
size_mb = 0.0
if file_base:
size_mb = sum(
f.stat().st_size
for f in match["files_dir"].glob(f"{file_base}.leann*")
if f.is_file()
) / (1024 * 1024)
print(f" 📦 Size: {size_mb:.1f} MB")
except (OSError, PermissionError):
pass
print(" " + "" * 50)
if force:
print(" ❌ Multiple matches found, but --force specified.")
print(" Please run without --force to choose which one to remove.")
return False
try:
choice = input(
f" ❓ Which one to remove? (1-{len(matches)}, or 'c' to cancel): "
).strip()
if choice.lower() == "c":
print(" ❌ Removal cancelled.")
return False
choice_idx = int(choice) - 1
if 0 <= choice_idx < len(matches):
selected_match = matches[choice_idx]
project_path = selected_match["project_path"]
is_current = selected_match["is_current"]
kind = selected_match.get("kind", "cli")
location = "current project" if is_current else f"'{project_path.name}' project"
print(f" 🎯 Selected: Remove from {location}")
# Final confirmation for safety
confirm = input(
f" ❓ FINAL CONFIRMATION - Type '{index_name}' to proceed: "
).strip()
if confirm != index_name:
print(" ❌ Confirmation failed. Removal cancelled.")
return False
if kind == "cli":
return self._delete_index_directory(
selected_match["index_dir"],
index_name,
project_path if not is_current else None,
is_app=False,
) )
else: print(f" Size: {size_mb:.1f} MB")
return self._delete_index_directory(
selected_match["files_dir"],
selected_match.get("display_name", index_name),
project_path if not is_current else None,
is_app=True,
meta_file=selected_match.get("meta_file"),
app_file_base=selected_match.get("file_base"),
)
else:
print(" ❌ Invalid choice. Removal cancelled.")
return False
except (ValueError, KeyboardInterrupt): if total_indexes > 0:
print("\n ❌ Invalid input. Removal cancelled.") print(f"\nTotal: {total_indexes} indexes across {len(valid_projects)} projects")
return False print("\nUsage (current project only):")
def _delete_index_directory( # Show example from current project
self, current_indexes_dir = current_dir / ".leann" / "indexes"
index_dir: Path, if current_indexes_dir.exists():
index_display_name: str, current_index_dirs = [d for d in current_indexes_dir.iterdir() if d.is_dir()]
project_path: Optional[Path] = None, if current_index_dirs:
is_app: bool = False, example_name = current_index_dirs[0].name
meta_file: Optional[Path] = None, print(f' leann search {example_name} "your query"')
app_file_base: Optional[str] = None, print(f" leann ask {example_name} --interactive")
):
"""Delete a CLI index directory or APP index files safely."""
try:
if is_app:
removed = 0
errors = 0
# Delete only files that belong to this app index (based on file base)
pattern_base = app_file_base or ""
for f in index_dir.glob(f"{pattern_base}.leann*"):
try:
f.unlink()
removed += 1
except Exception:
errors += 1
# Best-effort: also remove the meta file if specified and still exists
if meta_file and meta_file.exists():
try:
meta_file.unlink()
removed += 1
except Exception:
errors += 1
if removed > 0 and errors == 0:
if project_path:
print(
f"✅ App index '{index_display_name}' removed from {project_path.name}"
)
else:
print(f"✅ App index '{index_display_name}' removed successfully")
return True
elif removed > 0 and errors > 0:
print(
f"⚠️ App index '{index_display_name}' partially removed (some files couldn't be deleted)"
)
return True
else:
print(
f"❌ No files found to remove for app index '{index_display_name}' in {index_dir}"
)
return False
else:
import shutil
shutil.rmtree(index_dir)
if project_path:
print(f"✅ Index '{index_display_name}' removed from {project_path.name}")
else:
print(f"✅ Index '{index_display_name}' removed successfully")
return True
except Exception as e:
print(f"❌ Error removing index '{index_display_name}': {e}")
return False
def load_documents( def load_documents(
self, self, docs_paths: Union[str, list], custom_file_types: Union[str, None] = None
docs_paths: Union[str, list],
custom_file_types: Union[str, None] = None,
include_hidden: bool = False,
): ):
# Handle both single path (string) and multiple paths (list) for backward compatibility # Handle both single path (string) and multiple paths (list) for backward compatibility
if isinstance(docs_paths, str): if isinstance(docs_paths, str):
@@ -876,10 +431,6 @@ Examples:
all_documents = [] all_documents = []
# Helper to detect hidden path components
def _path_has_hidden_segment(p: Path) -> bool:
return any(part.startswith(".") and part not in [".", ".."] for part in p.parts)
# First, process individual files if any # First, process individual files if any
if files: if files:
print(f"\n🔄 Processing {len(files)} individual file{'s' if len(files) > 1 else ''}...") print(f"\n🔄 Processing {len(files)} individual file{'s' if len(files) > 1 else ''}...")
@@ -892,12 +443,8 @@ Examples:
files_by_dir = defaultdict(list) files_by_dir = defaultdict(list)
for file_path in files: for file_path in files:
file_path_obj = Path(file_path) parent_dir = str(Path(file_path).parent)
if not include_hidden and _path_has_hidden_segment(file_path_obj): files_by_dir[parent_dir].append(file_path)
print(f" ⚠️ Skipping hidden file: {file_path}")
continue
parent_dir = str(file_path_obj.parent)
files_by_dir[parent_dir].append(str(file_path_obj))
# Load files from each parent directory # Load files from each parent directory
for parent_dir, file_list in files_by_dir.items(): for parent_dir, file_list in files_by_dir.items():
@@ -908,7 +455,6 @@ Examples:
file_docs = SimpleDirectoryReader( file_docs = SimpleDirectoryReader(
parent_dir, parent_dir,
input_files=file_list, input_files=file_list,
# exclude_hidden only affects directory scans; input_files are explicit
filename_as_id=True, filename_as_id=True,
).load_data() ).load_data()
all_documents.extend(file_docs) all_documents.extend(file_docs)
@@ -1007,8 +553,6 @@ Examples:
# Check if file matches any exclude pattern # Check if file matches any exclude pattern
try: try:
relative_path = file_path.relative_to(docs_path) relative_path = file_path.relative_to(docs_path)
if not include_hidden and _path_has_hidden_segment(relative_path):
continue
if self._should_exclude_file(relative_path, gitignore_matches): if self._should_exclude_file(relative_path, gitignore_matches):
continue continue
except ValueError: except ValueError:
@@ -1036,7 +580,6 @@ Examples:
try: try:
default_docs = SimpleDirectoryReader( default_docs = SimpleDirectoryReader(
str(file_path.parent), str(file_path.parent),
exclude_hidden=not include_hidden,
filename_as_id=True, filename_as_id=True,
required_exts=[file_path.suffix], required_exts=[file_path.suffix],
).load_data() ).load_data()
@@ -1065,7 +608,6 @@ Examples:
encoding="utf-8", encoding="utf-8",
required_exts=code_extensions, required_exts=code_extensions,
file_extractor={}, # Use default extractors file_extractor={}, # Use default extractors
exclude_hidden=not include_hidden,
filename_as_id=True, filename_as_id=True,
).load_data(show_progress=True) ).load_data(show_progress=True)
@@ -1184,40 +726,7 @@ Examples:
print(f"Index '{index_name}' already exists. Use --force to rebuild.") print(f"Index '{index_name}' already exists. Use --force to rebuild.")
return return
# Configure chunking based on CLI args before loading documents all_texts = self.load_documents(docs_paths, args.file_types)
# Guard against invalid configurations
doc_chunk_size = max(1, int(args.doc_chunk_size))
doc_chunk_overlap = max(0, int(args.doc_chunk_overlap))
if doc_chunk_overlap >= doc_chunk_size:
print(
f"⚠️ Adjusting doc chunk overlap from {doc_chunk_overlap} to {doc_chunk_size - 1} (must be < chunk size)"
)
doc_chunk_overlap = doc_chunk_size - 1
code_chunk_size = max(1, int(args.code_chunk_size))
code_chunk_overlap = max(0, int(args.code_chunk_overlap))
if code_chunk_overlap >= code_chunk_size:
print(
f"⚠️ Adjusting code chunk overlap from {code_chunk_overlap} to {code_chunk_size - 1} (must be < chunk size)"
)
code_chunk_overlap = code_chunk_size - 1
self.node_parser = SentenceSplitter(
chunk_size=doc_chunk_size,
chunk_overlap=doc_chunk_overlap,
separator=" ",
paragraph_separator="\n\n",
)
self.code_parser = SentenceSplitter(
chunk_size=code_chunk_size,
chunk_overlap=code_chunk_overlap,
separator="\n",
paragraph_separator="\n\n",
)
all_texts = self.load_documents(
docs_paths, args.file_types, include_hidden=args.include_hidden
)
if not all_texts: if not all_texts:
print("No documents found") print("No documents found")
return return
@@ -1249,101 +758,13 @@ Examples:
async def search_documents(self, args): async def search_documents(self, args):
index_name = args.index_name index_name = args.index_name
query = args.query query = args.query
# First try to find the index in current project
index_path = self.get_index_path(index_name) index_path = self.get_index_path(index_name)
if self.index_exists(index_name):
# Found in current project, use it
pass
else:
# Search across all registered projects (like list_indexes does)
all_matches = self._find_all_matching_indexes(index_name)
if not all_matches:
print(
f"Index '{index_name}' not found. Use 'leann build {index_name} --docs <dir> [<dir2> ...]' to create it."
)
return
elif len(all_matches) == 1:
# Found exactly one match, use it
match = all_matches[0]
if match["kind"] == "cli":
index_path = str(match["index_dir"] / "documents.leann")
else:
# App format: use the meta file to construct the path
meta_file = match["meta_file"]
file_base = match["file_base"]
index_path = str(meta_file.parent / f"{file_base}.leann")
project_info = ( if not self.index_exists(index_name):
"current project" print(
if match["is_current"] f"Index '{index_name}' not found. Use 'leann build {index_name} --docs <dir> [<dir2> ...]' to create it."
else f"project '{match['project_path'].name}'" )
) return
print(f"Using index '{index_name}' from {project_info}")
else:
# Multiple matches found
if args.non_interactive:
# Non-interactive mode: automatically select the best match
# Priority: current project first, then first available
current_matches = [m for m in all_matches if m["is_current"]]
if current_matches:
match = current_matches[0]
location_desc = "current project"
else:
match = all_matches[0]
location_desc = f"project '{match['project_path'].name}'"
if match["kind"] == "cli":
index_path = str(match["index_dir"] / "documents.leann")
else:
meta_file = match["meta_file"]
file_base = match["file_base"]
index_path = str(meta_file.parent / f"{file_base}.leann")
print(
f"Found {len(all_matches)} indexes named '{index_name}', using index from {location_desc}"
)
else:
# Interactive mode: ask user to choose
print(f"Found {len(all_matches)} indexes named '{index_name}':")
for i, match in enumerate(all_matches, 1):
project_path = match["project_path"]
is_current = match["is_current"]
kind = match.get("kind", "cli")
if is_current:
print(
f" {i}. 🏠 Current project ({'CLI' if kind == 'cli' else 'APP'})"
)
else:
print(
f" {i}. 📂 {project_path.name} ({'CLI' if kind == 'cli' else 'APP'})"
)
try:
choice = input(f"Which index to search? (1-{len(all_matches)}): ").strip()
choice_idx = int(choice) - 1
if 0 <= choice_idx < len(all_matches):
match = all_matches[choice_idx]
if match["kind"] == "cli":
index_path = str(match["index_dir"] / "documents.leann")
else:
meta_file = match["meta_file"]
file_base = match["file_base"]
index_path = str(meta_file.parent / f"{file_base}.leann")
project_info = (
"current project"
if match["is_current"]
else f"project '{match['project_path'].name}'"
)
print(f"Using index '{index_name}' from {project_info}")
else:
print("Invalid choice. Aborting search.")
return
except (ValueError, KeyboardInterrupt):
print("Invalid input. Aborting search.")
return
searcher = LeannSearcher(index_path=index_path) searcher = LeannSearcher(index_path=index_path)
results = searcher.search( results = searcher.search(
@@ -1442,8 +863,6 @@ Examples:
if args.command == "list": if args.command == "list":
self.list_indexes() self.list_indexes()
elif args.command == "remove":
self.remove_index(args.index_name, args.force)
elif args.command == "build": elif args.command == "build":
await self.build_index(args) await self.build_index(args)
elif args.command == "search": elif args.command == "search":
@@ -1455,15 +874,10 @@ Examples:
def main(): def main():
import logging
import dotenv import dotenv
dotenv.load_dotenv() dotenv.load_dotenv()
# Set clean logging for CLI usage
logging.getLogger().setLevel(logging.WARNING) # Only show warnings and errors
cli = LeannCLI() cli = LeannCLI()
asyncio.run(cli.run()) asyncio.run(cli.run())

View File

@@ -57,8 +57,6 @@ def compute_embeddings(
return compute_embeddings_mlx(texts, model_name) return compute_embeddings_mlx(texts, model_name)
elif mode == "ollama": elif mode == "ollama":
return compute_embeddings_ollama(texts, model_name, is_build=is_build) return compute_embeddings_ollama(texts, model_name, is_build=is_build)
elif mode == "gemini":
return compute_embeddings_gemini(texts, model_name, is_build=is_build)
else: else:
raise ValueError(f"Unsupported embedding mode: {mode}") raise ValueError(f"Unsupported embedding mode: {mode}")
@@ -246,16 +244,6 @@ def compute_embeddings_openai(texts: list[str], model_name: str) -> np.ndarray:
except ImportError as e: except ImportError as e:
raise ImportError(f"OpenAI package not installed: {e}") raise ImportError(f"OpenAI package not installed: {e}")
# Validate input list
if not texts:
raise ValueError("Cannot compute embeddings for empty text list")
# Extra validation: abort early if any item is empty/whitespace
invalid_count = sum(1 for t in texts if not isinstance(t, str) or not t.strip())
if invalid_count > 0:
raise ValueError(
f"Found {invalid_count} empty/invalid text(s) in input. Upstream should filter before calling OpenAI."
)
api_key = os.getenv("OPENAI_API_KEY") api_key = os.getenv("OPENAI_API_KEY")
if not api_key: if not api_key:
raise RuntimeError("OPENAI_API_KEY environment variable not set") raise RuntimeError("OPENAI_API_KEY environment variable not set")
@@ -275,16 +263,8 @@ def compute_embeddings_openai(texts: list[str], model_name: str) -> np.ndarray:
print(f"len of texts: {len(texts)}") print(f"len of texts: {len(texts)}")
# OpenAI has limits on batch size and input length # OpenAI has limits on batch size and input length
max_batch_size = 800 # Conservative batch size because the token limit is 300K max_batch_size = 1000 # Conservative batch size
all_embeddings = [] all_embeddings = []
# get the avg len of texts
avg_len = sum(len(text) for text in texts) / len(texts)
print(f"avg len of texts: {avg_len}")
# if avg len is less than 1000, use the max batch size
if avg_len > 300:
max_batch_size = 500
# if avg len is less than 1000, use the max batch size
try: try:
from tqdm import tqdm from tqdm import tqdm
@@ -670,83 +650,3 @@ def compute_embeddings_ollama(
logger.info(f"Generated {len(embeddings)} embeddings, dimension: {embeddings.shape[1]}") logger.info(f"Generated {len(embeddings)} embeddings, dimension: {embeddings.shape[1]}")
return embeddings return embeddings
def compute_embeddings_gemini(
    texts: list[str], model_name: str = "text-embedding-004", is_build: bool = False
) -> np.ndarray:
    """
    Compute embeddings using Google Gemini API.

    Args:
        texts: List of texts to compute embeddings for. Must not be empty.
        model_name: Gemini model name (default: "text-embedding-004")
        is_build: Accepted for signature parity with the other embedding
            helpers; this function does not read it (a progress bar is shown
            whenever tqdm is importable).

    Returns:
        Embeddings array, shape: (len(texts), embedding_dim)

    Raises:
        ValueError: If ``texts`` is empty.
        ImportError: If the ``google.genai`` package is not installed.
        RuntimeError: If ``GEMINI_API_KEY`` is not set.
    """
    # Fail fast on empty input: without this, the final `embeddings.shape[1]`
    # raises an opaque IndexError on a 1-D empty array.
    if not texts:
        raise ValueError("Cannot compute embeddings for empty text list")

    import os

    try:
        import google.genai as genai
    except ImportError as e:
        raise ImportError(f"Google GenAI package not installed: {e}")

    api_key = os.getenv("GEMINI_API_KEY")
    if not api_key:
        raise RuntimeError("GEMINI_API_KEY environment variable not set")

    # Cache Gemini client
    cache_key = "gemini_client"
    if cache_key in _model_cache:
        client = _model_cache[cache_key]
    else:
        client = genai.Client(api_key=api_key)
        _model_cache[cache_key] = client
        logger.info("Gemini client cached")

    logger.info(
        f"Computing embeddings for {len(texts)} texts using Gemini API, model: '{model_name}'"
    )

    # Gemini supports batch embedding
    max_batch_size = 100  # Conservative batch size for Gemini
    all_embeddings = []

    batch_starts = range(0, len(texts), max_batch_size)
    try:
        from tqdm import tqdm

        total_batches = (len(texts) + max_batch_size - 1) // max_batch_size
        batch_iterator = tqdm(
            batch_starts, desc="Computing embeddings", unit="batch", total=total_batches
        )
    except ImportError:
        # Fallback when tqdm is not available
        batch_iterator = batch_starts

    for i in batch_iterator:
        batch_texts = texts[i : i + max_batch_size]
        try:
            # Use the embed_content method from the new Google GenAI SDK
            response = client.models.embed_content(
                model=model_name,
                contents=batch_texts,
                config=genai.types.EmbedContentConfig(
                    task_type="RETRIEVAL_DOCUMENT"  # For document embedding
                ),
            )
            # Extract embeddings from response
            all_embeddings.extend(embedding_data.values for embedding_data in response.embeddings)
        except Exception as e:
            # `i` is the start offset of the failing batch, not its ordinal.
            logger.error(f"Batch {i} failed: {e}")
            raise

    embeddings = np.array(all_embeddings, dtype=np.float32)
    logger.info(f"Generated {len(embeddings)} embeddings, dimension: {embeddings.shape[1]}")
    return embeddings

View File

@@ -64,6 +64,19 @@ def handle_request(request):
"required": ["index_name", "query"], "required": ["index_name", "query"],
}, },
}, },
{
"name": "leann_status",
"description": "📊 Check the health and stats of your code indexes - like a medical checkup for your codebase knowledge!",
"inputSchema": {
"type": "object",
"properties": {
"index_name": {
"type": "string",
"description": "Optional: Name of specific index to check. If not provided, shows status of all indexes.",
}
},
},
},
{ {
"name": "leann_list", "name": "leann_list",
"description": "📋 Show all your indexed codebases - your personal code library! Use this to see what's available for search.", "description": "📋 Show all your indexed codebases - your personal code library! Use this to see what's available for search.",
@@ -94,7 +107,7 @@ def handle_request(request):
}, },
} }
# Build simplified command with non-interactive flag for MCP compatibility # Build simplified command
cmd = [ cmd = [
"leann", "leann",
"search", "search",
@@ -102,10 +115,18 @@ def handle_request(request):
args["query"], args["query"],
f"--top-k={args.get('top_k', 5)}", f"--top-k={args.get('top_k', 5)}",
f"--complexity={args.get('complexity', 32)}", f"--complexity={args.get('complexity', 32)}",
"--non-interactive",
] ]
result = subprocess.run(cmd, capture_output=True, text=True) result = subprocess.run(cmd, capture_output=True, text=True)
elif tool_name == "leann_status":
if args.get("index_name"):
# Check specific index status - for now, we'll use leann list and filter
result = subprocess.run(["leann", "list"], capture_output=True, text=True)
# We could enhance this to show more detailed status per index
else:
# Show all indexes status
result = subprocess.run(["leann", "list"], capture_output=True, text=True)
elif tool_name == "leann_list": elif tool_name == "leann_list":
result = subprocess.run(["leann", "list"], capture_output=True, text=True) result = subprocess.run(["leann", "list"], capture_output=True, text=True)

View File

@@ -2,17 +2,11 @@
import importlib import importlib
import importlib.metadata import importlib.metadata
import json from typing import TYPE_CHECKING
import logging
from pathlib import Path
from typing import TYPE_CHECKING, Optional, Union
if TYPE_CHECKING: if TYPE_CHECKING:
from leann.interface import LeannBackendFactoryInterface from leann.interface import LeannBackendFactoryInterface
# Set up logger for this module
logger = logging.getLogger(__name__)
BACKEND_REGISTRY: dict[str, "LeannBackendFactoryInterface"] = {} BACKEND_REGISTRY: dict[str, "LeannBackendFactoryInterface"] = {}
@@ -20,7 +14,7 @@ def register_backend(name: str):
"""A decorator to register a new backend class.""" """A decorator to register a new backend class."""
def decorator(cls): def decorator(cls):
logger.debug(f"Registering backend '{name}'") print(f"INFO: Registering backend '{name}'")
BACKEND_REGISTRY[name] = cls BACKEND_REGISTRY[name] = cls
return cls return cls
@@ -45,54 +39,3 @@ def autodiscover_backends():
# print(f"WARN: Could not import backend module '{backend_module_name}': {e}") # print(f"WARN: Could not import backend module '{backend_module_name}': {e}")
pass pass
# print("INFO: Backend auto-discovery finished.") # print("INFO: Backend auto-discovery finished.")
def register_project_directory(project_dir: Optional[Union[str, Path]] = None):
    """
    Register a project directory in the global LEANN registry.

    This allows `leann list` to discover indexes created by apps or other tools.
    The registry is a JSON list of resolved directory paths stored at
    ``~/.leann/projects.json``. Registration is best-effort: filesystem
    failures are logged and never raised to the caller.

    Args:
        project_dir: Directory to register. If None, uses current working directory.
    """
    project_dir = Path.cwd() if project_dir is None else Path(project_dir)

    # Only register directories that have some kind of LEANN content
    # Either .leann/indexes/ (CLI format) or *.leann.meta.json files (apps format)
    has_cli_indexes = (project_dir / ".leann" / "indexes").exists()
    # `or` short-circuits, so the recursive scan is skipped when the cheap
    # CLI-format check already succeeded.
    if not (has_cli_indexes or any(project_dir.rglob("*.leann.meta.json"))):
        # Don't register if there are no LEANN indexes
        return

    global_registry = Path.home() / ".leann" / "projects.json"
    try:
        global_registry.parent.mkdir(parents=True, exist_ok=True)
    except OSError as e:
        # E.g. read-only home directory: skip registration instead of crashing.
        logger.warning(f"Could not create project registry directory: {e}")
        return

    project_str = str(project_dir.resolve())

    # Load existing registry
    projects = []
    if global_registry.exists():
        try:
            with open(global_registry) as f:
                projects = json.load(f)
        except Exception:
            logger.debug("Could not load existing project registry")
            projects = []
    if not isinstance(projects, list):
        # Valid JSON of the wrong shape (e.g. an object) would break the
        # membership test / append below; start over with an empty list.
        logger.debug("Project registry is not a JSON list; resetting it")
        projects = []

    # Nothing to write when the project is already registered.
    if project_str in projects:
        return
    projects.append(project_str)

    # Save updated registry
    try:
        with open(global_registry, "w") as f:
            json.dump(projects, f, indent=2)
        logger.debug(f"Registered project directory: {project_str}")
    except Exception as e:
        logger.warning(f"Could not save project registry: {e}")

View File

@@ -13,20 +13,10 @@ This installs the `leann` CLI into an isolated tool environment and includes bot
## 🚀 Quick Setup ## 🚀 Quick Setup
Add the LEANN MCP server to Claude Code. Choose the scope based on how widely you want it available. Below is the command to install it globally; if you prefer a local install, skip this step: Add the LEANN MCP server to Claude Code:
```bash ```bash
# Global (recommended): available in all projects for your user claude mcp add leann-server -- leann_mcp
claude mcp add --scope user leann-server -- leann_mcp
```
- `leann-server`: the display name of the MCP server in Claude Code (you can change it).
- `leann_mcp`: the Python entry point installed with LEANN that starts the MCP server.
Verify it is registered globally:
```bash
claude mcp list | cat
``` ```
## 🛠️ Available Tools ## 🛠️ Available Tools
@@ -35,36 +25,27 @@ Once connected, you'll have access to these powerful semantic search tools in Cl
- **`leann_list`** - List all available indexes across your projects - **`leann_list`** - List all available indexes across your projects
- **`leann_search`** - Perform semantic searches across code and documents - **`leann_search`** - Perform semantic searches across code and documents
- **`leann_ask`** - Ask natural language questions and get AI-powered answers from your codebase
## 🎯 Quick Start Example ## 🎯 Quick Start Example
```bash ```bash
# Add locally if you did not add it globally (current folder only; default if --scope is omitted)
claude mcp add leann-server -- leann_mcp
# Build an index for your project (change to your actual path) # Build an index for your project (change to your actual path)
# See the advanced examples below for more ways to configure indexing leann build my-project --docs ./
# Set the index name (replace 'my-project' with your own)
leann build my-project --docs $(git ls-files)
# Start Claude Code # Start Claude Code
claude claude
``` ```
## 🚀 Advanced Usage Examples to build the index ## 🚀 Advanced Usage Examples
### Index Entire Git Repository ### Index Entire Git Repository
```bash ```bash
# Index all tracked files in your Git repository. # Index all tracked files in your git repository, note right now we will skip submodules, but we can add it back easily if you want
# Note: submodules are currently skipped; we can add them back if needed.
leann build my-repo --docs $(git ls-files) --embedding-mode sentence-transformers --embedding-model all-MiniLM-L6-v2 --backend hnsw leann build my-repo --docs $(git ls-files) --embedding-mode sentence-transformers --embedding-model all-MiniLM-L6-v2 --backend hnsw
# Index only tracked Python files from Git. # Index only specific file types from git
leann build my-python-code --docs $(git ls-files "*.py") --embedding-mode sentence-transformers --embedding-model all-MiniLM-L6-v2 --backend hnsw leann build my-python-code --docs $(git ls-files "*.py") --embedding-mode sentence-transformers --embedding-model all-MiniLM-L6-v2 --backend hnsw
# If you encounter empty requests caused by empty files (e.g., __init__.py), exclude zero-byte files. Thanks @ww2283 for pointing [that](https://github.com/yichuan-w/LEANN/issues/48) out
leann build leann-prospec-lig --docs $(find ./src -name "*.py" -not -empty) --embedding-mode openai --embedding-model text-embedding-3-small
``` ```
### Multiple Directories and Files ### Multiple Directories and Files
@@ -92,7 +73,7 @@ leann build docs-and-configs --docs $(git ls-files "*.md" "*.yml" "*.yaml" "*.js
``` ```
## **Try this in Claude Code:** **Try this in Claude Code:**
``` ```
Help me understand this codebase. List available indexes and search for authentication patterns. Help me understand this codebase. List available indexes and search for authentication patterns.
``` ```
@@ -101,7 +82,6 @@ Help me understand this codebase. List available indexes and search for authenti
<img src="../../assets/claude_code_leann.png" alt="LEANN in Claude Code" width="80%"> <img src="../../assets/claude_code_leann.png" alt="LEANN in Claude Code" width="80%">
</p> </p>
If you see a prompt asking whether to proceed with LEANN, confirm it; you can then use LEANN in your chat!
## 🧠 How It Works ## 🧠 How It Works
@@ -137,11 +117,3 @@ To remove LEANN
``` ```
uv pip uninstall leann leann-backend-hnsw leann-core uv pip uninstall leann leann-backend-hnsw leann-core
``` ```
To remove a globally installed LEANN (e.g., before updating to a new version)
```
uv tool list | cat
uv tool uninstall leann-core
command -v leann || echo "leann gone"
command -v leann_mcp || echo "leann_mcp gone"
```

View File

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
[project] [project]
name = "leann" name = "leann"
version = "0.3.1" version = "0.2.9"
description = "LEANN - The smallest vector index in the world. RAG Everything with LEANN!" description = "LEANN - The smallest vector index in the world. RAG Everything with LEANN!"
readme = "README.md" readme = "README.md"
requires-python = ">=3.9" requires-python = ">=3.9"

View File

@@ -1 +0,0 @@
__all__ = []

View File

@@ -136,9 +136,5 @@ def export_sqlite(
connection.commit() connection.commit()
def main():
app()
if __name__ == "__main__": if __name__ == "__main__":
main() app()

View File

@@ -10,10 +10,11 @@ requires-python = ">=3.9"
dependencies = [ dependencies = [
"leann-core", "leann-core",
"leann-backend-hnsw", "leann-backend-hnsw",
"typer>=0.12.3",
"numpy>=1.26.0", "numpy>=1.26.0",
"torch", "torch",
"tqdm", "tqdm",
"flask",
"flask_compress",
"datasets>=2.15.0", "datasets>=2.15.0",
"evaluate", "evaluate",
"colorama", "colorama",
@@ -64,7 +65,9 @@ test = [
"pytest>=7.0", "pytest>=7.0",
"pytest-timeout>=2.0", "pytest-timeout>=2.0",
"llama-index-core>=0.12.0", "llama-index-core>=0.12.0",
"llama-index-readers-file>=0.4.0",
"python-dotenv>=1.0.0", "python-dotenv>=1.0.0",
"sentence-transformers>=2.2.0",
] ]
diskann = [ diskann = [
@@ -81,11 +84,6 @@ documents = [
[tool.setuptools] [tool.setuptools]
py-modules = [] py-modules = []
packages = ["wechat_exporter"]
package-dir = { "wechat_exporter" = "packages/wechat-exporter" }
[project.scripts]
wechat-exporter = "wechat_exporter.main:main"
[tool.uv.sources] [tool.uv.sources]
@@ -96,8 +94,13 @@ leann-backend-hnsw = { path = "packages/leann-backend-hnsw", editable = true }
[tool.ruff] [tool.ruff]
target-version = "py39" target-version = "py39"
line-length = 100 line-length = 100
extend-exclude = ["third_party"] extend-exclude = [
"third_party",
"*.egg-info",
"__pycache__",
".git",
".venv",
]
[tool.ruff.lint] [tool.ruff.lint]
select = [ select = [
@@ -120,12 +123,21 @@ ignore = [
"RUF012", # mutable class attributes should be annotated with typing.ClassVar "RUF012", # mutable class attributes should be annotated with typing.ClassVar
] ]
[tool.ruff.lint.per-file-ignores]
"test/**/*.py" = ["E402"] # module level import not at top of file (common in tests)
"examples/**/*.py" = ["E402"] # module level import not at top of file (common in examples)
[tool.ruff.format] [tool.ruff.format]
quote-style = "double" quote-style = "double"
indent-style = "space" indent-style = "space"
skip-magic-trailing-comma = false skip-magic-trailing-comma = false
line-ending = "auto" line-ending = "auto"
[dependency-groups]
dev = [
"ruff>=0.12.4",
]
[tool.lychee] [tool.lychee]
accept = ["200", "403", "429", "503"] accept = ["200", "403", "429", "503"]
timeout = 20 timeout = 20

7313
uv.lock generated
View File

File diff suppressed because it is too large Load Diff