Compare commits

..

1 Commits

Author SHA1 Message Date
Andy Lee
f1aca0f756 fix(core): skip empty/invalid chunks before embedding; guard OpenAI embeddings
Avoid 400 errors from OpenAI when chunker yields empty strings by filtering
invalid texts in LeannBuilder.build_index. Add validation fail-fast in
OpenAI embedding path to surface upstream issues earlier. Keeps passages and
embeddings aligned during build.

Refs #54
2025-08-15 17:28:40 -07:00
22 changed files with 3884 additions and 4541 deletions

1
.gitattributes vendored Normal file
View File

@@ -0,0 +1 @@
paper_plot/data/big_graph_degree_data.npz filter=lfs diff=lfs merge=lfs -text

View File

@@ -87,7 +87,7 @@ jobs:
runs-on: ${{ matrix.os }} runs-on: ${{ matrix.os }}
steps: steps:
- uses: actions/checkout@v5 - uses: actions/checkout@v4
with: with:
ref: ${{ inputs.ref }} ref: ${{ inputs.ref }}
submodules: recursive submodules: recursive
@@ -98,23 +98,21 @@ jobs:
python-version: ${{ matrix.python }} python-version: ${{ matrix.python }}
- name: Install uv - name: Install uv
uses: astral-sh/setup-uv@v6 uses: astral-sh/setup-uv@v4
- name: Install system dependencies (Ubuntu) - name: Install system dependencies (Ubuntu)
if: runner.os == 'Linux' if: runner.os == 'Linux'
run: | run: |
sudo apt-get update sudo apt-get update
sudo apt-get install -y libomp-dev libboost-all-dev protobuf-compiler libzmq3-dev \ sudo apt-get install -y libomp-dev libboost-all-dev protobuf-compiler libzmq3-dev \
pkg-config libabsl-dev libaio-dev libprotobuf-dev \ pkg-config libopenblas-dev patchelf libabsl-dev libaio-dev libprotobuf-dev
patchelf
# Install Intel MKL for DiskANN # Install Intel MKL for DiskANN
wget -q https://registrationcenter-download.intel.com/akdlm/IRC_NAS/79153e0f-74d7-45af-b8c2-258941adf58a/intel-onemkl-2025.0.0.940.sh wget -q https://registrationcenter-download.intel.com/akdlm/IRC_NAS/79153e0f-74d7-45af-b8c2-258941adf58a/intel-onemkl-2025.0.0.940.sh
sudo sh intel-onemkl-2025.0.0.940.sh -a --components intel.oneapi.lin.mkl.devel --action install --eula accept -s sudo sh intel-onemkl-2025.0.0.940.sh -a --components intel.oneapi.lin.mkl.devel --action install --eula accept -s
source /opt/intel/oneapi/setvars.sh source /opt/intel/oneapi/setvars.sh
echo "MKLROOT=/opt/intel/oneapi/mkl/latest" >> $GITHUB_ENV echo "MKLROOT=/opt/intel/oneapi/mkl/latest" >> $GITHUB_ENV
echo "LD_LIBRARY_PATH=/opt/intel/oneapi/compiler/latest/linux/compiler/lib/intel64_lin" >> $GITHUB_ENV echo "LD_LIBRARY_PATH=/opt/intel/oneapi/mkl/latest/lib/intel64:$LD_LIBRARY_PATH" >> $GITHUB_ENV
echo "LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/intel/oneapi/mkl/latest/lib/intel64" >> $GITHUB_ENV
- name: Install system dependencies (macOS) - name: Install system dependencies (macOS)
if: runner.os == 'macOS' if: runner.os == 'macOS'
@@ -306,53 +304,3 @@ jobs:
with: with:
name: packages-${{ matrix.os }}-py${{ matrix.python }} name: packages-${{ matrix.os }}-py${{ matrix.python }}
path: packages/*/dist/ path: packages/*/dist/
arch-smoke:
name: Arch Linux smoke test (install & import)
needs: build
runs-on: ubuntu-latest
container:
image: archlinux:latest
steps:
- name: Prepare system
run: |
pacman -Syu --noconfirm
pacman -S --noconfirm python python-pip gcc git zlib openssl
- name: Download ALL wheel artifacts from this run
uses: actions/download-artifact@v5
with:
# Don't specify name, download all artifacts
path: ./wheels
- name: Install uv
uses: astral-sh/setup-uv@v6
- name: Create virtual environment and install wheels
run: |
uv venv
source .venv/bin/activate || source .venv/Scripts/activate
uv pip install --find-links wheels leann-core
uv pip install --find-links wheels leann-backend-hnsw
uv pip install --find-links wheels leann-backend-diskann
uv pip install --find-links wheels leann
- name: Import & tiny runtime check
env:
OMP_NUM_THREADS: 1
MKL_NUM_THREADS: 1
run: |
source .venv/bin/activate || source .venv/Scripts/activate
python - <<'PY'
import leann
import leann_backend_hnsw as h
import leann_backend_diskann as d
from leann import LeannBuilder, LeannSearcher
b = LeannBuilder(backend_name="hnsw")
b.add_text("hello arch")
b.build_index("arch_demo.leann")
s = LeannSearcher("arch_demo.leann")
print("search:", s.search("hello", top_k=1))
PY

View File

@@ -14,6 +14,6 @@ jobs:
- uses: actions/checkout@v4 - uses: actions/checkout@v4
- uses: lycheeverse/lychee-action@v2 - uses: lycheeverse/lychee-action@v2
with: with:
args: --no-progress --insecure --user-agent 'curl/7.68.0' README.md docs/ apps/ examples/ benchmarks/ args: --no-progress --insecure README.md docs/ apps/ examples/ benchmarks/
env: env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

1
.gitignore vendored
View File

@@ -18,7 +18,6 @@ demo/experiment_results/**/*.json
*.eml *.eml
*.emlx *.emlx
*.json *.json
!.vscode/*.json
*.sh *.sh
*.txt *.txt
!CMakeLists.txt !CMakeLists.txt

View File

@@ -1,5 +0,0 @@
{
"recommendations": [
"charliermarsh.ruff",
]
}

22
.vscode/settings.json vendored
View File

@@ -1,22 +0,0 @@
{
"python.defaultInterpreterPath": ".venv/bin/python",
"python.terminal.activateEnvironment": true,
"[python]": {
"editor.defaultFormatter": "charliermarsh.ruff",
"editor.formatOnSave": true,
"editor.codeActionsOnSave": {
"source.organizeImports": "explicit",
"source.fixAll": "explicit"
},
"editor.insertSpaces": true,
"editor.tabSize": 4
},
"ruff.enable": true,
"files.watcherExclude": {
"**/.venv/**": true,
"**/__pycache__/**": true,
"**/*.egg-info/**": true,
"**/build/**": true,
"**/dist/**": true
}
}

View File

@@ -5,7 +5,7 @@
<p align="center"> <p align="center">
<img src="https://img.shields.io/badge/Python-3.9%20%7C%203.10%20%7C%203.11%20%7C%203.12%20%7C%203.13-blue.svg" alt="Python Versions"> <img src="https://img.shields.io/badge/Python-3.9%20%7C%203.10%20%7C%203.11%20%7C%203.12%20%7C%203.13-blue.svg" alt="Python Versions">
<img src="https://github.com/yichuan-w/LEANN/actions/workflows/build-and-publish.yml/badge.svg" alt="CI Status"> <img src="https://github.com/yichuan-w/LEANN/actions/workflows/build-and-publish.yml/badge.svg" alt="CI Status">
<img src="https://img.shields.io/badge/Platform-Ubuntu%20%26%20Arch%20%26%20WSL%20%7C%20macOS%20(ARM64%2FIntel)-lightgrey" alt="Platform"> <img src="https://img.shields.io/badge/Platform-Ubuntu%20%7C%20macOS%20(ARM64%2FIntel)-lightgrey" alt="Platform">
<img src="https://img.shields.io/badge/License-MIT-green.svg" alt="MIT License"> <img src="https://img.shields.io/badge/License-MIT-green.svg" alt="MIT License">
<img src="https://img.shields.io/badge/MCP-Native%20Integration-blue" alt="MCP Integration"> <img src="https://img.shields.io/badge/MCP-Native%20Integration-blue" alt="MCP Integration">
</p> </p>
@@ -87,60 +87,15 @@ git submodule update --init --recursive
``` ```
**macOS:** **macOS:**
Note: DiskANN requires MacOS 13.3 or later.
```bash ```bash
brew install libomp boost protobuf zeromq pkgconf brew install llvm libomp boost protobuf zeromq pkgconf
uv sync --extra diskann CC=$(brew --prefix llvm)/bin/clang CXX=$(brew --prefix llvm)/bin/clang++ uv sync
``` ```
**Linux (Ubuntu/Debian):** **Linux:**
Note: On Ubuntu 20.04, you may need to build a newer Abseil and pin Protobuf (e.g., v3.20.x) for building DiskANN. See [Issue #30](https://github.com/yichuan-w/LEANN/issues/30) for a step-by-step note.
You can manually install [Intel oneAPI MKL](https://www.intel.com/content/www/us/en/developer/tools/oneapi/onemkl.html) instead of `libmkl-full-dev` for DiskANN. You can also use `libopenblas-dev` for building HNSW only, by removing `--extra diskann` in the command below.
```bash ```bash
sudo apt-get update && sudo apt-get install -y \ sudo apt-get install libomp-dev libboost-all-dev protobuf-compiler libabsl-dev libmkl-full-dev libaio-dev libzmq3-dev
libomp-dev libboost-all-dev protobuf-compiler libzmq3-dev \ uv sync
pkg-config libabsl-dev libaio-dev libprotobuf-dev \
libmkl-full-dev
uv sync --extra diskann
```
**Linux (Arch Linux):**
```bash
sudo pacman -Syu && sudo pacman -S --needed base-devel cmake pkgconf git gcc \
boost boost-libs protobuf abseil-cpp libaio zeromq
# For MKL in DiskANN
sudo pacman -S --needed base-devel git
git clone https://aur.archlinux.org/paru-bin.git
cd paru-bin && makepkg -si
paru -S intel-oneapi-mkl intel-oneapi-compiler
source /opt/intel/oneapi/setvars.sh
uv sync --extra diskann
```
**Linux (RHEL / CentOS Stream / Oracle / Rocky / AlmaLinux):**
See [Issue #50](https://github.com/yichuan-w/LEANN/issues/50) for more details.
```bash
sudo dnf groupinstall -y "Development Tools"
sudo dnf install -y libomp-devel boost-devel protobuf-compiler protobuf-devel \
abseil-cpp-devel libaio-devel zeromq-devel pkgconf-pkg-config
# For MKL in DiskANN
sudo dnf install -y intel-oneapi-mkl intel-oneapi-mkl-devel \
intel-oneapi-openmp || sudo dnf install -y intel-oneapi-compiler
source /opt/intel/oneapi/setvars.sh
uv sync --extra diskann
``` ```
</details> </details>
@@ -527,9 +482,6 @@ leann ask my-docs --interactive
# List all your indexes # List all your indexes
leann list leann list
# Remove an index
leann remove my-docs
``` ```
**Key CLI features:** **Key CLI features:**
@@ -542,7 +494,7 @@ leann remove my-docs
<details> <details>
<summary><strong>📋 Click to expand: Complete CLI Reference</strong></summary> <summary><strong>📋 Click to expand: Complete CLI Reference</strong></summary>
You can use `leann --help`, or `leann build --help`, `leann search --help`, `leann ask --help`, `leann list --help`, `leann remove --help` to get the complete CLI reference. You can use `leann --help`, or `leann build --help`, `leann search --help`, `leann ask --help` to get the complete CLI reference.
**Build Command:** **Build Command:**
```bash ```bash
@@ -580,31 +532,6 @@ Options:
--top-k N Retrieval count (default: 20) --top-k N Retrieval count (default: 20)
``` ```
**List Command:**
```bash
leann list
# Lists all indexes across all projects with status indicators:
# ✅ - Index is complete and ready to use
# ❌ - Index is incomplete or corrupted
# 📁 - CLI-created index (in .leann/indexes/)
# 📄 - App-created index (*.leann.meta.json files)
```
**Remove Command:**
```bash
leann remove INDEX_NAME [OPTIONS]
Options:
--force, -f Force removal without confirmation
# Smart removal: automatically finds and safely removes indexes
# - Shows all matching indexes across projects
# - Requires confirmation for cross-project removal
# - Interactive selection when multiple matches found
# - Supports both CLI and app-created indexes
```
</details> </details>
## 🏗️ Architecture & How It Works ## 🏗️ Architecture & How It Works

View File

@@ -10,7 +10,6 @@ from typing import Any
import dotenv import dotenv
from leann.api import LeannBuilder, LeannChat from leann.api import LeannBuilder, LeannChat
from leann.registry import register_project_directory
from llama_index.core.node_parser import SentenceSplitter from llama_index.core.node_parser import SentenceSplitter
dotenv.load_dotenv() dotenv.load_dotenv()
@@ -215,11 +214,6 @@ class BaseRAGExample(ABC):
builder.build_index(index_path) builder.build_index(index_path)
print(f"Index saved to: {index_path}") print(f"Index saved to: {index_path}")
# Register project directory so leann list can discover this index
# The index is saved as args.index_dir/index_name.leann
# We want to register the current working directory where the app is run
register_project_directory(Path.cwd())
return index_path return index_path
async def run_interactive_chat(self, args, index_path: str): async def run_interactive_chat(self, args, index_path: str):

View File

@@ -183,9 +183,6 @@ class Benchmark:
start_time = time.time() start_time = time.time()
with torch.no_grad(): with torch.no_grad():
self.model(input_ids=input_ids, attention_mask=attention_mask) self.model(input_ids=input_ids, attention_mask=attention_mask)
# mps sync
if torch.backends.mps.is_available():
torch.mps.synchronize()
end_time = time.time() end_time = time.time()
return end_time - start_time return end_time - start_time

View File

@@ -4,8 +4,8 @@ build-backend = "scikit_build_core.build"
[project] [project]
name = "leann-backend-diskann" name = "leann-backend-diskann"
version = "0.3.2" version = "0.2.9"
dependencies = ["leann-core==0.3.2", "numpy", "protobuf>=3.19.0"] dependencies = ["leann-core==0.2.9", "numpy", "protobuf>=3.19.0"]
[tool.scikit-build] [tool.scikit-build]
# Key: simplified CMake path # Key: simplified CMake path

View File

@@ -6,10 +6,10 @@ build-backend = "scikit_build_core.build"
[project] [project]
name = "leann-backend-hnsw" name = "leann-backend-hnsw"
version = "0.3.2" version = "0.2.9"
description = "Custom-built HNSW (Faiss) backend for the Leann toolkit." description = "Custom-built HNSW (Faiss) backend for the Leann toolkit."
dependencies = [ dependencies = [
"leann-core==0.3.2", "leann-core==0.2.9",
"numpy", "numpy",
"pyzmq>=23.0.0", "pyzmq>=23.0.0",
"msgpack>=1.0.0", "msgpack>=1.0.0",

View File

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
[project] [project]
name = "leann-core" name = "leann-core"
version = "0.3.2" version = "0.2.9"
description = "Core API and plugin system for LEANN" description = "Core API and plugin system for LEANN"
readme = "README.md" readme = "README.md"
requires-python = ">=3.9" requires-python = ">=3.9"

View File

@@ -46,7 +46,6 @@ def compute_embeddings(
- "sentence-transformers": Use sentence-transformers library (default) - "sentence-transformers": Use sentence-transformers library (default)
- "mlx": Use MLX backend for Apple Silicon - "mlx": Use MLX backend for Apple Silicon
- "openai": Use OpenAI embedding API - "openai": Use OpenAI embedding API
- "gemini": Use Google Gemini embedding API
use_server: Whether to use embedding server (True for search, False for build) use_server: Whether to use embedding server (True for search, False for build)
Returns: Returns:
@@ -614,7 +613,7 @@ class LeannSearcher:
zmq_port=zmq_port, zmq_port=zmq_port,
) )
# logger.info(f" Generated embedding shape: {query_embedding.shape}") # logger.info(f" Generated embedding shape: {query_embedding.shape}")
# time.time() - start_time time.time() - start_time
# logger.info(f" Embedding time: {embedding_time} seconds") # logger.info(f" Embedding time: {embedding_time} seconds")
start_time = time.time() start_time = time.time()
@@ -680,9 +679,8 @@ class LeannSearcher:
This method should be called after you're done using the searcher, This method should be called after you're done using the searcher,
especially in test environments or batch processing scenarios. especially in test environments or batch processing scenarios.
""" """
backend = getattr(self.backend_impl, "embedding_server_manager", None) if hasattr(self.backend_impl, "embedding_server_manager"):
if backend is not None: self.backend_impl.embedding_server_manager.stop_server()
backend.stop_server()
# Enable automatic cleanup patterns # Enable automatic cleanup patterns
def __enter__(self): def __enter__(self):

View File

@@ -680,60 +680,6 @@ class HFChat(LLMInterface):
return response.strip() return response.strip()
class GeminiChat(LLMInterface):
"""LLM interface for Google Gemini models."""
def __init__(self, model: str = "gemini-2.5-flash", api_key: Optional[str] = None):
self.model = model
self.api_key = api_key or os.getenv("GEMINI_API_KEY")
if not self.api_key:
raise ValueError(
"Gemini API key is required. Set GEMINI_API_KEY environment variable or pass api_key parameter."
)
logger.info(f"Initializing Gemini Chat with model='{model}'")
try:
import google.genai as genai
self.client = genai.Client(api_key=self.api_key)
except ImportError:
raise ImportError(
"The 'google-genai' library is required for Gemini models. Please install it with 'uv pip install google-genai'."
)
def ask(self, prompt: str, **kwargs) -> str:
logger.info(f"Sending request to Gemini with model {self.model}")
try:
from google.genai.types import GenerateContentConfig
generation_config = GenerateContentConfig(
temperature=kwargs.get("temperature", 0.7),
max_output_tokens=kwargs.get("max_tokens", 1000),
)
# Handle top_p parameter
if "top_p" in kwargs:
generation_config.top_p = kwargs["top_p"]
response = self.client.models.generate_content(
model=self.model,
contents=prompt,
config=generation_config,
)
# Handle potential None response text
response_text = response.text
if response_text is None:
logger.warning("Gemini returned None response text")
return ""
return response_text.strip()
except Exception as e:
logger.error(f"Error communicating with Gemini: {e}")
return f"Error: Could not get a response from Gemini. Details: {e}"
class OpenAIChat(LLMInterface): class OpenAIChat(LLMInterface):
"""LLM interface for OpenAI models.""" """LLM interface for OpenAI models."""
@@ -847,8 +793,6 @@ def get_llm(llm_config: Optional[dict[str, Any]] = None) -> LLMInterface:
return HFChat(model_name=model or "deepseek-ai/deepseek-llm-7b-chat") return HFChat(model_name=model or "deepseek-ai/deepseek-llm-7b-chat")
elif llm_type == "openai": elif llm_type == "openai":
return OpenAIChat(model=model or "gpt-4o", api_key=llm_config.get("api_key")) return OpenAIChat(model=model or "gpt-4o", api_key=llm_config.get("api_key"))
elif llm_type == "gemini":
return GeminiChat(model=model or "gemini-2.5-flash", api_key=llm_config.get("api_key"))
elif llm_type == "simulated": elif llm_type == "simulated":
return SimulatedChat() return SimulatedChat()
else: else:

View File

@@ -1,14 +1,13 @@
import argparse import argparse
import asyncio import asyncio
from pathlib import Path from pathlib import Path
from typing import Optional, Union from typing import Union
from llama_index.core import SimpleDirectoryReader from llama_index.core import SimpleDirectoryReader
from llama_index.core.node_parser import SentenceSplitter from llama_index.core.node_parser import SentenceSplitter
from tqdm import tqdm from tqdm import tqdm
from .api import LeannBuilder, LeannChat, LeannSearcher from .api import LeannBuilder, LeannChat, LeannSearcher
from .registry import register_project_directory
def extract_pdf_text_with_pymupdf(file_path: str) -> str: def extract_pdf_text_with_pymupdf(file_path: str) -> str:
@@ -85,7 +84,6 @@ Examples:
leann search my-docs "query" # Search in my-docs index leann search my-docs "query" # Search in my-docs index
leann ask my-docs "question" # Ask my-docs index leann ask my-docs "question" # Ask my-docs index
leann list # List all stored indexes leann list # List all stored indexes
leann remove my-docs # Remove an index (local first, then global)
""", """,
) )
@@ -150,12 +148,6 @@ Examples:
type=str, type=str,
help="Comma-separated list of file extensions to include (e.g., '.txt,.pdf,.pptx'). If not specified, uses default supported types.", help="Comma-separated list of file extensions to include (e.g., '.txt,.pdf,.pptx'). If not specified, uses default supported types.",
) )
build_parser.add_argument(
"--include-hidden",
action=argparse.BooleanOptionalAction,
default=False,
help="Include hidden files and directories (paths starting with '.') during indexing (default: false)",
)
build_parser.add_argument( build_parser.add_argument(
"--doc-chunk-size", "--doc-chunk-size",
type=int, type=int,
@@ -206,11 +198,6 @@ Examples:
default="global", default="global",
help="Pruning strategy (default: global)", help="Pruning strategy (default: global)",
) )
search_parser.add_argument(
"--non-interactive",
action="store_true",
help="Non-interactive mode: automatically select index without prompting",
)
# Ask command # Ask command
ask_parser = subparsers.add_parser("ask", help="Ask questions") ask_parser = subparsers.add_parser("ask", help="Ask questions")
@@ -258,18 +245,35 @@ Examples:
# List command # List command
subparsers.add_parser("list", help="List all indexes") subparsers.add_parser("list", help="List all indexes")
# Remove command
remove_parser = subparsers.add_parser("remove", help="Remove an index")
remove_parser.add_argument("index_name", help="Index name to remove")
remove_parser.add_argument(
"--force", "-f", action="store_true", help="Force removal without confirmation"
)
return parser return parser
def register_project_dir(self): def register_project_dir(self):
"""Register current project directory in global registry""" """Register current project directory in global registry"""
register_project_directory() global_registry = Path.home() / ".leann" / "projects.json"
global_registry.parent.mkdir(exist_ok=True)
current_dir = str(Path.cwd())
# Load existing registry
projects = []
if global_registry.exists():
try:
import json
with open(global_registry) as f:
projects = json.load(f)
except Exception:
projects = []
# Add current directory if not already present
if current_dir not in projects:
projects.append(current_dir)
# Save registry
import json
with open(global_registry, "w") as f:
json.dump(projects, f, indent=2)
def _build_gitignore_parser(self, docs_dir: str): def _build_gitignore_parser(self, docs_dir: str):
"""Build gitignore parser using gitignore-parser library.""" """Build gitignore parser using gitignore-parser library."""
@@ -329,6 +333,8 @@ Examples:
return False return False
def list_indexes(self): def list_indexes(self):
print("Stored LEANN indexes:")
# Get all project directories with .leann # Get all project directories with .leann
global_registry = Path.home() / ".leann" / "projects.json" global_registry = Path.home() / ".leann" / "projects.json"
all_projects = [] all_projects = []
@@ -354,485 +360,58 @@ Examples:
if (current_path / ".leann" / "indexes").exists() and current_path not in valid_projects: if (current_path / ".leann" / "indexes").exists() and current_path not in valid_projects:
valid_projects.append(current_path) valid_projects.append(current_path)
# Separate current and other projects if not valid_projects:
other_projects = [] print(
"No indexes found. Use 'leann build <name> --docs <dir> [<dir2> ...]' to create one."
for project_path in valid_projects: )
if project_path != current_path: return
other_projects.append(project_path)
print("📚 LEANN Indexes")
print("=" * 50)
total_indexes = 0 total_indexes = 0
current_indexes_count = 0 current_dir = Path.cwd()
# Show current project first (most important) for project_path in valid_projects:
print("\n🏠 Current Project") indexes_dir = project_path / ".leann" / "indexes"
print(f" {current_path}") if not indexes_dir.exists():
print(" " + "" * 45)
current_indexes = self._discover_indexes_in_project(current_path)
if current_indexes:
for idx in current_indexes:
total_indexes += 1
current_indexes_count += 1
type_icon = "📁" if idx["type"] == "cli" else "📄"
print(f" {current_indexes_count}. {type_icon} {idx['name']} {idx['status']}")
if idx["size_mb"] > 0:
print(f" 📦 Size: {idx['size_mb']:.1f} MB")
else:
print(" 📭 No indexes in current project")
# Show other projects (reference information)
if other_projects:
print("\n\n🗂️ Other Projects")
print(" " + "" * 45)
for project_path in other_projects:
project_indexes = self._discover_indexes_in_project(project_path)
if not project_indexes:
continue
print(f"\n 📂 {project_path.name}")
print(f" {project_path}")
for idx in project_indexes:
total_indexes += 1
type_icon = "📁" if idx["type"] == "cli" else "📄"
print(f"{type_icon} {idx['name']} {idx['status']}")
if idx["size_mb"] > 0:
print(f" 📦 {idx['size_mb']:.1f} MB")
# Summary and usage info
print("\n" + "=" * 50)
if total_indexes == 0:
print("💡 Get started:")
print(" leann build my-docs --docs ./documents")
else:
# Count only projects that have at least one discoverable index
projects_count = sum(
1 for p in valid_projects if len(self._discover_indexes_in_project(p)) > 0
)
print(f"📊 Total: {total_indexes} indexes across {projects_count} projects")
if current_indexes_count > 0:
print("\n💫 Quick start (current project):")
# Get first index from current project for example
current_indexes_dir = current_path / ".leann" / "indexes"
if current_indexes_dir.exists():
current_index_dirs = [d for d in current_indexes_dir.iterdir() if d.is_dir()]
if current_index_dirs:
example_name = current_index_dirs[0].name
print(f' leann search {example_name} "your query"')
print(f" leann ask {example_name} --interactive")
else:
print("\n💡 Create your first index:")
print(" leann build my-docs --docs ./documents")
def _discover_indexes_in_project(self, project_path: Path):
"""Discover all indexes in a project directory (both CLI and apps formats)"""
indexes = []
# 1. CLI format: .leann/indexes/index_name/
cli_indexes_dir = project_path / ".leann" / "indexes"
if cli_indexes_dir.exists():
for index_dir in cli_indexes_dir.iterdir():
if index_dir.is_dir():
meta_file = index_dir / "documents.leann.meta.json"
status = "" if meta_file.exists() else ""
size_mb = 0
if meta_file.exists():
try:
size_mb = sum(
f.stat().st_size for f in index_dir.iterdir() if f.is_file()
) / (1024 * 1024)
except (OSError, PermissionError):
pass
indexes.append(
{
"name": index_dir.name,
"type": "cli",
"status": status,
"size_mb": size_mb,
"path": index_dir,
}
)
# 2. Apps format: *.leann.meta.json files anywhere in the project
cli_indexes_dir = project_path / ".leann" / "indexes"
for meta_file in project_path.rglob("*.leann.meta.json"):
if meta_file.is_file():
# Skip CLI-built indexes (which store meta under .leann/indexes/<name>/)
try:
if cli_indexes_dir.exists() and cli_indexes_dir in meta_file.parents:
continue
except Exception:
pass
# Use the parent directory name as the app index display name
display_name = meta_file.parent.name
# Extract file base used to store files
file_base = meta_file.name.replace(".leann.meta.json", "")
# Apps indexes are considered complete if the .leann.meta.json file exists
status = ""
# Calculate total size of all related files (use file base)
size_mb = 0
try:
index_dir = meta_file.parent
for related_file in index_dir.glob(f"{file_base}.leann*"):
size_mb += related_file.stat().st_size / (1024 * 1024)
except (OSError, PermissionError):
pass
indexes.append(
{
"name": display_name,
"type": "app",
"status": status,
"size_mb": size_mb,
"path": meta_file,
}
)
return indexes
def remove_index(self, index_name: str, force: bool = False):
"""Safely remove an index - always show all matches for transparency"""
# Always do a comprehensive search for safety
print(f"🔍 Searching for all indexes named '{index_name}'...")
all_matches = self._find_all_matching_indexes(index_name)
if not all_matches:
print(f"❌ Index '{index_name}' not found in any project.")
return False
if len(all_matches) == 1:
return self._remove_single_match(all_matches[0], index_name, force)
else:
return self._remove_from_multiple_matches(all_matches, index_name, force)
def _find_all_matching_indexes(self, index_name: str):
"""Find all indexes with the given name across all projects"""
matches = []
# Get all registered projects
global_registry = Path.home() / ".leann" / "projects.json"
all_projects = []
if global_registry.exists():
try:
import json
with open(global_registry) as f:
all_projects = json.load(f)
except Exception:
pass
# Always include current project
current_path = Path.cwd()
if str(current_path) not in all_projects:
all_projects.append(str(current_path))
# Search across all projects
for project_dir in all_projects:
project_path = Path(project_dir)
if not project_path.exists():
continue continue
# 1) CLI-format index under .leann/indexes/<name> index_dirs = [d for d in indexes_dir.iterdir() if d.is_dir()]
index_dir = project_path / ".leann" / "indexes" / index_name if not index_dirs:
if index_dir.exists(): continue
is_current = project_path == current_path
matches.append(
{
"project_path": project_path,
"index_dir": index_dir,
"is_current": is_current,
"kind": "cli",
}
)
# 2) App-format indexes # Show project header
# We support two ways of addressing apps: if project_path == current_dir:
# a) by the file base (e.g., `pdf_documents`) print(f"\n📁 Current project ({project_path}):")
# b) by the parent directory name (e.g., `new_txt`)
seen_app_meta = set()
# 2a) by file base
for meta_file in project_path.rglob(f"{index_name}.leann.meta.json"):
if meta_file.is_file():
# Skip CLI-built indexes' meta under .leann/indexes
try:
cli_indexes_dir = project_path / ".leann" / "indexes"
if cli_indexes_dir.exists() and cli_indexes_dir in meta_file.parents:
continue
except Exception:
pass
is_current = project_path == current_path
key = (str(project_path), str(meta_file))
if key in seen_app_meta:
continue
seen_app_meta.add(key)
matches.append(
{
"project_path": project_path,
"files_dir": meta_file.parent,
"meta_file": meta_file,
"is_current": is_current,
"kind": "app",
"display_name": meta_file.parent.name,
"file_base": meta_file.name.replace(".leann.meta.json", ""),
}
)
# 2b) by parent directory name
for meta_file in project_path.rglob("*.leann.meta.json"):
if meta_file.is_file() and meta_file.parent.name == index_name:
# Skip CLI-built indexes' meta under .leann/indexes
try:
cli_indexes_dir = project_path / ".leann" / "indexes"
if cli_indexes_dir.exists() and cli_indexes_dir in meta_file.parents:
continue
except Exception:
pass
is_current = project_path == current_path
key = (str(project_path), str(meta_file))
if key in seen_app_meta:
continue
seen_app_meta.add(key)
matches.append(
{
"project_path": project_path,
"files_dir": meta_file.parent,
"meta_file": meta_file,
"is_current": is_current,
"kind": "app",
"display_name": meta_file.parent.name,
"file_base": meta_file.name.replace(".leann.meta.json", ""),
}
)
# Sort: current project first, then by project name
matches.sort(key=lambda x: (not x["is_current"], x["project_path"].name))
return matches
def _remove_single_match(self, match, index_name: str, force: bool):
"""Handle removal when only one match is found"""
project_path = match["project_path"]
is_current = match["is_current"]
kind = match.get("kind", "cli")
if is_current:
location_info = "current project"
emoji = "🏠"
else:
location_info = f"other project '{project_path.name}'"
emoji = "📂"
print(f"✅ Found 1 index named '{index_name}':")
print(f" {emoji} Location: {location_info}")
if kind == "cli":
print(f" 📍 Path: {project_path / '.leann' / 'indexes' / index_name}")
else:
print(f" 📍 Meta: {match['meta_file']}")
if not force:
if not is_current:
print("\n⚠️ CROSS-PROJECT REMOVAL!")
print(" This will delete the index from another project.")
response = input(f" ❓ Confirm removal from {location_info}? (y/N): ").strip().lower()
if response not in ["y", "yes"]:
print(" ❌ Removal cancelled.")
return False
if kind == "cli":
return self._delete_index_directory(
match["index_dir"],
index_name,
project_path if not is_current else None,
is_app=False,
)
else:
return self._delete_index_directory(
match["files_dir"],
match.get("display_name", index_name),
project_path if not is_current else None,
is_app=True,
meta_file=match.get("meta_file"),
app_file_base=match.get("file_base"),
)
def _remove_from_multiple_matches(self, matches, index_name: str, force: bool):
"""Handle removal when multiple matches are found"""
print(f"⚠️ Found {len(matches)} indexes named '{index_name}':")
print(" " + "" * 50)
for i, match in enumerate(matches, 1):
project_path = match["project_path"]
is_current = match["is_current"]
kind = match.get("kind", "cli")
if is_current:
print(f" {i}. 🏠 Current project ({'CLI' if kind == 'cli' else 'APP'})")
else: else:
print(f" {i}. 📂 {project_path.name} ({'CLI' if kind == 'cli' else 'APP'})") print(f"\n📂 {project_path}:")
# Show path details for index_dir in index_dirs:
if kind == "cli": total_indexes += 1
print(f" 📍 {project_path / '.leann' / 'indexes' / index_name}") index_name = index_dir.name
else: meta_file = index_dir / "documents.leann.meta.json"
print(f" 📍 {match['meta_file']}") status = "" if meta_file.exists() else ""
# Show size info print(f" {total_indexes}. {index_name} [{status}]")
try: if status == "":
if kind == "cli": size_mb = sum(f.stat().st_size for f in index_dir.iterdir() if f.is_file()) / (
size_mb = sum( 1024 * 1024
f.stat().st_size for f in match["index_dir"].iterdir() if f.is_file()
) / (1024 * 1024)
else:
file_base = match.get("file_base")
size_mb = 0.0
if file_base:
size_mb = sum(
f.stat().st_size
for f in match["files_dir"].glob(f"{file_base}.leann*")
if f.is_file()
) / (1024 * 1024)
print(f" 📦 Size: {size_mb:.1f} MB")
except (OSError, PermissionError):
pass
print(" " + "" * 50)
if force:
print(" ❌ Multiple matches found, but --force specified.")
print(" Please run without --force to choose which one to remove.")
return False
try:
choice = input(
f" ❓ Which one to remove? (1-{len(matches)}, or 'c' to cancel): "
).strip()
if choice.lower() == "c":
print(" ❌ Removal cancelled.")
return False
choice_idx = int(choice) - 1
if 0 <= choice_idx < len(matches):
selected_match = matches[choice_idx]
project_path = selected_match["project_path"]
is_current = selected_match["is_current"]
kind = selected_match.get("kind", "cli")
location = "current project" if is_current else f"'{project_path.name}' project"
print(f" 🎯 Selected: Remove from {location}")
# Final confirmation for safety
confirm = input(
f" ❓ FINAL CONFIRMATION - Type '{index_name}' to proceed: "
).strip()
if confirm != index_name:
print(" ❌ Confirmation failed. Removal cancelled.")
return False
if kind == "cli":
return self._delete_index_directory(
selected_match["index_dir"],
index_name,
project_path if not is_current else None,
is_app=False,
) )
else: print(f" Size: {size_mb:.1f} MB")
return self._delete_index_directory(
selected_match["files_dir"],
selected_match.get("display_name", index_name),
project_path if not is_current else None,
is_app=True,
meta_file=selected_match.get("meta_file"),
app_file_base=selected_match.get("file_base"),
)
else:
print(" ❌ Invalid choice. Removal cancelled.")
return False
except (ValueError, KeyboardInterrupt): if total_indexes > 0:
print("\n ❌ Invalid input. Removal cancelled.") print(f"\nTotal: {total_indexes} indexes across {len(valid_projects)} projects")
return False print("\nUsage (current project only):")
def _delete_index_directory( # Show example from current project
self, current_indexes_dir = current_dir / ".leann" / "indexes"
index_dir: Path, if current_indexes_dir.exists():
index_display_name: str, current_index_dirs = [d for d in current_indexes_dir.iterdir() if d.is_dir()]
project_path: Optional[Path] = None, if current_index_dirs:
is_app: bool = False, example_name = current_index_dirs[0].name
meta_file: Optional[Path] = None, print(f' leann search {example_name} "your query"')
app_file_base: Optional[str] = None, print(f" leann ask {example_name} --interactive")
):
"""Delete a CLI index directory or APP index files safely."""
try:
if is_app:
removed = 0
errors = 0
# Delete only files that belong to this app index (based on file base)
pattern_base = app_file_base or ""
for f in index_dir.glob(f"{pattern_base}.leann*"):
try:
f.unlink()
removed += 1
except Exception:
errors += 1
# Best-effort: also remove the meta file if specified and still exists
if meta_file and meta_file.exists():
try:
meta_file.unlink()
removed += 1
except Exception:
errors += 1
if removed > 0 and errors == 0:
if project_path:
print(
f"✅ App index '{index_display_name}' removed from {project_path.name}"
)
else:
print(f"✅ App index '{index_display_name}' removed successfully")
return True
elif removed > 0 and errors > 0:
print(
f"⚠️ App index '{index_display_name}' partially removed (some files couldn't be deleted)"
)
return True
else:
print(
f"❌ No files found to remove for app index '{index_display_name}' in {index_dir}"
)
return False
else:
import shutil
shutil.rmtree(index_dir)
if project_path:
print(f"✅ Index '{index_display_name}' removed from {project_path.name}")
else:
print(f"✅ Index '{index_display_name}' removed successfully")
return True
except Exception as e:
print(f"❌ Error removing index '{index_display_name}': {e}")
return False
def load_documents( def load_documents(
self, self, docs_paths: Union[str, list], custom_file_types: Union[str, None] = None
docs_paths: Union[str, list],
custom_file_types: Union[str, None] = None,
include_hidden: bool = False,
): ):
# Handle both single path (string) and multiple paths (list) for backward compatibility # Handle both single path (string) and multiple paths (list) for backward compatibility
if isinstance(docs_paths, str): if isinstance(docs_paths, str):
@@ -876,10 +455,6 @@ Examples:
all_documents = [] all_documents = []
# Helper to detect hidden path components
def _path_has_hidden_segment(p: Path) -> bool:
return any(part.startswith(".") and part not in [".", ".."] for part in p.parts)
# First, process individual files if any # First, process individual files if any
if files: if files:
print(f"\n🔄 Processing {len(files)} individual file{'s' if len(files) > 1 else ''}...") print(f"\n🔄 Processing {len(files)} individual file{'s' if len(files) > 1 else ''}...")
@@ -892,12 +467,8 @@ Examples:
files_by_dir = defaultdict(list) files_by_dir = defaultdict(list)
for file_path in files: for file_path in files:
file_path_obj = Path(file_path) parent_dir = str(Path(file_path).parent)
if not include_hidden and _path_has_hidden_segment(file_path_obj): files_by_dir[parent_dir].append(file_path)
print(f" ⚠️ Skipping hidden file: {file_path}")
continue
parent_dir = str(file_path_obj.parent)
files_by_dir[parent_dir].append(str(file_path_obj))
# Load files from each parent directory # Load files from each parent directory
for parent_dir, file_list in files_by_dir.items(): for parent_dir, file_list in files_by_dir.items():
@@ -908,7 +479,6 @@ Examples:
file_docs = SimpleDirectoryReader( file_docs = SimpleDirectoryReader(
parent_dir, parent_dir,
input_files=file_list, input_files=file_list,
# exclude_hidden only affects directory scans; input_files are explicit
filename_as_id=True, filename_as_id=True,
).load_data() ).load_data()
all_documents.extend(file_docs) all_documents.extend(file_docs)
@@ -1007,8 +577,6 @@ Examples:
# Check if file matches any exclude pattern # Check if file matches any exclude pattern
try: try:
relative_path = file_path.relative_to(docs_path) relative_path = file_path.relative_to(docs_path)
if not include_hidden and _path_has_hidden_segment(relative_path):
continue
if self._should_exclude_file(relative_path, gitignore_matches): if self._should_exclude_file(relative_path, gitignore_matches):
continue continue
except ValueError: except ValueError:
@@ -1036,7 +604,6 @@ Examples:
try: try:
default_docs = SimpleDirectoryReader( default_docs = SimpleDirectoryReader(
str(file_path.parent), str(file_path.parent),
exclude_hidden=not include_hidden,
filename_as_id=True, filename_as_id=True,
required_exts=[file_path.suffix], required_exts=[file_path.suffix],
).load_data() ).load_data()
@@ -1065,7 +632,6 @@ Examples:
encoding="utf-8", encoding="utf-8",
required_exts=code_extensions, required_exts=code_extensions,
file_extractor={}, # Use default extractors file_extractor={}, # Use default extractors
exclude_hidden=not include_hidden,
filename_as_id=True, filename_as_id=True,
).load_data(show_progress=True) ).load_data(show_progress=True)
@@ -1215,9 +781,7 @@ Examples:
paragraph_separator="\n\n", paragraph_separator="\n\n",
) )
all_texts = self.load_documents( all_texts = self.load_documents(docs_paths, args.file_types)
docs_paths, args.file_types, include_hidden=args.include_hidden
)
if not all_texts: if not all_texts:
print("No documents found") print("No documents found")
return return
@@ -1249,101 +813,13 @@ Examples:
async def search_documents(self, args): async def search_documents(self, args):
index_name = args.index_name index_name = args.index_name
query = args.query query = args.query
# First try to find the index in current project
index_path = self.get_index_path(index_name) index_path = self.get_index_path(index_name)
if self.index_exists(index_name):
# Found in current project, use it
pass
else:
# Search across all registered projects (like list_indexes does)
all_matches = self._find_all_matching_indexes(index_name)
if not all_matches:
print(
f"Index '{index_name}' not found. Use 'leann build {index_name} --docs <dir> [<dir2> ...]' to create it."
)
return
elif len(all_matches) == 1:
# Found exactly one match, use it
match = all_matches[0]
if match["kind"] == "cli":
index_path = str(match["index_dir"] / "documents.leann")
else:
# App format: use the meta file to construct the path
meta_file = match["meta_file"]
file_base = match["file_base"]
index_path = str(meta_file.parent / f"{file_base}.leann")
project_info = ( if not self.index_exists(index_name):
"current project" print(
if match["is_current"] f"Index '{index_name}' not found. Use 'leann build {index_name} --docs <dir> [<dir2> ...]' to create it."
else f"project '{match['project_path'].name}'" )
) return
print(f"Using index '{index_name}' from {project_info}")
else:
# Multiple matches found
if args.non_interactive:
# Non-interactive mode: automatically select the best match
# Priority: current project first, then first available
current_matches = [m for m in all_matches if m["is_current"]]
if current_matches:
match = current_matches[0]
location_desc = "current project"
else:
match = all_matches[0]
location_desc = f"project '{match['project_path'].name}'"
if match["kind"] == "cli":
index_path = str(match["index_dir"] / "documents.leann")
else:
meta_file = match["meta_file"]
file_base = match["file_base"]
index_path = str(meta_file.parent / f"{file_base}.leann")
print(
f"Found {len(all_matches)} indexes named '{index_name}', using index from {location_desc}"
)
else:
# Interactive mode: ask user to choose
print(f"Found {len(all_matches)} indexes named '{index_name}':")
for i, match in enumerate(all_matches, 1):
project_path = match["project_path"]
is_current = match["is_current"]
kind = match.get("kind", "cli")
if is_current:
print(
f" {i}. 🏠 Current project ({'CLI' if kind == 'cli' else 'APP'})"
)
else:
print(
f" {i}. 📂 {project_path.name} ({'CLI' if kind == 'cli' else 'APP'})"
)
try:
choice = input(f"Which index to search? (1-{len(all_matches)}): ").strip()
choice_idx = int(choice) - 1
if 0 <= choice_idx < len(all_matches):
match = all_matches[choice_idx]
if match["kind"] == "cli":
index_path = str(match["index_dir"] / "documents.leann")
else:
meta_file = match["meta_file"]
file_base = match["file_base"]
index_path = str(meta_file.parent / f"{file_base}.leann")
project_info = (
"current project"
if match["is_current"]
else f"project '{match['project_path'].name}'"
)
print(f"Using index '{index_name}' from {project_info}")
else:
print("Invalid choice. Aborting search.")
return
except (ValueError, KeyboardInterrupt):
print("Invalid input. Aborting search.")
return
searcher = LeannSearcher(index_path=index_path) searcher = LeannSearcher(index_path=index_path)
results = searcher.search( results = searcher.search(
@@ -1442,8 +918,6 @@ Examples:
if args.command == "list": if args.command == "list":
self.list_indexes() self.list_indexes()
elif args.command == "remove":
self.remove_index(args.index_name, args.force)
elif args.command == "build": elif args.command == "build":
await self.build_index(args) await self.build_index(args)
elif args.command == "search": elif args.command == "search":
@@ -1455,15 +929,10 @@ Examples:
def main(): def main():
import logging
import dotenv import dotenv
dotenv.load_dotenv() dotenv.load_dotenv()
# Set clean logging for CLI usage
logging.getLogger().setLevel(logging.WARNING) # Only show warnings and errors
cli = LeannCLI() cli = LeannCLI()
asyncio.run(cli.run()) asyncio.run(cli.run())

View File

@@ -57,8 +57,6 @@ def compute_embeddings(
return compute_embeddings_mlx(texts, model_name) return compute_embeddings_mlx(texts, model_name)
elif mode == "ollama": elif mode == "ollama":
return compute_embeddings_ollama(texts, model_name, is_build=is_build) return compute_embeddings_ollama(texts, model_name, is_build=is_build)
elif mode == "gemini":
return compute_embeddings_gemini(texts, model_name, is_build=is_build)
else: else:
raise ValueError(f"Unsupported embedding mode: {mode}") raise ValueError(f"Unsupported embedding mode: {mode}")
@@ -670,83 +668,3 @@ def compute_embeddings_ollama(
logger.info(f"Generated {len(embeddings)} embeddings, dimension: {embeddings.shape[1]}") logger.info(f"Generated {len(embeddings)} embeddings, dimension: {embeddings.shape[1]}")
return embeddings return embeddings
def compute_embeddings_gemini(
texts: list[str], model_name: str = "text-embedding-004", is_build: bool = False
) -> np.ndarray:
"""
Compute embeddings using Google Gemini API.
Args:
texts: List of texts to compute embeddings for
model_name: Gemini model name (default: "text-embedding-004")
is_build: Whether this is a build operation (shows progress bar)
Returns:
Embeddings array, shape: (len(texts), embedding_dim)
"""
try:
import os
import google.genai as genai
except ImportError as e:
raise ImportError(f"Google GenAI package not installed: {e}")
api_key = os.getenv("GEMINI_API_KEY")
if not api_key:
raise RuntimeError("GEMINI_API_KEY environment variable not set")
# Cache Gemini client
cache_key = "gemini_client"
if cache_key in _model_cache:
client = _model_cache[cache_key]
else:
client = genai.Client(api_key=api_key)
_model_cache[cache_key] = client
logger.info("Gemini client cached")
logger.info(
f"Computing embeddings for {len(texts)} texts using Gemini API, model: '{model_name}'"
)
# Gemini supports batch embedding
max_batch_size = 100 # Conservative batch size for Gemini
all_embeddings = []
try:
from tqdm import tqdm
total_batches = (len(texts) + max_batch_size - 1) // max_batch_size
batch_range = range(0, len(texts), max_batch_size)
batch_iterator = tqdm(
batch_range, desc="Computing embeddings", unit="batch", total=total_batches
)
except ImportError:
# Fallback when tqdm is not available
batch_iterator = range(0, len(texts), max_batch_size)
for i in batch_iterator:
batch_texts = texts[i : i + max_batch_size]
try:
# Use the embed_content method from the new Google GenAI SDK
response = client.models.embed_content(
model=model_name,
contents=batch_texts,
config=genai.types.EmbedContentConfig(
task_type="RETRIEVAL_DOCUMENT" # For document embedding
),
)
# Extract embeddings from response
for embedding_data in response.embeddings:
all_embeddings.append(embedding_data.values)
except Exception as e:
logger.error(f"Batch {i} failed: {e}")
raise
embeddings = np.array(all_embeddings, dtype=np.float32)
logger.info(f"Generated {len(embeddings)} embeddings, dimension: {embeddings.shape[1]}")
return embeddings

View File

@@ -94,7 +94,7 @@ def handle_request(request):
}, },
} }
# Build simplified command with non-interactive flag for MCP compatibility # Build simplified command
cmd = [ cmd = [
"leann", "leann",
"search", "search",
@@ -102,7 +102,6 @@ def handle_request(request):
args["query"], args["query"],
f"--top-k={args.get('top_k', 5)}", f"--top-k={args.get('top_k', 5)}",
f"--complexity={args.get('complexity', 32)}", f"--complexity={args.get('complexity', 32)}",
"--non-interactive",
] ]
result = subprocess.run(cmd, capture_output=True, text=True) result = subprocess.run(cmd, capture_output=True, text=True)

View File

@@ -2,17 +2,11 @@
import importlib import importlib
import importlib.metadata import importlib.metadata
import json from typing import TYPE_CHECKING
import logging
from pathlib import Path
from typing import TYPE_CHECKING, Optional, Union
if TYPE_CHECKING: if TYPE_CHECKING:
from leann.interface import LeannBackendFactoryInterface from leann.interface import LeannBackendFactoryInterface
# Set up logger for this module
logger = logging.getLogger(__name__)
BACKEND_REGISTRY: dict[str, "LeannBackendFactoryInterface"] = {} BACKEND_REGISTRY: dict[str, "LeannBackendFactoryInterface"] = {}
@@ -20,7 +14,7 @@ def register_backend(name: str):
"""A decorator to register a new backend class.""" """A decorator to register a new backend class."""
def decorator(cls): def decorator(cls):
logger.debug(f"Registering backend '{name}'") print(f"INFO: Registering backend '{name}'")
BACKEND_REGISTRY[name] = cls BACKEND_REGISTRY[name] = cls
return cls return cls
@@ -45,54 +39,3 @@ def autodiscover_backends():
# print(f"WARN: Could not import backend module '{backend_module_name}': {e}") # print(f"WARN: Could not import backend module '{backend_module_name}': {e}")
pass pass
# print("INFO: Backend auto-discovery finished.") # print("INFO: Backend auto-discovery finished.")
def register_project_directory(project_dir: Optional[Union[str, Path]] = None):
"""
Register a project directory in the global LEANN registry.
This allows `leann list` to discover indexes created by apps or other tools.
Args:
project_dir: Directory to register. If None, uses current working directory.
"""
if project_dir is None:
project_dir = Path.cwd()
else:
project_dir = Path(project_dir)
# Only register directories that have some kind of LEANN content
# Either .leann/indexes/ (CLI format) or *.leann.meta.json files (apps format)
has_cli_indexes = (project_dir / ".leann" / "indexes").exists()
has_app_indexes = any(project_dir.rglob("*.leann.meta.json"))
if not (has_cli_indexes or has_app_indexes):
# Don't register if there are no LEANN indexes
return
global_registry = Path.home() / ".leann" / "projects.json"
global_registry.parent.mkdir(exist_ok=True)
project_str = str(project_dir.resolve())
# Load existing registry
projects = []
if global_registry.exists():
try:
with open(global_registry) as f:
projects = json.load(f)
except Exception:
logger.debug("Could not load existing project registry")
projects = []
# Add project if not already present
if project_str not in projects:
projects.append(project_str)
# Save updated registry
try:
with open(global_registry, "w") as f:
json.dump(projects, f, indent=2)
logger.debug(f"Registered project directory: {project_str}")
except Exception as e:
logger.warning(f"Could not save project registry: {e}")

View File

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
[project] [project]
name = "leann" name = "leann"
version = "0.3.2" version = "0.2.9"
description = "LEANN - The smallest vector index in the world. RAG Everything with LEANN!" description = "LEANN - The smallest vector index in the world. RAG Everything with LEANN!"
readme = "README.md" readme = "README.md"
requires-python = ">=3.9" requires-python = ">=3.9"

View File

@@ -14,6 +14,8 @@ dependencies = [
"numpy>=1.26.0", "numpy>=1.26.0",
"torch", "torch",
"tqdm", "tqdm",
"flask",
"flask_compress",
"datasets>=2.15.0", "datasets>=2.15.0",
"evaluate", "evaluate",
"colorama", "colorama",
@@ -64,7 +66,9 @@ test = [
"pytest>=7.0", "pytest>=7.0",
"pytest-timeout>=2.0", "pytest-timeout>=2.0",
"llama-index-core>=0.12.0", "llama-index-core>=0.12.0",
"llama-index-readers-file>=0.4.0",
"python-dotenv>=1.0.0", "python-dotenv>=1.0.0",
"sentence-transformers>=2.2.0",
] ]
diskann = [ diskann = [
@@ -96,8 +100,13 @@ leann-backend-hnsw = { path = "packages/leann-backend-hnsw", editable = true }
[tool.ruff] [tool.ruff]
target-version = "py39" target-version = "py39"
line-length = 100 line-length = 100
extend-exclude = ["third_party"] extend-exclude = [
"third_party",
"*.egg-info",
"__pycache__",
".git",
".venv",
]
[tool.ruff.lint] [tool.ruff.lint]
select = [ select = [
@@ -120,12 +129,21 @@ ignore = [
"RUF012", # mutable class attributes should be annotated with typing.ClassVar "RUF012", # mutable class attributes should be annotated with typing.ClassVar
] ]
[tool.ruff.lint.per-file-ignores]
"test/**/*.py" = ["E402"] # module level import not at top of file (common in tests)
"examples/**/*.py" = ["E402"] # module level import not at top of file (common in examples)
[tool.ruff.format] [tool.ruff.format]
quote-style = "double" quote-style = "double"
indent-style = "space" indent-style = "space"
skip-magic-trailing-comma = false skip-magic-trailing-comma = false
line-ending = "auto" line-ending = "auto"
[dependency-groups]
dev = [
"ruff>=0.12.4",
]
[tool.lychee] [tool.lychee]
accept = ["200", "403", "429", "503"] accept = ["200", "403", "429", "503"]
timeout = 20 timeout = 20

7313
uv.lock generated
View File

File diff suppressed because it is too large Load Diff