diff --git a/.github/workflows/build-reusable.yml b/.github/workflows/build-reusable.yml
index a80b5e5..56c46cc 100644
--- a/.github/workflows/build-reusable.yml
+++ b/.github/workflows/build-reusable.yml
@@ -64,6 +64,16 @@ jobs:
python: '3.12'
- os: macos-14
python: '3.13'
+ - os: macos-15
+ python: '3.9'
+ - os: macos-15
+ python: '3.10'
+ - os: macos-15
+ python: '3.11'
+ - os: macos-15
+ python: '3.12'
+ - os: macos-15
+ python: '3.13'
- os: macos-13
python: '3.9'
- os: macos-13
@@ -147,7 +157,14 @@ jobs:
# Use system clang for better compatibility
export CC=clang
export CXX=clang++
- export MACOSX_DEPLOYMENT_TARGET=11.0
+          # Homebrew libraries on each macOS runner require a matching minimum deployment target
+ if [[ "${{ matrix.os }}" == "macos-13" ]]; then
+ export MACOSX_DEPLOYMENT_TARGET=13.0
+ elif [[ "${{ matrix.os }}" == "macos-14" ]]; then
+ export MACOSX_DEPLOYMENT_TARGET=14.0
+ elif [[ "${{ matrix.os }}" == "macos-15" ]]; then
+ export MACOSX_DEPLOYMENT_TARGET=15.0
+ fi
uv build --wheel --python ${{ matrix.python }} --find-links ${GITHUB_WORKSPACE}/packages/leann-core/dist
else
uv build --wheel --python ${{ matrix.python }} --find-links ${GITHUB_WORKSPACE}/packages/leann-core/dist
@@ -161,7 +178,14 @@ jobs:
export CC=clang
export CXX=clang++
# DiskANN requires macOS 13.3+ for sgesdd_ LAPACK function
- export MACOSX_DEPLOYMENT_TARGET=13.3
+          # But Homebrew libraries on each macOS runner require a matching minimum deployment target
+ if [[ "${{ matrix.os }}" == "macos-13" ]]; then
+ export MACOSX_DEPLOYMENT_TARGET=13.3
+ elif [[ "${{ matrix.os }}" == "macos-14" ]]; then
+ export MACOSX_DEPLOYMENT_TARGET=14.0
+ elif [[ "${{ matrix.os }}" == "macos-15" ]]; then
+ export MACOSX_DEPLOYMENT_TARGET=15.0
+ fi
uv build --wheel --python ${{ matrix.python }} --find-links ${GITHUB_WORKSPACE}/packages/leann-core/dist
else
uv build --wheel --python ${{ matrix.python }} --find-links ${GITHUB_WORKSPACE}/packages/leann-core/dist
@@ -197,10 +221,24 @@ jobs:
- name: Repair wheels (macOS)
if: runner.os == 'macOS'
run: |
+ # Determine deployment target based on runner OS
+ # Must match the Homebrew libraries for each macOS version
+ if [[ "${{ matrix.os }}" == "macos-13" ]]; then
+ HNSW_TARGET="13.0"
+ DISKANN_TARGET="13.3"
+ elif [[ "${{ matrix.os }}" == "macos-14" ]]; then
+ HNSW_TARGET="14.0"
+ DISKANN_TARGET="14.0"
+ elif [[ "${{ matrix.os }}" == "macos-15" ]]; then
+ HNSW_TARGET="15.0"
+ DISKANN_TARGET="15.0"
+ fi
+
# Repair HNSW wheel
cd packages/leann-backend-hnsw
if [ -d dist ]; then
- delocate-wheel -w dist_repaired -v dist/*.whl
+ export MACOSX_DEPLOYMENT_TARGET=$HNSW_TARGET
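+        # --require-target-macos-version should make delocate fail if any bundled library needs a newer macOS than the stated target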
+ delocate-wheel -w dist_repaired -v --require-target-macos-version $HNSW_TARGET dist/*.whl
rm -rf dist
mv dist_repaired dist
fi
@@ -209,7 +247,8 @@ jobs:
# Repair DiskANN wheel
cd packages/leann-backend-diskann
if [ -d dist ]; then
- delocate-wheel -w dist_repaired -v dist/*.whl
+ export MACOSX_DEPLOYMENT_TARGET=$DISKANN_TARGET
+ delocate-wheel -w dist_repaired -v --require-target-macos-version $DISKANN_TARGET dist/*.whl
rm -rf dist
mv dist_repaired dist
fi
diff --git a/README.md b/README.md
index 12802f3..be6c827 100755
--- a/README.md
+++ b/README.md
@@ -190,7 +190,7 @@ All RAG examples share these common parameters. **Interactive mode** is availabl
--force-rebuild # Force rebuild index even if it exists
# Embedding Parameters
---embedding-model MODEL # e.g., facebook/contriever, text-embedding-3-small, nomic-embed-text, mlx-community/Qwen3-Embedding-0.6B-8bit or nomic-embed-text
+--embedding-model MODEL # e.g., facebook/contriever, text-embedding-3-small, nomic-embed-text, or mlx-community/Qwen3-Embedding-0.6B-8bit
--embedding-mode MODE # sentence-transformers, openai, mlx, or ollama
# LLM Parameters (Text generation models)
@@ -468,7 +468,7 @@ leann --help
### Usage Examples
```bash
-# build from a specific directory, and my_docs is the index name
+# build from a specific directory, and my-docs is the index name (you can also build from multiple directories or files)
leann build my-docs --docs ./your_documents
# Search your documents
@@ -611,8 +611,9 @@ We welcome more contributors! Feel free to open issues or submit PRs.
This work is done at [**Berkeley Sky Computing Lab**](https://sky.cs.berkeley.edu/).
----
+## Star History
+[Star History Chart](https://www.star-history.com/#yichuan-w/LEANN&Date)
⭐ Star us on GitHub if Leann is useful for your research or applications!
diff --git a/packages/leann-backend-diskann/pyproject.toml b/packages/leann-backend-diskann/pyproject.toml
index 055a1e7..1918007 100644
--- a/packages/leann-backend-diskann/pyproject.toml
+++ b/packages/leann-backend-diskann/pyproject.toml
@@ -4,8 +4,8 @@ build-backend = "scikit_build_core.build"
[project]
name = "leann-backend-diskann"
-version = "0.2.7"
-dependencies = ["leann-core==0.2.7", "numpy", "protobuf>=3.19.0"]
+version = "0.2.8"
+dependencies = ["leann-core==0.2.8", "numpy", "protobuf>=3.19.0"]
[tool.scikit-build]
# Key: simplified CMake path
diff --git a/packages/leann-backend-hnsw/CMakeLists.txt b/packages/leann-backend-hnsw/CMakeLists.txt
index 651792c..12e19ef 100644
--- a/packages/leann-backend-hnsw/CMakeLists.txt
+++ b/packages/leann-backend-hnsw/CMakeLists.txt
@@ -13,7 +13,7 @@ if(APPLE)
else()
message(FATAL_ERROR "Could not find libomp installation. Please install with: brew install libomp")
endif()
-
+
set(OpenMP_C_FLAGS "-Xpreprocessor -fopenmp -I${HOMEBREW_PREFIX}/opt/libomp/include")
set(OpenMP_CXX_FLAGS "-Xpreprocessor -fopenmp -I${HOMEBREW_PREFIX}/opt/libomp/include")
set(OpenMP_C_LIB_NAMES "omp")
diff --git a/packages/leann-backend-hnsw/pyproject.toml b/packages/leann-backend-hnsw/pyproject.toml
index c3657e6..0b1f04d 100644
--- a/packages/leann-backend-hnsw/pyproject.toml
+++ b/packages/leann-backend-hnsw/pyproject.toml
@@ -6,10 +6,10 @@ build-backend = "scikit_build_core.build"
[project]
name = "leann-backend-hnsw"
-version = "0.2.7"
+version = "0.2.8"
description = "Custom-built HNSW (Faiss) backend for the Leann toolkit."
dependencies = [
- "leann-core==0.2.7",
+ "leann-core==0.2.8",
"numpy",
"pyzmq>=23.0.0",
"msgpack>=1.0.0",
diff --git a/packages/leann-core/pyproject.toml b/packages/leann-core/pyproject.toml
index 98e7d12..be09d29 100644
--- a/packages/leann-core/pyproject.toml
+++ b/packages/leann-core/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
[project]
name = "leann-core"
-version = "0.2.7"
+version = "0.2.8"
description = "Core API and plugin system for LEANN"
readme = "README.md"
requires-python = ">=3.9"
diff --git a/packages/leann-core/src/leann/cli.py b/packages/leann-core/src/leann/cli.py
index 31dca55..9f6911f 100644
--- a/packages/leann-core/src/leann/cli.py
+++ b/packages/leann-core/src/leann/cli.py
@@ -5,6 +5,7 @@ from typing import Union
from llama_index.core import SimpleDirectoryReader
from llama_index.core.node_parser import SentenceSplitter
+from tqdm import tqdm
from .api import LeannBuilder, LeannChat, LeannSearcher
@@ -75,11 +76,14 @@ class LeannCLI:
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
- leann build my-docs --docs ./documents # Build index named my-docs
- leann build my-ppts --docs ./ --file-types .pptx,.pdf # Index only PowerPoint and PDF files
- leann search my-docs "query" # Search in my-docs index
- leann ask my-docs "question" # Ask my-docs index
- leann list # List all stored indexes
+ leann build my-docs --docs ./documents # Build index from directory
+ leann build my-code --docs ./src ./tests ./config # Build index from multiple directories
+ leann build my-files --docs ./file1.py ./file2.txt ./docs/ # Build index from files and directories
+ leann build my-mixed --docs ./readme.md ./src/ ./config.json # Build index from mixed files/dirs
+ leann build my-ppts --docs ./ --file-types .pptx,.pdf # Index only PowerPoint and PDF files
+ leann search my-docs "query" # Search in my-docs index
+ leann ask my-docs "question" # Ask my-docs index
+ leann list # List all stored indexes
""",
)
@@ -91,7 +95,11 @@ Examples:
"index_name", nargs="?", help="Index name (default: current directory name)"
)
build_parser.add_argument(
- "--docs", type=str, default=".", help="Documents directory (default: current directory)"
+ "--docs",
+ type=str,
+ nargs="+",
+ default=["."],
+ help="Documents directories and/or files (default: current directory)",
)
build_parser.add_argument(
"--backend", type=str, default="hnsw", choices=["hnsw", "diskann"]
@@ -235,6 +243,32 @@ Examples:
"""Check if a file should be excluded using gitignore parser."""
return gitignore_matches(str(relative_path))
+ def _is_git_submodule(self, path: Path) -> bool:
+ """Check if a path is a git submodule."""
+ try:
+ # Find the git repo root
+ current_dir = Path.cwd()
+ while current_dir != current_dir.parent:
+ if (current_dir / ".git").exists():
+ gitmodules_path = current_dir / ".gitmodules"
+ if gitmodules_path.exists():
+ # Read .gitmodules to check if this path is a submodule
+ gitmodules_content = gitmodules_path.read_text()
+ # Convert path to relative to git root
+ try:
+ relative_path = path.resolve().relative_to(current_dir)
+ # Check if this path appears in .gitmodules
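+                        # NOTE: assumes the standard "path = ..." formatting git writes to .gitmodules; unusual spacing would not match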
+ return f"path = {relative_path}" in gitmodules_content
+ except ValueError:
+ # Path is not under git root
+ return False
+ break
+ current_dir = current_dir.parent
+ return False
+ except Exception:
+ # If anything goes wrong, assume it's not a submodule
+ return False
+
def list_indexes(self):
print("Stored LEANN indexes:")
@@ -264,7 +298,9 @@ Examples:
valid_projects.append(current_path)
if not valid_projects:
- print("No indexes found. Use 'leann build --docs ' to create one.")
+ print(
+ "No indexes found. Use 'leann build --docs [ ...]' to create one."
+ )
return
total_indexes = 0
@@ -311,56 +347,88 @@ Examples:
print(f' leann search {example_name} "your query"')
print(f" leann ask {example_name} --interactive")
- def load_documents(self, docs_dir: str, custom_file_types: Union[str, None] = None):
- print(f"Loading documents from {docs_dir}...")
+ def load_documents(
+ self, docs_paths: Union[str, list], custom_file_types: Union[str, None] = None
+ ):
+ # Handle both single path (string) and multiple paths (list) for backward compatibility
+ if isinstance(docs_paths, str):
+ docs_paths = [docs_paths]
+
+ # Separate files and directories
+ files = []
+ directories = []
+ for path in docs_paths:
+ path_obj = Path(path)
+ if path_obj.is_file():
+ files.append(str(path_obj))
+ elif path_obj.is_dir():
+ # Check if this is a git submodule - if so, skip it
+ if self._is_git_submodule(path_obj):
+ print(f"⚠️ Skipping git submodule: {path}")
+ continue
+ directories.append(str(path_obj))
+ else:
+ print(f"⚠️ Warning: Path '{path}' does not exist, skipping...")
+ continue
+
+ # Print summary of what we're processing
+ total_items = len(files) + len(directories)
+ items_desc = []
+ if files:
+ items_desc.append(f"{len(files)} file{'s' if len(files) > 1 else ''}")
+ if directories:
+ items_desc.append(
+ f"{len(directories)} director{'ies' if len(directories) > 1 else 'y'}"
+ )
+
+ print(f"Loading documents from {' and '.join(items_desc)} ({total_items} total):")
+ if files:
+ print(f" 📄 Files: {', '.join([Path(f).name for f in files])}")
+ if directories:
+ print(f" 📁 Directories: {', '.join(directories)}")
+
if custom_file_types:
print(f"Using custom file types: {custom_file_types}")
- # Build gitignore parser
- gitignore_matches = self._build_gitignore_parser(docs_dir)
+ all_documents = []
- # Try to use better PDF parsers first, but only if PDFs are requested
- documents = []
- docs_path = Path(docs_dir)
+ # First, process individual files if any
+ if files:
+ print(f"\n🔄 Processing {len(files)} individual file{'s' if len(files) > 1 else ''}...")
- # Check if we should process PDFs
- should_process_pdfs = custom_file_types is None or ".pdf" in custom_file_types
+ # Load individual files using SimpleDirectoryReader with input_files
+ # Note: We skip gitignore filtering for explicitly specified files
+ try:
+ # Group files by their parent directory for efficient loading
+ from collections import defaultdict
- if should_process_pdfs:
- for file_path in docs_path.rglob("*.pdf"):
- # Check if file matches any exclude pattern
- relative_path = file_path.relative_to(docs_path)
- if self._should_exclude_file(relative_path, gitignore_matches):
- continue
+ files_by_dir = defaultdict(list)
+ for file_path in files:
+ parent_dir = str(Path(file_path).parent)
+ files_by_dir[parent_dir].append(file_path)
- print(f"Processing PDF: {file_path}")
-
- # Try PyMuPDF first (best quality)
- text = extract_pdf_text_with_pymupdf(str(file_path))
- if text is None:
- # Try pdfplumber
- text = extract_pdf_text_with_pdfplumber(str(file_path))
-
- if text:
- # Create a simple document structure
- from llama_index.core import Document
-
- doc = Document(text=text, metadata={"source": str(file_path)})
- documents.append(doc)
- else:
- # Fallback to default reader
- print(f"Using default reader for {file_path}")
+ # Load files from each parent directory
+ for parent_dir, file_list in files_by_dir.items():
+ print(
+ f" Loading {len(file_list)} file{'s' if len(file_list) > 1 else ''} from {parent_dir}"
+ )
try:
- default_docs = SimpleDirectoryReader(
- str(file_path.parent),
+ file_docs = SimpleDirectoryReader(
+ parent_dir,
+ input_files=file_list,
filename_as_id=True,
- required_exts=[file_path.suffix],
).load_data()
- documents.extend(default_docs)
+ all_documents.extend(file_docs)
+ print(
+ f" ✅ Loaded {len(file_docs)} document{'s' if len(file_docs) > 1 else ''}"
+ )
except Exception as e:
- print(f"Warning: Could not process {file_path}: {e}")
+ print(f" ❌ Warning: Could not load files from {parent_dir}: {e}")
- # Load other file types with default reader
+ except Exception as e:
+ print(f"❌ Error processing individual files: {e}")
+
+ # Define file extensions to process
if custom_file_types:
# Parse custom file types from comma-separated string
code_extensions = [ext.strip() for ext in custom_file_types.split(",") if ext.strip()]
@@ -422,41 +490,106 @@ Examples:
".py",
".jl",
]
- # Try to load other file types, but don't fail if none are found
- try:
- # Create a custom file filter function using our PathSpec
- def file_filter(file_path: str) -> bool:
- """Return True if file should be included (not excluded)"""
- try:
- docs_path_obj = Path(docs_dir)
- file_path_obj = Path(file_path)
- relative_path = file_path_obj.relative_to(docs_path_obj)
- return not self._should_exclude_file(relative_path, gitignore_matches)
- except (ValueError, OSError):
- return True # Include files that can't be processed
- other_docs = SimpleDirectoryReader(
- docs_dir,
- recursive=True,
- encoding="utf-8",
- required_exts=code_extensions,
- file_extractor={}, # Use default extractors
- filename_as_id=True,
- ).load_data(show_progress=True)
+ # Process each directory
+ if directories:
+ print(
+ f"\n🔄 Processing {len(directories)} director{'ies' if len(directories) > 1 else 'y'}..."
+ )
- # Filter documents after loading based on gitignore rules
- filtered_docs = []
- for doc in other_docs:
- file_path = doc.metadata.get("file_path", "")
- if file_filter(file_path):
- filtered_docs.append(doc)
+ for docs_dir in directories:
+ print(f"Processing directory: {docs_dir}")
+ # Build gitignore parser for each directory
+ gitignore_matches = self._build_gitignore_parser(docs_dir)
- documents.extend(filtered_docs)
- except ValueError as e:
- if "No files found" in str(e):
- print("No additional files found for other supported types.")
- else:
- raise e
+ # Try to use better PDF parsers first, but only if PDFs are requested
+ documents = []
+ docs_path = Path(docs_dir)
+
+ # Check if we should process PDFs
+ should_process_pdfs = custom_file_types is None or ".pdf" in custom_file_types
+
+ if should_process_pdfs:
+ for file_path in docs_path.rglob("*.pdf"):
+ # Check if file matches any exclude pattern
+ try:
+ relative_path = file_path.relative_to(docs_path)
+ if self._should_exclude_file(relative_path, gitignore_matches):
+ continue
+ except ValueError:
+ # Skip files that can't be made relative to docs_path
+ print(f"⚠️ Skipping file outside directory scope: {file_path}")
+ continue
+
+ print(f"Processing PDF: {file_path}")
+
+ # Try PyMuPDF first (best quality)
+ text = extract_pdf_text_with_pymupdf(str(file_path))
+ if text is None:
+ # Try pdfplumber
+ text = extract_pdf_text_with_pdfplumber(str(file_path))
+
+ if text:
+ # Create a simple document structure
+ from llama_index.core import Document
+
+ doc = Document(text=text, metadata={"source": str(file_path)})
+ documents.append(doc)
+ else:
+ # Fallback to default reader
+ print(f"Using default reader for {file_path}")
+ try:
+ default_docs = SimpleDirectoryReader(
+ str(file_path.parent),
+ filename_as_id=True,
+ required_exts=[file_path.suffix],
+ ).load_data()
+ documents.extend(default_docs)
+ except Exception as e:
+ print(f"Warning: Could not process {file_path}: {e}")
+
+ # Load other file types with default reader
+ try:
+ # Create a custom file filter function using our PathSpec
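+                # docs_dir and gitignore_matches are bound as default arguments so each directory's filter keeps its own values (avoids late-binding closure issues in this loop)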
+ def file_filter(
+ file_path: str, docs_dir=docs_dir, gitignore_matches=gitignore_matches
+ ) -> bool:
+ """Return True if file should be included (not excluded)"""
+ try:
+ docs_path_obj = Path(docs_dir)
+ file_path_obj = Path(file_path)
+ relative_path = file_path_obj.relative_to(docs_path_obj)
+ return not self._should_exclude_file(relative_path, gitignore_matches)
+ except (ValueError, OSError):
+ return True # Include files that can't be processed
+
+ other_docs = SimpleDirectoryReader(
+ docs_dir,
+ recursive=True,
+ encoding="utf-8",
+ required_exts=code_extensions,
+ file_extractor={}, # Use default extractors
+ filename_as_id=True,
+ ).load_data(show_progress=True)
+
+ # Filter documents after loading based on gitignore rules
+ filtered_docs = []
+ for doc in other_docs:
+ file_path = doc.metadata.get("file_path", "")
+ if file_filter(file_path):
+ filtered_docs.append(doc)
+
+ documents.extend(filtered_docs)
+ except ValueError as e:
+ if "No files found" in str(e):
+ print(f"No additional files found for other supported types in {docs_dir}.")
+ else:
+ raise e
+
+ all_documents.extend(documents)
+ print(f"Loaded {len(documents)} documents from {docs_dir}")
+
+ documents = all_documents
all_texts = []
@@ -507,7 +640,9 @@ Examples:
".jl",
}
- for doc in documents:
+ print("start chunking documents")
+ # Add progress bar for document chunking
+ for doc in tqdm(documents, desc="Chunking documents", unit="doc"):
# Check if this is a code file based on source path
source_path = doc.metadata.get("source", "")
is_code_file = any(source_path.endswith(ext) for ext in code_file_exts)
@@ -523,7 +658,7 @@ Examples:
return all_texts
async def build_index(self, args):
- docs_dir = args.docs
+ docs_paths = args.docs
# Use current directory name if index_name not provided
if args.index_name:
index_name = args.index_name
@@ -534,13 +669,25 @@ Examples:
index_dir = self.indexes_dir / index_name
index_path = self.get_index_path(index_name)
- print(f"📂 Indexing: {Path(docs_dir).resolve()}")
+ # Display all paths being indexed with file/directory distinction
+ files = [p for p in docs_paths if Path(p).is_file()]
+ directories = [p for p in docs_paths if Path(p).is_dir()]
+
+ print(f"📂 Indexing {len(docs_paths)} path{'s' if len(docs_paths) > 1 else ''}:")
+ if files:
+ print(f" 📄 Files ({len(files)}):")
+ for i, file_path in enumerate(files, 1):
+ print(f" {i}. {Path(file_path).resolve()}")
+ if directories:
+ print(f" 📁 Directories ({len(directories)}):")
+ for i, dir_path in enumerate(directories, 1):
+ print(f" {i}. {Path(dir_path).resolve()}")
if index_dir.exists() and not args.force:
print(f"Index '{index_name}' already exists. Use --force to rebuild.")
return
- all_texts = self.load_documents(docs_dir, args.file_types)
+ all_texts = self.load_documents(docs_paths, args.file_types)
if not all_texts:
print("No documents found")
return
@@ -576,7 +723,7 @@ Examples:
if not self.index_exists(index_name):
print(
- f"Index '{index_name}' not found. Use 'leann build {index_name} --docs ' to create it."
+ f"Index '{index_name}' not found. Use 'leann build {index_name} --docs [ ...]' to create it."
)
return
@@ -603,7 +750,7 @@ Examples:
if not self.index_exists(index_name):
print(
- f"Index '{index_name}' not found. Use 'leann build {index_name} --docs ' to create it."
+ f"Index '{index_name}' not found. Use 'leann build {index_name} --docs [ ...]' to create it."
)
return
diff --git a/packages/leann-core/src/leann/embedding_compute.py b/packages/leann-core/src/leann/embedding_compute.py
index 67f33d1..1a19835 100644
--- a/packages/leann-core/src/leann/embedding_compute.py
+++ b/packages/leann-core/src/leann/embedding_compute.py
@@ -6,7 +6,6 @@ Preserves all optimization parameters to ensure performance
import logging
import os
-from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Any
import numpy as np
@@ -374,7 +373,9 @@ def compute_embeddings_ollama(
texts: list[str], model_name: str, is_build: bool = False, host: str = "http://localhost:11434"
) -> np.ndarray:
"""
- Compute embeddings using Ollama API.
+ Compute embeddings using Ollama API with simplified batch processing.
+
+    Uses a batch size of 32 for MPS/CPU and 128 for CUDA.
Args:
texts: List of texts to compute embeddings for
@@ -438,12 +439,19 @@ def compute_embeddings_ollama(
if any(emb in base_name for emb in ["embed", "bge", "minilm", "e5"]):
embedding_models.append(model)
- # Check if model exists (handle versioned names)
- model_found = any(
- model_name == name.split(":")[0] or model_name == name for name in model_names
- )
+ # Check if model exists (handle versioned names) and resolve to full name
+ resolved_model_name = None
+ for name in model_names:
+ # Exact match
+ if model_name == name:
+ resolved_model_name = name
+ break
+ # Match without version tag (use the versioned name)
+ elif model_name == name.split(":")[0]:
+ resolved_model_name = name
+ break
- if not model_found:
+ if not resolved_model_name:
error_msg = f"❌ Model '{model_name}' not found in local Ollama.\n\n"
# Suggest pulling the model
@@ -465,6 +473,11 @@ def compute_embeddings_ollama(
error_msg += "\n📚 Browse more: https://ollama.com/library"
raise ValueError(error_msg)
+ # Use the resolved model name for all subsequent operations
+ if resolved_model_name != model_name:
+ logger.info(f"Resolved model name '{model_name}' to '{resolved_model_name}'")
+ model_name = resolved_model_name
+
# Verify the model supports embeddings by testing it
try:
test_response = requests.post(
@@ -485,138 +498,148 @@ def compute_embeddings_ollama(
except requests.exceptions.RequestException as e:
logger.warning(f"Could not verify model existence: {e}")
- # Process embeddings with optimized concurrent processing
- import requests
+ # Determine batch size based on device availability
+ # Check for CUDA/MPS availability using torch if available
+ batch_size = 32 # Default for MPS/CPU
+ try:
+ import torch
- def get_single_embedding(text_idx_tuple):
- """Helper function to get embedding for a single text."""
- text, idx = text_idx_tuple
- max_retries = 3
- retry_count = 0
+ if torch.cuda.is_available():
+ batch_size = 128 # CUDA gets larger batch size
+ elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
+ batch_size = 32 # MPS gets smaller batch size
+ except ImportError:
+ # If torch is not available, use conservative batch size
+ batch_size = 32
- # Truncate very long texts to avoid API issues
- truncated_text = text[:8000] if len(text) > 8000 else text
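+    # Note: each text is still sent as its own /api/embeddings request; the batch size only controls how texts are grouped for progress reporting and failure handling.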
+ logger.info(f"Using batch size: {batch_size}")
- while retry_count < max_retries:
- try:
- response = requests.post(
- f"{host}/api/embeddings",
- json={"model": model_name, "prompt": truncated_text},
- timeout=30,
- )
- response.raise_for_status()
+ def get_batch_embeddings(batch_texts):
+ """Get embeddings for a batch of texts."""
+ all_embeddings = []
+ failed_indices = []
- result = response.json()
- embedding = result.get("embedding")
+ for i, text in enumerate(batch_texts):
+ max_retries = 3
+ retry_count = 0
- if embedding is None:
- raise ValueError(f"No embedding returned for text {idx}")
-
- return idx, embedding
-
- except requests.exceptions.Timeout:
- retry_count += 1
- if retry_count >= max_retries:
- logger.warning(f"Timeout for text {idx} after {max_retries} retries")
- return idx, None
-
- except Exception as e:
- if retry_count >= max_retries - 1:
- logger.error(f"Failed to get embedding for text {idx}: {e}")
- return idx, None
- retry_count += 1
-
- return idx, None
-
- # Determine if we should use concurrent processing
- use_concurrent = (
- len(texts) > 5 and not is_build
- ) # Don't use concurrent in build mode to avoid overwhelming
- max_workers = min(4, len(texts)) # Limit concurrent requests to avoid overwhelming Ollama
-
- all_embeddings = [None] * len(texts) # Pre-allocate list to maintain order
- failed_indices = []
-
- if use_concurrent:
- logger.info(
- f"Using concurrent processing with {max_workers} workers for {len(texts)} texts"
- )
-
- with ThreadPoolExecutor(max_workers=max_workers) as executor:
- # Submit all tasks
- future_to_idx = {
- executor.submit(get_single_embedding, (text, idx)): idx
- for idx, text in enumerate(texts)
- }
-
- # Add progress bar for concurrent processing
- try:
- if is_build or len(texts) > 10:
- from tqdm import tqdm
-
- futures_iterator = tqdm(
- as_completed(future_to_idx),
- total=len(texts),
- desc="Computing Ollama embeddings",
- )
- else:
- futures_iterator = as_completed(future_to_idx)
- except ImportError:
- futures_iterator = as_completed(future_to_idx)
-
- # Collect results as they complete
- for future in futures_iterator:
+ # Truncate very long texts to avoid API issues
+ truncated_text = text[:8000] if len(text) > 8000 else text
+ while retry_count < max_retries:
try:
- idx, embedding = future.result()
- if embedding is not None:
- all_embeddings[idx] = embedding
- else:
- failed_indices.append(idx)
+ response = requests.post(
+ f"{host}/api/embeddings",
+ json={"model": model_name, "prompt": truncated_text},
+ timeout=30,
+ )
+ response.raise_for_status()
+
+ result = response.json()
+ embedding = result.get("embedding")
+
+ if embedding is None:
+ raise ValueError(f"No embedding returned for text {i}")
+
+ if not isinstance(embedding, list) or len(embedding) == 0:
+ raise ValueError(f"Invalid embedding format for text {i}")
+
+ all_embeddings.append(embedding)
+ break
+
+ except requests.exceptions.Timeout:
+ retry_count += 1
+ if retry_count >= max_retries:
+ logger.warning(f"Timeout for text {i} after {max_retries} retries")
+ failed_indices.append(i)
+ all_embeddings.append(None)
+ break
+
except Exception as e:
- idx = future_to_idx[future]
- logger.error(f"Exception for text {idx}: {e}")
- failed_indices.append(idx)
+ retry_count += 1
+ if retry_count >= max_retries:
+ logger.error(f"Failed to get embedding for text {i}: {e}")
+ failed_indices.append(i)
+ all_embeddings.append(None)
+ break
+ return all_embeddings, failed_indices
+ # Process texts in batches
+ all_embeddings = []
+ all_failed_indices = []
+
+ # Setup progress bar if needed
+ show_progress = is_build or len(texts) > 10
+ try:
+ if show_progress:
+ from tqdm import tqdm
+ except ImportError:
+ show_progress = False
+
+ # Process batches
+ num_batches = (len(texts) + batch_size - 1) // batch_size
+
+ if show_progress:
+ batch_iterator = tqdm(range(num_batches), desc="Computing Ollama embeddings")
else:
- # Sequential processing with progress bar
- show_progress = is_build or len(texts) > 10
+ batch_iterator = range(num_batches)
- try:
- if show_progress:
- from tqdm import tqdm
+ for batch_idx in batch_iterator:
+ start_idx = batch_idx * batch_size
+ end_idx = min(start_idx + batch_size, len(texts))
+ batch_texts = texts[start_idx:end_idx]
- iterator = tqdm(
- enumerate(texts), total=len(texts), desc="Computing Ollama embeddings"
- )
- else:
- iterator = enumerate(texts)
- except ImportError:
- iterator = enumerate(texts)
+ batch_embeddings, batch_failed = get_batch_embeddings(batch_texts)
- for idx, text in iterator:
- result_idx, embedding = get_single_embedding((text, idx))
- if embedding is not None:
- all_embeddings[idx] = embedding
- else:
- failed_indices.append(idx)
+ # Adjust failed indices to global indices
+ global_failed = [start_idx + idx for idx in batch_failed]
+ all_failed_indices.extend(global_failed)
+ all_embeddings.extend(batch_embeddings)
# Handle failed embeddings
- if failed_indices:
- if len(failed_indices) == len(texts):
+ if all_failed_indices:
+ if len(all_failed_indices) == len(texts):
raise RuntimeError("Failed to compute any embeddings")
- logger.warning(f"Failed to compute embeddings for {len(failed_indices)}/{len(texts)} texts")
+ logger.warning(
+ f"Failed to compute embeddings for {len(all_failed_indices)}/{len(texts)} texts"
+ )
# Use zero embeddings as fallback for failed ones
valid_embedding = next((e for e in all_embeddings if e is not None), None)
if valid_embedding:
embedding_dim = len(valid_embedding)
- for idx in failed_indices:
- all_embeddings[idx] = [0.0] * embedding_dim
+ for i, embedding in enumerate(all_embeddings):
+ if embedding is None:
+ all_embeddings[i] = [0.0] * embedding_dim
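+            # Zero-vector fallbacks keep the output shape consistent; the affected texts will effectively never be retrieved at search time.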
- # Remove None values and convert to numpy array
+ # Remove None values
all_embeddings = [e for e in all_embeddings if e is not None]
+ if not all_embeddings:
+ raise RuntimeError("No valid embeddings were computed")
+
+ # Validate embedding dimensions
+ expected_dim = len(all_embeddings[0])
+ inconsistent_dims = []
+ for i, embedding in enumerate(all_embeddings):
+ if len(embedding) != expected_dim:
+ inconsistent_dims.append((i, len(embedding)))
+
+ if inconsistent_dims:
+ error_msg = f"Ollama returned inconsistent embedding dimensions. Expected {expected_dim}, but got:\n"
+ for idx, dim in inconsistent_dims[:10]: # Show first 10 inconsistent ones
+ error_msg += f" - Text {idx}: {dim} dimensions\n"
+ if len(inconsistent_dims) > 10:
+ error_msg += f" ... and {len(inconsistent_dims) - 10} more\n"
+ error_msg += f"\nThis is likely an Ollama API bug with model '{model_name}'. Please try:\n"
+ error_msg += "1. Restart Ollama service: 'ollama serve'\n"
+ error_msg += f"2. Re-pull the model: 'ollama pull {model_name}'\n"
+ error_msg += (
+ "3. Use sentence-transformers instead: --embedding-mode sentence-transformers\n"
+ )
+ error_msg += "4. Report this issue to Ollama: https://github.com/ollama/ollama/issues"
+ raise ValueError(error_msg)
+
# Convert to numpy array and normalize
embeddings = np.array(all_embeddings, dtype=np.float32)
diff --git a/packages/leann-mcp/README.md b/packages/leann-mcp/README.md
index b762ae9..eb86488 100644
--- a/packages/leann-mcp/README.md
+++ b/packages/leann-mcp/README.md
@@ -45,6 +45,42 @@ leann build my-project --docs ./
claude
```
+## 🚀 Advanced Usage Examples
+
+### Index Entire Git Repository
+```bash
+# Index all tracked files in your git repository (note: git submodules are currently skipped)
+leann build my-repo --docs $(git ls-files) --embedding-mode sentence-transformers --embedding-model all-MiniLM-L6-v2 --backend hnsw
+
+# Index only specific file types from git
+leann build my-python-code --docs $(git ls-files "*.py") --embedding-mode sentence-transformers --embedding-model all-MiniLM-L6-v2 --backend hnsw
+```
+
+### Multiple Directories and Files
+```bash
+# Index multiple directories
+leann build my-codebase --docs ./src ./tests ./docs ./config --embedding-mode sentence-transformers --embedding-model all-MiniLM-L6-v2 --backend hnsw
+
+# Mix files and directories
+leann build my-project --docs ./README.md ./src/ ./package.json ./docs/ --embedding-mode sentence-transformers --embedding-model all-MiniLM-L6-v2 --backend hnsw
+
+# Specific files only
+leann build my-configs --docs ./tsconfig.json ./package.json ./webpack.config.js --embedding-mode sentence-transformers --embedding-model all-MiniLM-L6-v2 --backend hnsw
+```
+
+### Advanced Git Integration
+```bash
+# Index recently modified files
+leann build recent-changes --docs $(git diff --name-only HEAD~10..HEAD) --embedding-mode sentence-transformers --embedding-model all-MiniLM-L6-v2 --backend hnsw
+
+# Index files matching pattern
+leann build frontend --docs $(git ls-files "*.tsx" "*.ts" "*.jsx" "*.js") --embedding-mode sentence-transformers --embedding-model all-MiniLM-L6-v2 --backend hnsw
+
+# Index documentation and config files
+leann build docs-and-configs --docs $(git ls-files "*.md" "*.yml" "*.yaml" "*.json" "*.toml") --embedding-mode sentence-transformers --embedding-model all-MiniLM-L6-v2 --backend hnsw
+```
+
+
**Try this in Claude Code:**
```
Help me understand this codebase. List available indexes and search for authentication patterns.
diff --git a/packages/leann/pyproject.toml b/packages/leann/pyproject.toml
index 52fbeb5..2949afa 100644
--- a/packages/leann/pyproject.toml
+++ b/packages/leann/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
[project]
name = "leann"
-version = "0.2.7"
+version = "0.2.8"
description = "LEANN - The smallest vector index in the world. RAG Everything with LEANN!"
readme = "README.md"
requires-python = ">=3.9"