diff --git a/.github/workflows/build-reusable.yml b/.github/workflows/build-reusable.yml
index a80b5e5..56c46cc 100644
--- a/.github/workflows/build-reusable.yml
+++ b/.github/workflows/build-reusable.yml
@@ -64,6 +64,16 @@ jobs:
             python: '3.12'
           - os: macos-14
             python: '3.13'
+          - os: macos-15
+            python: '3.9'
+          - os: macos-15
+            python: '3.10'
+          - os: macos-15
+            python: '3.11'
+          - os: macos-15
+            python: '3.12'
+          - os: macos-15
+            python: '3.13'
           - os: macos-13
             python: '3.9'
           - os: macos-13
@@ -147,7 +157,14 @@ jobs:
           # Use system clang for better compatibility
           export CC=clang
           export CXX=clang++
-          export MACOSX_DEPLOYMENT_TARGET=11.0
+          # Homebrew libraries on each macOS version require a matching minimum version
+          if [[ "${{ matrix.os }}" == "macos-13" ]]; then
+            export MACOSX_DEPLOYMENT_TARGET=13.0
+          elif [[ "${{ matrix.os }}" == "macos-14" ]]; then
+            export MACOSX_DEPLOYMENT_TARGET=14.0
+          elif [[ "${{ matrix.os }}" == "macos-15" ]]; then
+            export MACOSX_DEPLOYMENT_TARGET=15.0
+          fi
           uv build --wheel --python ${{ matrix.python }} --find-links ${GITHUB_WORKSPACE}/packages/leann-core/dist
         else
           uv build --wheel --python ${{ matrix.python }} --find-links ${GITHUB_WORKSPACE}/packages/leann-core/dist
@@ -161,7 +178,14 @@ jobs:
           export CC=clang
           export CXX=clang++
           # DiskANN requires macOS 13.3+ for the sgesdd_ LAPACK function
-          export MACOSX_DEPLOYMENT_TARGET=13.3
+          # But Homebrew libraries on each macOS version require a matching minimum version
+          if [[ "${{ matrix.os }}" == "macos-13" ]]; then
+            export MACOSX_DEPLOYMENT_TARGET=13.3
+          elif [[ "${{ matrix.os }}" == "macos-14" ]]; then
+            export MACOSX_DEPLOYMENT_TARGET=14.0
+          elif [[ "${{ matrix.os }}" == "macos-15" ]]; then
+            export MACOSX_DEPLOYMENT_TARGET=15.0
+          fi
           uv build --wheel --python ${{ matrix.python }} --find-links ${GITHUB_WORKSPACE}/packages/leann-core/dist
         else
           uv build --wheel --python ${{ matrix.python }} --find-links ${GITHUB_WORKSPACE}/packages/leann-core/dist
@@ -197,10 +221,24 @@ jobs:
     - name: Repair wheels (macOS)
       if: runner.os == 'macOS'
      run: |
+        # Determine deployment target based on runner OS
+        # Must match the Homebrew libraries for each macOS version
+        if [[ "${{ matrix.os }}" == "macos-13" ]]; then
+          HNSW_TARGET="13.0"
+          DISKANN_TARGET="13.3"
+        elif [[ "${{ matrix.os }}" == "macos-14" ]]; then
+          HNSW_TARGET="14.0"
+          DISKANN_TARGET="14.0"
+        elif [[ "${{ matrix.os }}" == "macos-15" ]]; then
+          HNSW_TARGET="15.0"
+          DISKANN_TARGET="15.0"
+        fi
+
        # Repair HNSW wheel
        cd packages/leann-backend-hnsw
        if [ -d dist ]; then
-          delocate-wheel -w dist_repaired -v dist/*.whl
+          export MACOSX_DEPLOYMENT_TARGET=$HNSW_TARGET
+          delocate-wheel -w dist_repaired -v --require-target-macos-version $HNSW_TARGET dist/*.whl
          rm -rf dist
          mv dist_repaired dist
        fi
@@ -209,7 +247,8 @@ jobs:
        # Repair DiskANN wheel
        cd packages/leann-backend-diskann
        if [ -d dist ]; then
-          delocate-wheel -w dist_repaired -v dist/*.whl
+          export MACOSX_DEPLOYMENT_TARGET=$DISKANN_TARGET
+          delocate-wheel -w dist_repaired -v --require-target-macos-version $DISKANN_TARGET dist/*.whl
          rm -rf dist
          mv dist_repaired dist
        fi
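Reviewer note on the workflow changes above: after `delocate-wheel` retags the repaired wheels, each wheel's platform tag should carry the same macOS major version as the runner's Homebrew libraries. A quick local sanity check is sketched below; it is not part of the PR, and the `packages/leann-backend-*/dist` layout simply mirrors the workflow paths.

```python
# Sketch: verify repaired wheels carry the intended deployment target.
# Reads the macOS major version out of each wheel's platform tag
# (e.g. "macosx_14_0_arm64") and compares it against MACOSX_DEPLOYMENT_TARGET.
import os
from pathlib import Path

expected = os.environ.get("MACOSX_DEPLOYMENT_TARGET", "13.0").split(".")[0]

for wheel in Path("packages").glob("leann-backend-*/dist/*.whl"):
    platform_tag = wheel.stem.split("-")[-1]   # last component is the platform tag
    major = platform_tag.split("_")[1]          # "macosx_14_0_arm64" -> "14"
    status = "ok" if major == expected else "MISMATCH"
    print(f"{status}: {wheel.name} (macosx major {major}, expected {expected})")
```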
diff --git a/README.md b/README.md
index 12802f3..be6c827 100755
--- a/README.md
+++ b/README.md
@@ -190,7 +190,7 @@ All RAG examples share these common parameters. **Interactive mode** is available.
 --force-rebuild          # Force rebuild index even if it exists
 
 # Embedding Parameters
---embedding-model MODEL  # e.g., facebook/contriever, text-embedding-3-small, nomic-embed-text, mlx-community/Qwen3-Embedding-0.6B-8bit or nomic-embed-text
+--embedding-model MODEL  # e.g., facebook/contriever, text-embedding-3-small, nomic-embed-text, or mlx-community/Qwen3-Embedding-0.6B-8bit
 --embedding-mode MODE    # sentence-transformers, openai, mlx, or ollama
 
 # LLM Parameters (Text generation models)
@@ -468,7 +468,7 @@ leann --help
 ### Usage Examples
 
 ```bash
-# build from a specific directory, and my_docs is the index name
+# Build from a specific directory; my-docs is the index name (you can also build from multiple directories or files)
 leann build my-docs --docs ./your_documents
 
 # Search your documents
@@ -611,8 +611,9 @@ We welcome more contributors! Feel free to open issues or submit PRs.
 
 This work is done at [**Berkeley Sky Computing Lab**](https://sky.cs.berkeley.edu/).
 
----
+## Star History
+[![Star History Chart](https://api.star-history.com/svg?repos=yichuan-w/LEANN&type=Date)](https://www.star-history.com/#yichuan-w/LEANN&Date)

⭐ Star us on GitHub if Leann is useful for your research or applications!

diff --git a/packages/leann-backend-diskann/pyproject.toml b/packages/leann-backend-diskann/pyproject.toml
index 055a1e7..1918007 100644
--- a/packages/leann-backend-diskann/pyproject.toml
+++ b/packages/leann-backend-diskann/pyproject.toml
@@ -4,8 +4,8 @@ build-backend = "scikit_build_core.build"
 
 [project]
 name = "leann-backend-diskann"
-version = "0.2.7"
-dependencies = ["leann-core==0.2.7", "numpy", "protobuf>=3.19.0"]
+version = "0.2.8"
+dependencies = ["leann-core==0.2.8", "numpy", "protobuf>=3.19.0"]
 
 [tool.scikit-build]
 # Key: simplified CMake path
diff --git a/packages/leann-backend-hnsw/CMakeLists.txt b/packages/leann-backend-hnsw/CMakeLists.txt
index 651792c..12e19ef 100644
--- a/packages/leann-backend-hnsw/CMakeLists.txt
+++ b/packages/leann-backend-hnsw/CMakeLists.txt
@@ -13,7 +13,7 @@ if(APPLE)
   else()
     message(FATAL_ERROR "Could not find libomp installation. Please install with: brew install libomp")
   endif()
-  
+
   set(OpenMP_C_FLAGS "-Xpreprocessor -fopenmp -I${HOMEBREW_PREFIX}/opt/libomp/include")
   set(OpenMP_CXX_FLAGS "-Xpreprocessor -fopenmp -I${HOMEBREW_PREFIX}/opt/libomp/include")
   set(OpenMP_C_LIB_NAMES "omp")
diff --git a/packages/leann-backend-hnsw/pyproject.toml b/packages/leann-backend-hnsw/pyproject.toml
index c3657e6..0b1f04d 100644
--- a/packages/leann-backend-hnsw/pyproject.toml
+++ b/packages/leann-backend-hnsw/pyproject.toml
@@ -6,10 +6,10 @@ build-backend = "scikit_build_core.build"
 
 [project]
 name = "leann-backend-hnsw"
-version = "0.2.7"
+version = "0.2.8"
 description = "Custom-built HNSW (Faiss) backend for the Leann toolkit."
 dependencies = [
-    "leann-core==0.2.7",
+    "leann-core==0.2.8",
     "numpy",
     "pyzmq>=23.0.0",
     "msgpack>=1.0.0",
diff --git a/packages/leann-core/pyproject.toml b/packages/leann-core/pyproject.toml
index 98e7d12..be09d29 100644
--- a/packages/leann-core/pyproject.toml
+++ b/packages/leann-core/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "leann-core"
-version = "0.2.7"
+version = "0.2.8"
 description = "Core API and plugin system for LEANN"
 readme = "README.md"
 requires-python = ">=3.9"
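Because every backend pins `leann-core` with an exact `==` constraint, a release has to bump all of these `pyproject.toml` files in lockstep; a missed bump makes the packages uninstallable together. A small guard like the following could catch that in CI. It is a sketch, not part of the PR; it assumes the `packages/*/pyproject.toml` layout shown above and Python 3.11+ for `tomllib`.

```python
# Sketch: fail fast if the leann packages drift out of version lockstep.
# Assumes the packages/*/pyproject.toml layout above; tomllib needs Python 3.11+.
import sys
import tomllib
from pathlib import Path

versions = {}
for pyproject in Path("packages").glob("*/pyproject.toml"):
    with pyproject.open("rb") as fh:
        project = tomllib.load(fh)["project"]
    versions[project["name"]] = project["version"]

if len(set(versions.values())) != 1:
    sys.exit(f"version mismatch across packages: {versions}")
print(f"all packages at {next(iter(versions.values()))}")
```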
diff --git a/packages/leann-core/src/leann/cli.py b/packages/leann-core/src/leann/cli.py
index 31dca55..9f6911f 100644
--- a/packages/leann-core/src/leann/cli.py
+++ b/packages/leann-core/src/leann/cli.py
@@ -5,6 +5,7 @@ from typing import Union
 
 from llama_index.core import SimpleDirectoryReader
 from llama_index.core.node_parser import SentenceSplitter
+from tqdm import tqdm
 
 from .api import LeannBuilder, LeannChat, LeannSearcher
 
@@ -75,11 +76,14 @@ class LeannCLI:
             formatter_class=argparse.RawDescriptionHelpFormatter,
             epilog="""
Examples:
-  leann build my-docs --docs ./documents                  # Build index named my-docs
-  leann build my-ppts --docs ./ --file-types .pptx,.pdf   # Index only PowerPoint and PDF files
-  leann search my-docs "query"                            # Search in my-docs index
-  leann ask my-docs "question"                            # Ask my-docs index
-  leann list                                              # List all stored indexes
+  leann build my-docs --docs ./documents                        # Build index from directory
+  leann build my-code --docs ./src ./tests ./config             # Build index from multiple directories
+  leann build my-files --docs ./file1.py ./file2.txt ./docs/    # Build index from files and directories
+  leann build my-mixed --docs ./readme.md ./src/ ./config.json  # Build index from mixed files/dirs
+  leann build my-ppts --docs ./ --file-types .pptx,.pdf         # Index only PowerPoint and PDF files
+  leann search my-docs "query"                                  # Search in my-docs index
+  leann ask my-docs "question"                                  # Ask my-docs index
+  leann list                                                    # List all stored indexes
""",
         )
 
@@ -91,7 +95,11 @@ Examples:
             "index_name", nargs="?", help="Index name (default: current directory name)"
         )
         build_parser.add_argument(
-            "--docs", type=str, default=".", help="Documents directory (default: current directory)"
+            "--docs",
+            type=str,
+            nargs="+",
+            default=["."],
+            help="Documents directories and/or files (default: current directory)",
         )
         build_parser.add_argument(
             "--backend", type=str, default="hnsw", choices=["hnsw", "diskann"]
@@ -235,6 +243,32 @@ Examples:
         """Check if a file should be excluded using gitignore parser."""
         return gitignore_matches(str(relative_path))
 
+    def _is_git_submodule(self, path: Path) -> bool:
+        """Check if a path is a git submodule."""
+        try:
+            # Find the git repo root
+            current_dir = Path.cwd()
+            while current_dir != current_dir.parent:
+                if (current_dir / ".git").exists():
+                    gitmodules_path = current_dir / ".gitmodules"
+                    if gitmodules_path.exists():
+                        # Read .gitmodules to check if this path is a submodule
+                        gitmodules_content = gitmodules_path.read_text()
+                        # Convert path to relative to git root
+                        try:
+                            relative_path = path.resolve().relative_to(current_dir)
+                            # Check if this path appears in .gitmodules
+                            return f"path = {relative_path}" in gitmodules_content
+                        except ValueError:
+                            # Path is not under git root
+                            return False
+                    break
+                current_dir = current_dir.parent
+            return False
+        except Exception:
+            # If anything goes wrong, assume it's not a submodule
+            return False
+
     def list_indexes(self):
         print("Stored LEANN indexes:")
 
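Reviewer note: `_is_git_submodule` infers submodule status by substring-matching `path = ...` lines in `.gitmodules`, which can misfire on non-standard formatting or similarly prefixed paths. For comparison only (not part of this PR), a sketch that asks git itself, assuming the `git` CLI is on PATH and the process runs from the repo root:

```python
# Sketch: alternative submodule check that delegates to git's own config parser.
# Assumption: git is installed and we are inside the repository's root directory.
import subprocess
from pathlib import Path

def is_git_submodule(path: Path) -> bool:
    try:
        # Lists entries like "submodule.third_party/foo.path third_party/foo"
        out = subprocess.run(
            ["git", "config", "--file", ".gitmodules",
             "--get-regexp", r"^submodule\..*\.path$"],
            capture_output=True, text=True, check=True,
        ).stdout
    except (subprocess.CalledProcessError, FileNotFoundError):
        return False  # no .gitmodules, or git missing: treat as not a submodule
    submodule_paths = {line.split(maxsplit=1)[1] for line in out.splitlines() if line.strip()}
    return str(path) in submodule_paths
```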
@@ -264,7 +298,9 @@ Examples:
             valid_projects.append(current_path)
 
         if not valid_projects:
-            print("No indexes found. Use 'leann build --docs <directory>' to create one.")
+            print(
+                "No indexes found. Use 'leann build --docs <path> [<path> ...]' to create one."
+            )
             return
 
         total_indexes = 0
@@ -311,56 +347,88 @@ Examples:
             print(f'  leann search {example_name} "your query"')
             print(f"  leann ask {example_name} --interactive")
 
-    def load_documents(self, docs_dir: str, custom_file_types: Union[str, None] = None):
-        print(f"Loading documents from {docs_dir}...")
+    def load_documents(
+        self, docs_paths: Union[str, list], custom_file_types: Union[str, None] = None
+    ):
+        # Handle both single path (string) and multiple paths (list) for backward compatibility
+        if isinstance(docs_paths, str):
+            docs_paths = [docs_paths]
+
+        # Separate files and directories
+        files = []
+        directories = []
+        for path in docs_paths:
+            path_obj = Path(path)
+            if path_obj.is_file():
+                files.append(str(path_obj))
+            elif path_obj.is_dir():
+                # Check if this is a git submodule - if so, skip it
+                if self._is_git_submodule(path_obj):
+                    print(f"⚠️ Skipping git submodule: {path}")
+                    continue
+                directories.append(str(path_obj))
+            else:
+                print(f"⚠️ Warning: Path '{path}' does not exist, skipping...")
+                continue
+
+        # Print summary of what we're processing
+        total_items = len(files) + len(directories)
+        items_desc = []
+        if files:
+            items_desc.append(f"{len(files)} file{'s' if len(files) > 1 else ''}")
+        if directories:
+            items_desc.append(
+                f"{len(directories)} director{'ies' if len(directories) > 1 else 'y'}"
+            )
+
+        print(f"Loading documents from {' and '.join(items_desc)} ({total_items} total):")
+        if files:
+            print(f"  📄 Files: {', '.join([Path(f).name for f in files])}")
+        if directories:
+            print(f"  📁 Directories: {', '.join(directories)}")
+
         if custom_file_types:
             print(f"Using custom file types: {custom_file_types}")
 
-        # Build gitignore parser
-        gitignore_matches = self._build_gitignore_parser(docs_dir)
+        all_documents = []
 
-        # Try to use better PDF parsers first, but only if PDFs are requested
-        documents = []
-        docs_path = Path(docs_dir)
+        # First, process individual files if any
+        if files:
+            print(f"\n🔄 Processing {len(files)} individual file{'s' if len(files) > 1 else ''}...")
 
-        # Check if we should process PDFs
-        should_process_pdfs = custom_file_types is None or ".pdf" in custom_file_types
+            # Load individual files using SimpleDirectoryReader with input_files
+            # Note: We skip gitignore filtering for explicitly specified files
+            try:
+                # Group files by their parent directory for efficient loading
+                from collections import defaultdict
 
-        if should_process_pdfs:
-            for file_path in docs_path.rglob("*.pdf"):
-                # Check if file matches any exclude pattern
-                relative_path = file_path.relative_to(docs_path)
-                if self._should_exclude_file(relative_path, gitignore_matches):
-                    continue
+                files_by_dir = defaultdict(list)
+                for file_path in files:
+                    parent_dir = str(Path(file_path).parent)
+                    files_by_dir[parent_dir].append(file_path)
 
-                print(f"Processing PDF: {file_path}")
-
-                # Try PyMuPDF first (best quality)
-                text = extract_pdf_text_with_pymupdf(str(file_path))
-                if text is None:
-                    # Try pdfplumber
-                    text = extract_pdf_text_with_pdfplumber(str(file_path))
-
-                if text:
-                    # Create a simple document structure
-                    from llama_index.core import Document
-
-                    doc = Document(text=text, metadata={"source": str(file_path)})
-                    documents.append(doc)
-                else:
-                    # Fallback to default reader
-                    print(f"Using default reader for {file_path}")
+                # Load files from each parent directory
+                for parent_dir, file_list in files_by_dir.items():
+                    print(
+                        f"  Loading {len(file_list)} file{'s' if len(file_list) > 1 else ''} from {parent_dir}"
+                    )
                     try:
-                        default_docs = SimpleDirectoryReader(
-                            str(file_path.parent),
+                        file_docs = SimpleDirectoryReader(
+                            parent_dir,
+                            input_files=file_list,
                             filename_as_id=True,
-                            required_exts=[file_path.suffix],
                         ).load_data()
-                        documents.extend(default_docs)
+                        all_documents.extend(file_docs)
+                        print(
+                            f"    ✅ Loaded {len(file_docs)} document{'s' if len(file_docs) > 1 else ''}"
+                        )
                     except Exception as e:
-                        print(f"Warning: Could not process {file_path}: {e}")
+                        print(f"    ❌ Warning: Could not load files from {parent_dir}: {e}")
 
-        # Load other file types with default reader
+            except Exception as e:
+                print(f"❌ Error processing individual files: {e}")
+
+        # Define file extensions to process
         if custom_file_types:
             # Parse custom file types from comma-separated string
             code_extensions = [ext.strip() for ext in custom_file_types.split(",") if ext.strip()]
@@ -422,41 +490,106 @@
             ".py",
             ".jl",
         ]
-        # Try to load other file types, but don't fail if none are found
-        try:
-            # Create a custom file filter function using our PathSpec
-            def file_filter(file_path: str) -> bool:
-                """Return True if file should be included (not excluded)"""
-                try:
-                    docs_path_obj = Path(docs_dir)
-                    file_path_obj = Path(file_path)
-                    relative_path = file_path_obj.relative_to(docs_path_obj)
-                    return not self._should_exclude_file(relative_path, gitignore_matches)
-                except (ValueError, OSError):
-                    return True  # Include files that can't be processed
 
-            other_docs = SimpleDirectoryReader(
-                docs_dir,
-                recursive=True,
-                encoding="utf-8",
-                required_exts=code_extensions,
-                file_extractor={},  # Use default extractors
-                filename_as_id=True,
-            ).load_data(show_progress=True)
+        # Process each directory
+        if directories:
+            print(
+                f"\n🔄 Processing {len(directories)} director{'ies' if len(directories) > 1 else 'y'}..."
+            )
 
-            # Filter documents after loading based on gitignore rules
-            filtered_docs = []
-            for doc in other_docs:
-                file_path = doc.metadata.get("file_path", "")
-                if file_filter(file_path):
-                    filtered_docs.append(doc)
+        for docs_dir in directories:
+            print(f"Processing directory: {docs_dir}")
+            # Build gitignore parser for each directory
+            gitignore_matches = self._build_gitignore_parser(docs_dir)
 
-            documents.extend(filtered_docs)
-        except ValueError as e:
-            if "No files found" in str(e):
-                print("No additional files found for other supported types.")
-            else:
-                raise e
+            # Try to use better PDF parsers first, but only if PDFs are requested
+            documents = []
+            docs_path = Path(docs_dir)
+
+            # Check if we should process PDFs
+            should_process_pdfs = custom_file_types is None or ".pdf" in custom_file_types
+
+            if should_process_pdfs:
+                for file_path in docs_path.rglob("*.pdf"):
+                    # Check if file matches any exclude pattern
+                    try:
+                        relative_path = file_path.relative_to(docs_path)
+                        if self._should_exclude_file(relative_path, gitignore_matches):
+                            continue
+                    except ValueError:
+                        # Skip files that can't be made relative to docs_path
+                        print(f"⚠️ Skipping file outside directory scope: {file_path}")
+                        continue
+
+                    print(f"Processing PDF: {file_path}")
+
+                    # Try PyMuPDF first (best quality)
+                    text = extract_pdf_text_with_pymupdf(str(file_path))
+                    if text is None:
+                        # Try pdfplumber
+                        text = extract_pdf_text_with_pdfplumber(str(file_path))
+
+                    if text:
+                        # Create a simple document structure
+                        from llama_index.core import Document
+
+                        doc = Document(text=text, metadata={"source": str(file_path)})
+                        documents.append(doc)
+                    else:
+                        # Fallback to default reader
+                        print(f"Using default reader for {file_path}")
+                        try:
+                            default_docs = SimpleDirectoryReader(
+                                str(file_path.parent),
+                                filename_as_id=True,
+                                required_exts=[file_path.suffix],
+                            ).load_data()
+                            documents.extend(default_docs)
+                        except Exception as e:
+                            print(f"Warning: Could not process {file_path}: {e}")
+
+            # Load other file types with default reader
+            try:
+                # Create a custom file filter function using our PathSpec
+                def file_filter(
+                    file_path: str, docs_dir=docs_dir, gitignore_matches=gitignore_matches
+                ) -> bool:
+                    """Return True if file should be included (not excluded)"""
+                    try:
+                        docs_path_obj = Path(docs_dir)
+                        file_path_obj = Path(file_path)
+                        relative_path = file_path_obj.relative_to(docs_path_obj)
+                        return not self._should_exclude_file(relative_path, gitignore_matches)
+                    except (ValueError, OSError):
+                        return True  # Include files that can't be processed
+
+                other_docs = SimpleDirectoryReader(
+                    docs_dir,
+                    recursive=True,
+                    encoding="utf-8",
+                    required_exts=code_extensions,
+                    file_extractor={},  # Use default extractors
+                    filename_as_id=True,
+                ).load_data(show_progress=True)
+
+                # Filter documents after loading based on gitignore rules
+                filtered_docs = []
+                for doc in other_docs:
+                    file_path = doc.metadata.get("file_path", "")
+                    if file_filter(file_path):
+                        filtered_docs.append(doc)
+
+                documents.extend(filtered_docs)
+            except ValueError as e:
+                if "No files found" in str(e):
+                    print(f"No additional files found for other supported types in {docs_dir}.")
+                else:
+                    raise e
+
+            all_documents.extend(documents)
+            print(f"Loaded {len(documents)} documents from {docs_dir}")
+
+        documents = all_documents
 
         all_texts = []
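Reviewer note: the per-directory filtering above relies on the CLI's own `_build_gitignore_parser` helper, which this hunk does not show. For readers following along, a minimal equivalent built on the `pathspec` package is sketched below; this is an assumption about behavior, not the PR's actual helper.

```python
# Sketch: gitignore-style exclusion with the pathspec package.
# A stand-in for the _build_gitignore_parser helper not shown in this hunk.
from pathlib import Path

import pathspec

def build_gitignore_matcher(docs_dir: str):
    gitignore = Path(docs_dir) / ".gitignore"
    lines = gitignore.read_text().splitlines() if gitignore.exists() else []
    spec = pathspec.PathSpec.from_lines("gitwildmatch", lines)
    return spec.match_file  # callable: relative path -> True if excluded

matcher = build_gitignore_matcher(".")
print(matcher("build/output.log"))  # True if .gitignore excludes it
```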
@@ -507,7 +640,9 @@
             ".jl",
         }
 
-        for doc in documents:
+        print("Start chunking documents")
+        # Add progress bar for document chunking
+        for doc in tqdm(documents, desc="Chunking documents", unit="doc"):
             # Check if this is a code file based on source path
             source_path = doc.metadata.get("source", "")
             is_code_file = any(source_path.endswith(ext) for ext in code_file_exts)
@@ -523,7 +658,7 @@
 
         return all_texts
 
     async def build_index(self, args):
-        docs_dir = args.docs
+        docs_paths = args.docs
         # Use current directory name if index_name not provided
         if args.index_name:
             index_name = args.index_name
@@ -534,13 +669,25 @@
         index_dir = self.indexes_dir / index_name
         index_path = self.get_index_path(index_name)
 
-        print(f"📂 Indexing: {Path(docs_dir).resolve()}")
+        # Display all paths being indexed with file/directory distinction
+        files = [p for p in docs_paths if Path(p).is_file()]
+        directories = [p for p in docs_paths if Path(p).is_dir()]
+
+        print(f"📂 Indexing {len(docs_paths)} path{'s' if len(docs_paths) > 1 else ''}:")
+        if files:
+            print(f"  📄 Files ({len(files)}):")
+            for i, file_path in enumerate(files, 1):
+                print(f"    {i}. {Path(file_path).resolve()}")
+        if directories:
+            print(f"  📁 Directories ({len(directories)}):")
+            for i, dir_path in enumerate(directories, 1):
+                print(f"    {i}. {Path(dir_path).resolve()}")
 
         if index_dir.exists() and not args.force:
             print(f"Index '{index_name}' already exists. Use --force to rebuild.")
             return
 
-        all_texts = self.load_documents(docs_dir, args.file_types)
+        all_texts = self.load_documents(docs_paths, args.file_types)
         if not all_texts:
             print("No documents found")
             return
@@ -576,7 +723,7 @@
 
         if not self.index_exists(index_name):
             print(
-                f"Index '{index_name}' not found. Use 'leann build {index_name} --docs <directory>' to create it."
+                f"Index '{index_name}' not found. Use 'leann build {index_name} --docs <path> [<path> ...]' to create it."
             )
             return
 
@@ -603,7 +750,7 @@
 
         if not self.index_exists(index_name):
             print(
-                f"Index '{index_name}' not found. Use 'leann build {index_name} --docs <directory>' to create it."
+                f"Index '{index_name}' not found. Use 'leann build {index_name} --docs <path> [<path> ...]' to create it."
             )
             return
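For context on the chunking loop that the `tqdm` progress bar now wraps: non-code documents go through llama_index's `SentenceSplitter`, imported at the top of `cli.py`. A standalone sketch of that splitter is below; the chunk sizes here are illustrative assumptions, not the CLI's actual defaults.

```python
# Sketch: chunking one document with llama_index's SentenceSplitter,
# mirroring the loop above. Chunk sizes are illustrative assumptions.
from llama_index.core import Document
from llama_index.core.node_parser import SentenceSplitter

splitter = SentenceSplitter(chunk_size=512, chunk_overlap=64)
doc = Document(text="LEANN builds a compact vector index. " * 50)
chunks = splitter.split_text(doc.text)
print(f"{len(chunks)} chunks, first chunk starts: {chunks[0][:60]}...")
```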
diff --git a/packages/leann-core/src/leann/embedding_compute.py b/packages/leann-core/src/leann/embedding_compute.py
index 67f33d1..1a19835 100644
--- a/packages/leann-core/src/leann/embedding_compute.py
+++ b/packages/leann-core/src/leann/embedding_compute.py
@@ -6,7 +6,6 @@ Preserves all optimization parameters to ensure performance
 
 import logging
 import os
-from concurrent.futures import ThreadPoolExecutor, as_completed
 from typing import Any
 
 import numpy as np
@@ -374,7 +373,9 @@ def compute_embeddings_ollama(
     texts: list[str], model_name: str, is_build: bool = False, host: str = "http://localhost:11434"
 ) -> np.ndarray:
     """
-    Compute embeddings using Ollama API.
+    Compute embeddings using the Ollama API with simplified batch processing.
+
+    Uses a batch size of 32 for MPS/CPU and 128 for CUDA to optimize performance.
 
     Args:
         texts: List of texts to compute embeddings for
@@ -438,12 +439,19 @@
             if any(emb in base_name for emb in ["embed", "bge", "minilm", "e5"]):
                 embedding_models.append(model)
 
-    # Check if model exists (handle versioned names)
-    model_found = any(
-        model_name == name.split(":")[0] or model_name == name for name in model_names
-    )
+    # Check if model exists (handle versioned names) and resolve to full name
+    resolved_model_name = None
+    for name in model_names:
+        # Exact match
+        if model_name == name:
+            resolved_model_name = name
+            break
+        # Match without version tag (use the versioned name)
+        elif model_name == name.split(":")[0]:
+            resolved_model_name = name
+            break
 
-    if not model_found:
+    if not resolved_model_name:
         error_msg = f"❌ Model '{model_name}' not found in local Ollama.\n\n"
 
         # Suggest pulling the model
@@ -465,6 +473,11 @@
         error_msg += "\n📚 Browse more: https://ollama.com/library"
         raise ValueError(error_msg)
 
+    # Use the resolved model name for all subsequent operations
+    if resolved_model_name != model_name:
+        logger.info(f"Resolved model name '{model_name}' to '{resolved_model_name}'")
+        model_name = resolved_model_name
+
     # Verify the model supports embeddings by testing it
     try:
         test_response = requests.post(
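The resolution loop above lets a bare model name match its versioned tag, so later API calls use the exact name Ollama reports. A standalone sketch of the same idea against Ollama's `/api/tags` endpoint follows; the host and model names are assumptions, and unlike the single-pass loop above, this version explicitly prefers an exact match over a bare-name match.

```python
# Sketch: resolve a bare model name (e.g. "nomic-embed-text") to the full
# versioned tag reported by Ollama's /api/tags (e.g. "nomic-embed-text:latest").
from typing import Optional

import requests

def resolve_ollama_model(model_name: str, host: str = "http://localhost:11434") -> Optional[str]:
    tags = requests.get(f"{host}/api/tags", timeout=10).json()
    names = [m["name"] for m in tags.get("models", [])]
    for name in names:
        if model_name == name:  # exact match wins
            return name
    for name in names:
        if model_name == name.split(":")[0]:  # bare name matches a versioned tag
            return name
    return None

print(resolve_ollama_model("nomic-embed-text"))  # e.g. "nomic-embed-text:latest"
```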
@@ -485,138 +498,148 @@
     except requests.exceptions.RequestException as e:
         logger.warning(f"Could not verify model existence: {e}")
 
-    # Process embeddings with optimized concurrent processing
-    import requests
+    # Determine batch size based on device availability
+    # Check for CUDA/MPS availability using torch if available
+    batch_size = 32  # Default for MPS/CPU
+    try:
+        import torch
 
-    def get_single_embedding(text_idx_tuple):
-        """Helper function to get embedding for a single text."""
-        text, idx = text_idx_tuple
-        max_retries = 3
-        retry_count = 0
+        if torch.cuda.is_available():
+            batch_size = 128  # CUDA gets larger batch size
+        elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
+            batch_size = 32  # MPS gets smaller batch size
+    except ImportError:
+        # If torch is not available, use conservative batch size
+        batch_size = 32
 
-        # Truncate very long texts to avoid API issues
-        truncated_text = text[:8000] if len(text) > 8000 else text
+    logger.info(f"Using batch size: {batch_size}")
 
-        while retry_count < max_retries:
-            try:
-                response = requests.post(
-                    f"{host}/api/embeddings",
-                    json={"model": model_name, "prompt": truncated_text},
-                    timeout=30,
-                )
-                response.raise_for_status()
+    def get_batch_embeddings(batch_texts):
+        """Get embeddings for a batch of texts."""
+        all_embeddings = []
+        failed_indices = []
 
-                result = response.json()
-                embedding = result.get("embedding")
+        for i, text in enumerate(batch_texts):
+            max_retries = 3
+            retry_count = 0
 
-                if embedding is None:
-                    raise ValueError(f"No embedding returned for text {idx}")
-
-                return idx, embedding
-
-            except requests.exceptions.Timeout:
-                retry_count += 1
-                if retry_count >= max_retries:
-                    logger.warning(f"Timeout for text {idx} after {max_retries} retries")
-                    return idx, None
-
-            except Exception as e:
-                if retry_count >= max_retries - 1:
-                    logger.error(f"Failed to get embedding for text {idx}: {e}")
-                    return idx, None
-                retry_count += 1
-
-        return idx, None
-
-    # Determine if we should use concurrent processing
-    use_concurrent = (
-        len(texts) > 5 and not is_build
-    )  # Don't use concurrent in build mode to avoid overwhelming
-    max_workers = min(4, len(texts))  # Limit concurrent requests to avoid overwhelming Ollama
-
-    all_embeddings = [None] * len(texts)  # Pre-allocate list to maintain order
-    failed_indices = []
-
-    if use_concurrent:
-        logger.info(
-            f"Using concurrent processing with {max_workers} workers for {len(texts)} texts"
-        )
-
-        with ThreadPoolExecutor(max_workers=max_workers) as executor:
-            # Submit all tasks
-            future_to_idx = {
-                executor.submit(get_single_embedding, (text, idx)): idx
-                for idx, text in enumerate(texts)
-            }
-
-            # Add progress bar for concurrent processing
-            try:
-                if is_build or len(texts) > 10:
-                    from tqdm import tqdm
-
-                    futures_iterator = tqdm(
-                        as_completed(future_to_idx),
-                        total=len(texts),
-                        desc="Computing Ollama embeddings",
-                    )
-                else:
-                    futures_iterator = as_completed(future_to_idx)
-            except ImportError:
-                futures_iterator = as_completed(future_to_idx)
-
-            # Collect results as they complete
-            for future in futures_iterator:
+            # Truncate very long texts to avoid API issues
+            truncated_text = text[:8000] if len(text) > 8000 else text
+
+            while retry_count < max_retries:
                 try:
-                    idx, embedding = future.result()
-                    if embedding is not None:
-                        all_embeddings[idx] = embedding
-                    else:
-                        failed_indices.append(idx)
+                    response = requests.post(
+                        f"{host}/api/embeddings",
+                        json={"model": model_name, "prompt": truncated_text},
+                        timeout=30,
+                    )
+                    response.raise_for_status()
+
+                    result = response.json()
+                    embedding = result.get("embedding")
+
+                    if embedding is None:
+                        raise ValueError(f"No embedding returned for text {i}")
+
+                    if not isinstance(embedding, list) or len(embedding) == 0:
+                        raise ValueError(f"Invalid embedding format for text {i}")
+
+                    all_embeddings.append(embedding)
+                    break
+
+                except requests.exceptions.Timeout:
+                    retry_count += 1
+                    if retry_count >= max_retries:
+                        logger.warning(f"Timeout for text {i} after {max_retries} retries")
+                        failed_indices.append(i)
+                        all_embeddings.append(None)
+                        break
+
                 except Exception as e:
-                    idx = future_to_idx[future]
-                    logger.error(f"Exception for text {idx}: {e}")
-                    failed_indices.append(idx)
+                    retry_count += 1
+                    if retry_count >= max_retries:
+                        logger.error(f"Failed to get embedding for text {i}: {e}")
+                        failed_indices.append(i)
+                        all_embeddings.append(None)
+                        break
 
+        return all_embeddings, failed_indices
+
+    # Process texts in batches
+    all_embeddings = []
+    all_failed_indices = []
+
+    # Setup progress bar if needed
+    show_progress = is_build or len(texts) > 10
+    try:
+        if show_progress:
+            from tqdm import tqdm
+    except ImportError:
+        show_progress = False
+
+    # Process batches
+    num_batches = (len(texts) + batch_size - 1) // batch_size
+
+    if show_progress:
+        batch_iterator = tqdm(range(num_batches), desc="Computing Ollama embeddings")
     else:
-        # Sequential processing with progress bar
-        show_progress = is_build or len(texts) > 10
+        batch_iterator = range(num_batches)
 
-        try:
-            if show_progress:
-                from tqdm import tqdm
+    for batch_idx in batch_iterator:
+        start_idx = batch_idx * batch_size
+        end_idx = min(start_idx + batch_size, len(texts))
+        batch_texts = texts[start_idx:end_idx]
 
-                iterator = tqdm(
-                    enumerate(texts), total=len(texts), desc="Computing Ollama embeddings"
-                )
-            else:
-                iterator = enumerate(texts)
-        except ImportError:
-            iterator = enumerate(texts)
+        batch_embeddings, batch_failed = get_batch_embeddings(batch_texts)
 
-        for idx, text in iterator:
-            result_idx, embedding = get_single_embedding((text, idx))
-            if embedding is not None:
-                all_embeddings[idx] = embedding
-            else:
-                failed_indices.append(idx)
+        # Adjust failed indices to global indices
+        global_failed = [start_idx + idx for idx in batch_failed]
+        all_failed_indices.extend(global_failed)
+        all_embeddings.extend(batch_embeddings)
 
     # Handle failed embeddings
-    if failed_indices:
-        if len(failed_indices) == len(texts):
+    if all_failed_indices:
+        if len(all_failed_indices) == len(texts):
             raise RuntimeError("Failed to compute any embeddings")
 
-        logger.warning(f"Failed to compute embeddings for {len(failed_indices)}/{len(texts)} texts")
+        logger.warning(
+            f"Failed to compute embeddings for {len(all_failed_indices)}/{len(texts)} texts"
+        )
 
         # Use zero embeddings as fallback for failed ones
         valid_embedding = next((e for e in all_embeddings if e is not None), None)
         if valid_embedding:
             embedding_dim = len(valid_embedding)
-            for idx in failed_indices:
-                all_embeddings[idx] = [0.0] * embedding_dim
+            for i, embedding in enumerate(all_embeddings):
+                if embedding is None:
+                    all_embeddings[i] = [0.0] * embedding_dim
 
-    # Remove None values and convert to numpy array
+    # Remove None values
     all_embeddings = [e for e in all_embeddings if e is not None]
 
+    if not all_embeddings:
+        raise RuntimeError("No valid embeddings were computed")
+
+    # Validate embedding dimensions
+    expected_dim = len(all_embeddings[0])
+    inconsistent_dims = []
+    for i, embedding in enumerate(all_embeddings):
+        if len(embedding) != expected_dim:
+            inconsistent_dims.append((i, len(embedding)))
+
+    if inconsistent_dims:
+        error_msg = f"Ollama returned inconsistent embedding dimensions. Expected {expected_dim}, but got:\n"
+        for idx, dim in inconsistent_dims[:10]:  # Show first 10 inconsistent ones
+            error_msg += f"  - Text {idx}: {dim} dimensions\n"
+        if len(inconsistent_dims) > 10:
+            error_msg += f"  ... and {len(inconsistent_dims) - 10} more\n"
+        error_msg += f"\nThis is likely an Ollama API bug with model '{model_name}'. Please try:\n"
+        error_msg += "1. Restart the Ollama service: 'ollama serve'\n"
+        error_msg += f"2. Re-pull the model: 'ollama pull {model_name}'\n"
+        error_msg += (
+            "3. Use sentence-transformers instead: --embedding-mode sentence-transformers\n"
+        )
+        error_msg += "4. Report this issue to Ollama: https://github.com/ollama/ollama/issues"
+        raise ValueError(error_msg)
+
     # Convert to numpy array and normalize
     embeddings = np.array(all_embeddings, dtype=np.float32)
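Reviewer note on the hunk above: despite the batching, the code still issues one `/api/embeddings` request per text; the batch size only controls grouping and progress reporting. Recent Ollama releases also expose a true batch endpoint, `/api/embed`, which accepts a list input. A hedged sketch combining that endpoint with the same dimension-consistency check the PR adds (this assumes a recent Ollama; older servers only provide the per-text endpoint):

```python
# Sketch: true batch embedding via Ollama's newer /api/embed endpoint, with
# the dimension-consistency check from the PR. Assumes a recent Ollama release.
import numpy as np
import requests

def embed_batch(texts: list, model: str, host: str = "http://localhost:11434") -> np.ndarray:
    resp = requests.post(
        f"{host}/api/embed",
        json={"model": model, "input": texts},  # "input" accepts a list of texts
        timeout=60,
    )
    resp.raise_for_status()
    embeddings = resp.json()["embeddings"]
    dims = {len(e) for e in embeddings}
    if len(dims) != 1:
        raise ValueError(f"inconsistent embedding dimensions from Ollama: {sorted(dims)}")
    return np.array(embeddings, dtype=np.float32)
```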
diff --git a/packages/leann-mcp/README.md b/packages/leann-mcp/README.md
index b762ae9..eb86488 100644
--- a/packages/leann-mcp/README.md
+++ b/packages/leann-mcp/README.md
@@ -45,6 +45,42 @@ leann build my-project --docs ./
 claude
 ```
 
+## 🚀 Advanced Usage Examples
+
+### Index an Entire Git Repository
+```bash
+# Index all tracked files in your git repository (note: git submodules are currently skipped)
+leann build my-repo --docs $(git ls-files) --embedding-mode sentence-transformers --embedding-model all-MiniLM-L6-v2 --backend hnsw
+
+# Index only specific file types from git
+leann build my-python-code --docs $(git ls-files "*.py") --embedding-mode sentence-transformers --embedding-model all-MiniLM-L6-v2 --backend hnsw
+```
+
+### Multiple Directories and Files
+```bash
+# Index multiple directories
+leann build my-codebase --docs ./src ./tests ./docs ./config --embedding-mode sentence-transformers --embedding-model all-MiniLM-L6-v2 --backend hnsw
+
+# Mix files and directories
+leann build my-project --docs ./README.md ./src/ ./package.json ./docs/ --embedding-mode sentence-transformers --embedding-model all-MiniLM-L6-v2 --backend hnsw
+
+# Specific files only
+leann build my-configs --docs ./tsconfig.json ./package.json ./webpack.config.js --embedding-mode sentence-transformers --embedding-model all-MiniLM-L6-v2 --backend hnsw
+```
+
+### Advanced Git Integration
+```bash
+# Index recently modified files
+leann build recent-changes --docs $(git diff --name-only HEAD~10..HEAD) --embedding-mode sentence-transformers --embedding-model all-MiniLM-L6-v2 --backend hnsw
+
+# Index files matching a pattern
+leann build frontend --docs $(git ls-files "*.tsx" "*.ts" "*.jsx" "*.js") --embedding-mode sentence-transformers --embedding-model all-MiniLM-L6-v2 --backend hnsw
+
+# Index documentation and config files
+leann build docs-and-configs --docs $(git ls-files "*.md" "*.yml" "*.yaml" "*.json" "*.toml") --embedding-mode sentence-transformers --embedding-model all-MiniLM-L6-v2 --backend hnsw
+```
+
 **Try this in Claude Code:**
 ```
 Help me understand this codebase. List available indexes and search for authentication patterns.
diff --git a/packages/leann/pyproject.toml b/packages/leann/pyproject.toml
index 52fbeb5..2949afa 100644
--- a/packages/leann/pyproject.toml
+++ b/packages/leann/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "leann"
-version = "0.2.7"
+version = "0.2.8"
 description = "LEANN - The smallest vector index in the world. RAG Everything with LEANN!"
 readme = "README.md"
 requires-python = ">=3.9"
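One caveat for the `$(git ls-files ...)` examples in the leann-mcp README above: shell word-splitting breaks on paths containing spaces. Where that matters, passing NUL-separated output through a small wrapper is safer. A sketch follows; the index name and CLI invocation simply mirror the README examples.

```python
# Sketch: robustly expand git-tracked files into a `leann build` invocation,
# surviving paths with spaces (unlike $(git ls-files) word-splitting).
import subprocess

tracked = subprocess.run(
    ["git", "ls-files", "-z", "*.py"],  # -z: NUL-separated output
    capture_output=True, text=True, check=True,
).stdout.split("\0")
paths = [p for p in tracked if p]

subprocess.run(["leann", "build", "my-python-code", "--docs", *paths], check=True)
```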