Compare commits

..

2 Commits

Author SHA1 Message Date
yichuan520030910320
2c3824e7b6 feat: support multiple input formats for --docs argument
- Add support for multiple directories: --docs ./src ./tests ./config
- Add support for individual files: --docs ./file1.py ./file2.txt
- Add support for mixed files and directories: --docs ./README.md ./src/ ./config.json
- Add git ls-files integration: --docs $(git ls-files)
- Add git submodule detection and skip logic to avoid indexing third-party dependencies
- Add comprehensive error handling for path resolution issues
- Update MCP README with advanced usage examples including git integration
- Fix ruff linting issues with closure variable binding

Breaking changes: None - fully backward compatible with existing single directory usage

Examples:
  leann build my-repo --docs $(git ls-files) --embedding-mode sentence-transformers --embedding-model all-MiniLM-L6-v2 --backend hnsw
  leann build my-code --docs ./src ./tests ./docs --embedding-mode sentence-transformers --embedding-model all-MiniLM-L6-v2 --backend hnsw
  leann build my-configs --docs ./package.json ./tsconfig.json ./webpack.config.js --embedding-mode sentence-transformers --embedding-model all-MiniLM-L6-v2 --backend hnsw
2025-08-12 02:04:44 -07:00
yichuan520030910320
b2390ccc14 [Ollama] fix ollama recompute 2025-08-12 00:24:20 -07:00
6 changed files with 410 additions and 268 deletions

View File

@@ -64,16 +64,6 @@ jobs:
          python: '3.12'
        - os: macos-14
          python: '3.13'
-       - os: macos-15
-         python: '3.9'
-       - os: macos-15
-         python: '3.10'
-       - os: macos-15
-         python: '3.11'
-       - os: macos-15
-         python: '3.12'
-       - os: macos-15
-         python: '3.13'
        - os: macos-13
          python: '3.9'
        - os: macos-13
@@ -157,14 +147,7 @@ jobs:
          # Use system clang for better compatibility
          export CC=clang
          export CXX=clang++
-         # Homebrew libraries on each macOS version require matching minimum version
-         if [[ "${{ matrix.os }}" == "macos-13" ]]; then
-           export MACOSX_DEPLOYMENT_TARGET=13.0
-         elif [[ "${{ matrix.os }}" == "macos-14" ]]; then
-           export MACOSX_DEPLOYMENT_TARGET=14.0
-         elif [[ "${{ matrix.os }}" == "macos-15" ]]; then
-           export MACOSX_DEPLOYMENT_TARGET=15.0
-         fi
+         export MACOSX_DEPLOYMENT_TARGET=11.0
          uv build --wheel --python ${{ matrix.python }} --find-links ${GITHUB_WORKSPACE}/packages/leann-core/dist
        else
          uv build --wheel --python ${{ matrix.python }} --find-links ${GITHUB_WORKSPACE}/packages/leann-core/dist
@@ -178,14 +161,7 @@ jobs:
          export CC=clang
          export CXX=clang++
          # DiskANN requires macOS 13.3+ for sgesdd_ LAPACK function
-         # But Homebrew libraries on each macOS version require matching minimum version
-         if [[ "${{ matrix.os }}" == "macos-13" ]]; then
-           export MACOSX_DEPLOYMENT_TARGET=13.3
-         elif [[ "${{ matrix.os }}" == "macos-14" ]]; then
-           export MACOSX_DEPLOYMENT_TARGET=14.0
-         elif [[ "${{ matrix.os }}" == "macos-15" ]]; then
-           export MACOSX_DEPLOYMENT_TARGET=15.0
-         fi
+         export MACOSX_DEPLOYMENT_TARGET=13.3
          uv build --wheel --python ${{ matrix.python }} --find-links ${GITHUB_WORKSPACE}/packages/leann-core/dist
        else
          uv build --wheel --python ${{ matrix.python }} --find-links ${GITHUB_WORKSPACE}/packages/leann-core/dist
@@ -221,24 +197,10 @@ jobs:
      - name: Repair wheels (macOS)
        if: runner.os == 'macOS'
        run: |
-         # Determine deployment target based on runner OS
-         # Must match the Homebrew libraries for each macOS version
-         if [[ "${{ matrix.os }}" == "macos-13" ]]; then
-           HNSW_TARGET="13.0"
-           DISKANN_TARGET="13.3"
-         elif [[ "${{ matrix.os }}" == "macos-14" ]]; then
-           HNSW_TARGET="14.0"
-           DISKANN_TARGET="14.0"
-         elif [[ "${{ matrix.os }}" == "macos-15" ]]; then
-           HNSW_TARGET="15.0"
-           DISKANN_TARGET="15.0"
-         fi
          # Repair HNSW wheel
          cd packages/leann-backend-hnsw
          if [ -d dist ]; then
-           export MACOSX_DEPLOYMENT_TARGET=$HNSW_TARGET
-           delocate-wheel -w dist_repaired -v --require-target-macos-version $HNSW_TARGET dist/*.whl
+           delocate-wheel -w dist_repaired -v dist/*.whl
            rm -rf dist
            mv dist_repaired dist
          fi
@@ -247,8 +209,7 @@ jobs:
          # Repair DiskANN wheel
          cd packages/leann-backend-diskann
          if [ -d dist ]; then
-           export MACOSX_DEPLOYMENT_TARGET=$DISKANN_TARGET
-           delocate-wheel -w dist_repaired -v --require-target-macos-version $DISKANN_TARGET dist/*.whl
+           delocate-wheel -w dist_repaired -v dist/*.whl
            rm -rf dist
            mv dist_repaired dist
          fi
@@ -288,8 +249,8 @@ jobs:
          # Activate virtual environment
          source .venv/bin/activate || source .venv/Scripts/activate
-         # Run tests
-         pytest -v tests/
+         # Run all tests
+         pytest tests/
      - name: Run sanity checks (optional)
        run: |

View File

@@ -468,7 +468,7 @@ leann --help
### Usage Examples
```bash
-# build from a specific directory, and my_docs is the index name
+# build from a specific directory, and my-docs is the index name (you can also build from multiple directories or files)
leann build my-docs --docs ./your_documents
# Search your documents

View File

@@ -5,6 +5,7 @@ from typing import Union
from llama_index.core import SimpleDirectoryReader
from llama_index.core.node_parser import SentenceSplitter
+from tqdm import tqdm

from .api import LeannBuilder, LeannChat, LeannSearcher
@@ -75,7 +76,10 @@ class LeannCLI:
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
-  leann build my-docs --docs ./documents                      # Build index named my-docs
+  leann build my-docs --docs ./documents                      # Build index from directory
+  leann build my-code --docs ./src ./tests ./config           # Build index from multiple directories
+  leann build my-files --docs ./file1.py ./file2.txt ./docs/  # Build index from files and directories
+  leann build my-mixed --docs ./readme.md ./src/ ./config.json  # Build index from mixed files/dirs
  leann build my-ppts --docs ./ --file-types .pptx,.pdf  # Index only PowerPoint and PDF files
  leann search my-docs "query"                           # Search in my-docs index
  leann ask my-docs "question"                           # Ask my-docs index
@@ -91,7 +95,11 @@ Examples:
"index_name", nargs="?", help="Index name (default: current directory name)" "index_name", nargs="?", help="Index name (default: current directory name)"
) )
build_parser.add_argument( build_parser.add_argument(
"--docs", type=str, default=".", help="Documents directory (default: current directory)" "--docs",
type=str,
nargs="+",
default=["."],
help="Documents directories and/or files (default: current directory)",
) )
build_parser.add_argument( build_parser.add_argument(
"--backend", type=str, default="hnsw", choices=["hnsw", "diskann"] "--backend", type=str, default="hnsw", choices=["hnsw", "diskann"]
@@ -235,6 +243,32 @@ Examples:
"""Check if a file should be excluded using gitignore parser.""" """Check if a file should be excluded using gitignore parser."""
return gitignore_matches(str(relative_path)) return gitignore_matches(str(relative_path))
def _is_git_submodule(self, path: Path) -> bool:
"""Check if a path is a git submodule."""
try:
# Find the git repo root
current_dir = Path.cwd()
while current_dir != current_dir.parent:
if (current_dir / ".git").exists():
gitmodules_path = current_dir / ".gitmodules"
if gitmodules_path.exists():
# Read .gitmodules to check if this path is a submodule
gitmodules_content = gitmodules_path.read_text()
# Convert path to relative to git root
try:
relative_path = path.resolve().relative_to(current_dir)
# Check if this path appears in .gitmodules
return f"path = {relative_path}" in gitmodules_content
except ValueError:
# Path is not under git root
return False
break
current_dir = current_dir.parent
return False
except Exception:
# If anything goes wrong, assume it's not a submodule
return False
def list_indexes(self): def list_indexes(self):
print("Stored LEANN indexes:") print("Stored LEANN indexes:")
@@ -264,7 +298,9 @@ Examples:
                valid_projects.append(current_path)

        if not valid_projects:
-           print("No indexes found. Use 'leann build <name> --docs <dir>' to create one.")
+           print(
+               "No indexes found. Use 'leann build <name> --docs <dir> [<dir2> ...]' to create one."
+           )
            return

        total_indexes = 0
@@ -311,56 +347,88 @@ Examples:
            print(f'  leann search {example_name} "your query"')
            print(f"  leann ask {example_name} --interactive")

-   def load_documents(self, docs_dir: str, custom_file_types: Union[str, None] = None):
-       print(f"Loading documents from {docs_dir}...")
+   def load_documents(
+       self, docs_paths: Union[str, list], custom_file_types: Union[str, None] = None
+   ):
+       # Handle both single path (string) and multiple paths (list) for backward compatibility
+       if isinstance(docs_paths, str):
+           docs_paths = [docs_paths]
+
+       # Separate files and directories
+       files = []
+       directories = []
+       for path in docs_paths:
+           path_obj = Path(path)
+           if path_obj.is_file():
+               files.append(str(path_obj))
+           elif path_obj.is_dir():
+               # Check if this is a git submodule - if so, skip it
+               if self._is_git_submodule(path_obj):
+                   print(f"⚠️ Skipping git submodule: {path}")
+                   continue
+               directories.append(str(path_obj))
+           else:
+               print(f"⚠️ Warning: Path '{path}' does not exist, skipping...")
+               continue
+
+       # Print summary of what we're processing
+       total_items = len(files) + len(directories)
+       items_desc = []
+       if files:
+           items_desc.append(f"{len(files)} file{'s' if len(files) > 1 else ''}")
+       if directories:
+           items_desc.append(
+               f"{len(directories)} director{'ies' if len(directories) > 1 else 'y'}"
+           )
+       print(f"Loading documents from {' and '.join(items_desc)} ({total_items} total):")
+       if files:
+           print(f"  📄 Files: {', '.join([Path(f).name for f in files])}")
+       if directories:
+           print(f"  📁 Directories: {', '.join(directories)}")

        if custom_file_types:
            print(f"Using custom file types: {custom_file_types}")

-       # Build gitignore parser
-       gitignore_matches = self._build_gitignore_parser(docs_dir)
-
-       # Try to use better PDF parsers first, but only if PDFs are requested
-       documents = []
-       docs_path = Path(docs_dir)
-
-       # Check if we should process PDFs
-       should_process_pdfs = custom_file_types is None or ".pdf" in custom_file_types
-       if should_process_pdfs:
-           for file_path in docs_path.rglob("*.pdf"):
-               # Check if file matches any exclude pattern
-               relative_path = file_path.relative_to(docs_path)
-               if self._should_exclude_file(relative_path, gitignore_matches):
-                   continue
-               print(f"Processing PDF: {file_path}")
-               # Try PyMuPDF first (best quality)
-               text = extract_pdf_text_with_pymupdf(str(file_path))
-               if text is None:
-                   # Try pdfplumber
-                   text = extract_pdf_text_with_pdfplumber(str(file_path))
-               if text:
-                   # Create a simple document structure
-                   from llama_index.core import Document
-
-                   doc = Document(text=text, metadata={"source": str(file_path)})
-                   documents.append(doc)
-               else:
-                   # Fallback to default reader
-                   print(f"Using default reader for {file_path}")
-                   try:
-                       default_docs = SimpleDirectoryReader(
-                           str(file_path.parent),
-                           filename_as_id=True,
-                           required_exts=[file_path.suffix],
-                       ).load_data()
-                       documents.extend(default_docs)
-                   except Exception as e:
-                       print(f"Warning: Could not process {file_path}: {e}")
-
-       # Load other file types with default reader
+       all_documents = []
+
+       # First, process individual files if any
+       if files:
+           print(f"\n🔄 Processing {len(files)} individual file{'s' if len(files) > 1 else ''}...")
+           # Load individual files using SimpleDirectoryReader with input_files
+           # Note: We skip gitignore filtering for explicitly specified files
+           try:
+               # Group files by their parent directory for efficient loading
+               from collections import defaultdict
+
+               files_by_dir = defaultdict(list)
+               for file_path in files:
+                   parent_dir = str(Path(file_path).parent)
+                   files_by_dir[parent_dir].append(file_path)
+
+               # Load files from each parent directory
+               for parent_dir, file_list in files_by_dir.items():
+                   print(
+                       f"  Loading {len(file_list)} file{'s' if len(file_list) > 1 else ''} from {parent_dir}"
+                   )
+                   try:
+                       file_docs = SimpleDirectoryReader(
+                           parent_dir,
+                           input_files=file_list,
+                           filename_as_id=True,
+                       ).load_data()
+                       all_documents.extend(file_docs)
+                       print(
+                           f"  ✅ Loaded {len(file_docs)} document{'s' if len(file_docs) > 1 else ''}"
+                       )
+                   except Exception as e:
+                       print(f"  ❌ Warning: Could not load files from {parent_dir}: {e}")
+           except Exception as e:
+               print(f"❌ Error processing individual files: {e}")
+
+       # Define file extensions to process
        if custom_file_types:
            # Parse custom file types from comma-separated string
            code_extensions = [ext.strip() for ext in custom_file_types.split(",") if ext.strip()]
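For the file-handling branch above: explicitly listed files are grouped by parent directory before being handed to the reader, so each directory is opened once. A stdlib-only sketch of that grouping step, with hypothetical paths:

```python
from collections import defaultdict
from pathlib import Path

files = ["./README.md", "./src/app.py", "./src/util.py"]  # hypothetical inputs

# Group files by parent directory, mirroring the loop above.
files_by_dir = defaultdict(list)
for file_path in files:
    files_by_dir[str(Path(file_path).parent)].append(file_path)

for parent_dir, file_list in files_by_dir.items():
    print(parent_dir, file_list)
# .   ['./README.md']
# src ['./src/app.py', './src/util.py']
```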
@@ -422,10 +490,70 @@ Examples:
".py", ".py",
".jl", ".jl",
] ]
# Try to load other file types, but don't fail if none are found
# Process each directory
if directories:
print(
f"\n🔄 Processing {len(directories)} director{'ies' if len(directories) > 1 else 'y'}..."
)
for docs_dir in directories:
print(f"Processing directory: {docs_dir}")
# Build gitignore parser for each directory
gitignore_matches = self._build_gitignore_parser(docs_dir)
# Try to use better PDF parsers first, but only if PDFs are requested
documents = []
docs_path = Path(docs_dir)
# Check if we should process PDFs
should_process_pdfs = custom_file_types is None or ".pdf" in custom_file_types
if should_process_pdfs:
for file_path in docs_path.rglob("*.pdf"):
# Check if file matches any exclude pattern
try:
relative_path = file_path.relative_to(docs_path)
if self._should_exclude_file(relative_path, gitignore_matches):
continue
except ValueError:
# Skip files that can't be made relative to docs_path
print(f"⚠️ Skipping file outside directory scope: {file_path}")
continue
print(f"Processing PDF: {file_path}")
# Try PyMuPDF first (best quality)
text = extract_pdf_text_with_pymupdf(str(file_path))
if text is None:
# Try pdfplumber
text = extract_pdf_text_with_pdfplumber(str(file_path))
if text:
# Create a simple document structure
from llama_index.core import Document
doc = Document(text=text, metadata={"source": str(file_path)})
documents.append(doc)
else:
# Fallback to default reader
print(f"Using default reader for {file_path}")
try:
default_docs = SimpleDirectoryReader(
str(file_path.parent),
filename_as_id=True,
required_exts=[file_path.suffix],
).load_data()
documents.extend(default_docs)
except Exception as e:
print(f"Warning: Could not process {file_path}: {e}")
# Load other file types with default reader
try: try:
# Create a custom file filter function using our PathSpec # Create a custom file filter function using our PathSpec
def file_filter(file_path: str) -> bool: def file_filter(
file_path: str, docs_dir=docs_dir, gitignore_matches=gitignore_matches
) -> bool:
"""Return True if file should be included (not excluded)""" """Return True if file should be included (not excluded)"""
try: try:
docs_path_obj = Path(docs_dir) docs_path_obj = Path(docs_dir)
@@ -454,10 +582,15 @@ Examples:
                documents.extend(filtered_docs)
            except ValueError as e:
                if "No files found" in str(e):
-                   print("No additional files found for other supported types.")
+                   print(f"No additional files found for other supported types in {docs_dir}.")
                else:
                    raise e

+           all_documents.extend(documents)
+           print(f"Loaded {len(documents)} documents from {docs_dir}")
+
+       documents = all_documents

        all_texts = []
        # Define code file extensions for intelligent chunking
@@ -507,7 +640,9 @@ Examples:
".jl", ".jl",
} }
for doc in documents: print("start chunking documents")
# Add progress bar for document chunking
for doc in tqdm(documents, desc="Chunking documents", unit="doc"):
# Check if this is a code file based on source path # Check if this is a code file based on source path
source_path = doc.metadata.get("source", "") source_path = doc.metadata.get("source", "")
is_code_file = any(source_path.endswith(ext) for ext in code_file_exts) is_code_file = any(source_path.endswith(ext) for ext in code_file_exts)
@@ -523,7 +658,7 @@ Examples:
        return all_texts

    async def build_index(self, args):
-       docs_dir = args.docs
+       docs_paths = args.docs

        # Use current directory name if index_name not provided
        if args.index_name:
            index_name = args.index_name
@@ -534,13 +669,25 @@ Examples:
        index_dir = self.indexes_dir / index_name
        index_path = self.get_index_path(index_name)

-       print(f"📂 Indexing: {Path(docs_dir).resolve()}")
+       # Display all paths being indexed with file/directory distinction
+       files = [p for p in docs_paths if Path(p).is_file()]
+       directories = [p for p in docs_paths if Path(p).is_dir()]
+
+       print(f"📂 Indexing {len(docs_paths)} path{'s' if len(docs_paths) > 1 else ''}:")
+       if files:
+           print(f"  📄 Files ({len(files)}):")
+           for i, file_path in enumerate(files, 1):
+               print(f"    {i}. {Path(file_path).resolve()}")
+       if directories:
+           print(f"  📁 Directories ({len(directories)}):")
+           for i, dir_path in enumerate(directories, 1):
+               print(f"    {i}. {Path(dir_path).resolve()}")

        if index_dir.exists() and not args.force:
            print(f"Index '{index_name}' already exists. Use --force to rebuild.")
            return

-       all_texts = self.load_documents(docs_dir, args.file_types)
+       all_texts = self.load_documents(docs_paths, args.file_types)
        if not all_texts:
            print("No documents found")
            return
@@ -576,7 +723,7 @@ Examples:
        if not self.index_exists(index_name):
            print(
-               f"Index '{index_name}' not found. Use 'leann build {index_name} --docs <dir>' to create it."
+               f"Index '{index_name}' not found. Use 'leann build {index_name} --docs <dir> [<dir2> ...]' to create it."
            )
            return
@@ -603,7 +750,7 @@ Examples:
        if not self.index_exists(index_name):
            print(
-               f"Index '{index_name}' not found. Use 'leann build {index_name} --docs <dir>' to create it."
+               f"Index '{index_name}' not found. Use 'leann build {index_name} --docs <dir> [<dir2> ...]' to create it."
            )
            return

View File

@@ -6,7 +6,6 @@ Preserves all optimization parameters to ensure performance
import logging
import os
-from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Any

import numpy as np
@@ -374,7 +373,9 @@ def compute_embeddings_ollama(
    texts: list[str], model_name: str, is_build: bool = False, host: str = "http://localhost:11434"
) -> np.ndarray:
    """
-   Compute embeddings using Ollama API.
+   Compute embeddings using Ollama API with simplified batch processing.
+
+   Uses batch size of 32 for MPS/CPU and 128 for CUDA to optimize performance.

    Args:
        texts: List of texts to compute embeddings for
@@ -438,12 +439,19 @@ def compute_embeddings_ollama(
        if any(emb in base_name for emb in ["embed", "bge", "minilm", "e5"]):
            embedding_models.append(model)

-   # Check if model exists (handle versioned names)
-   model_found = any(
-       model_name == name.split(":")[0] or model_name == name for name in model_names
-   )
+   # Check if model exists (handle versioned names) and resolve to full name
+   resolved_model_name = None
+   for name in model_names:
+       # Exact match
+       if model_name == name:
+           resolved_model_name = name
+           break
+       # Match without version tag (use the versioned name)
+       elif model_name == name.split(":")[0]:
+           resolved_model_name = name
+           break

-   if not model_found:
+   if not resolved_model_name:
        error_msg = f"❌ Model '{model_name}' not found in local Ollama.\n\n"

        # Suggest pulling the model
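To make the new resolution rule concrete: an exact match on a local model name wins, and a bare name falls back to its versioned entry. A standalone sketch (the local model names here are made up for illustration):

```python
# Sketch of the resolution loop above, outside the Ollama client code.
local_models = ["nomic-embed-text:latest", "mxbai-embed-large:335m"]

def resolve(model_name):
    for name in local_models:
        # Exact match wins; otherwise a bare name picks up its version tag.
        if model_name == name or model_name == name.split(":")[0]:
            return name
    return None

print(resolve("nomic-embed-text"))        # nomic-embed-text:latest
print(resolve("mxbai-embed-large:335m"))  # mxbai-embed-large:335m
print(resolve("missing-model"))           # None -> caller raises ValueError
```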
@@ -465,6 +473,11 @@ def compute_embeddings_ollama(
error_msg += "\n📚 Browse more: https://ollama.com/library" error_msg += "\n📚 Browse more: https://ollama.com/library"
raise ValueError(error_msg) raise ValueError(error_msg)
# Use the resolved model name for all subsequent operations
if resolved_model_name != model_name:
logger.info(f"Resolved model name '{model_name}' to '{resolved_model_name}'")
model_name = resolved_model_name
# Verify the model supports embeddings by testing it # Verify the model supports embeddings by testing it
try: try:
test_response = requests.post( test_response = requests.post(
@@ -485,18 +498,33 @@ def compute_embeddings_ollama(
    except requests.exceptions.RequestException as e:
        logger.warning(f"Could not verify model existence: {e}")

-   # Process embeddings with optimized concurrent processing
-   import requests
-
-   def get_single_embedding(text_idx_tuple):
-       """Helper function to get embedding for a single text."""
-       text, idx = text_idx_tuple
+   # Determine batch size based on device availability
+   # Check for CUDA/MPS availability using torch if available
+   batch_size = 32  # Default for MPS/CPU
+   try:
+       import torch
+
+       if torch.cuda.is_available():
+           batch_size = 128  # CUDA gets larger batch size
+       elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
+           batch_size = 32  # MPS gets smaller batch size
+   except ImportError:
+       # If torch is not available, use conservative batch size
+       batch_size = 32
+
+   logger.info(f"Using batch size: {batch_size}")
+
+   def get_batch_embeddings(batch_texts):
+       """Get embeddings for a batch of texts."""
+       all_embeddings = []
+       failed_indices = []
+
+       for i, text in enumerate(batch_texts):
            max_retries = 3
            retry_count = 0

            # Truncate very long texts to avoid API issues
            truncated_text = text[:8000] if len(text) > 8000 else text

            while retry_count < max_retries:
                try:
                    response = requests.post(
@@ -510,115 +538,87 @@ def compute_embeddings_ollama(
                    embedding = result.get("embedding")
                    if embedding is None:
-                       raise ValueError(f"No embedding returned for text {idx}")
-                   return idx, embedding
+                       raise ValueError(f"No embedding returned for text {i}")
+                   if not isinstance(embedding, list) or len(embedding) == 0:
+                       raise ValueError(f"Invalid embedding format for text {i}")
+                   all_embeddings.append(embedding)
+                   break
                except requests.exceptions.Timeout:
                    retry_count += 1
                    if retry_count >= max_retries:
-                       logger.warning(f"Timeout for text {idx} after {max_retries} retries")
-                       return idx, None
+                       logger.warning(f"Timeout for text {i} after {max_retries} retries")
+                       failed_indices.append(i)
+                       all_embeddings.append(None)
+                       break
                except Exception as e:
-                   if retry_count >= max_retries - 1:
-                       logger.error(f"Failed to get embedding for text {idx}: {e}")
-                       return idx, None
                    retry_count += 1
+                   if retry_count >= max_retries:
+                       logger.error(f"Failed to get embedding for text {i}: {e}")
+                       failed_indices.append(i)
+                       all_embeddings.append(None)
+                       break

-       return idx, None
+       return all_embeddings, failed_indices

-   # Determine if we should use concurrent processing
-   use_concurrent = (
-       len(texts) > 5 and not is_build
-   )  # Don't use concurrent in build mode to avoid overwhelming
-   max_workers = min(4, len(texts))  # Limit concurrent requests to avoid overwhelming Ollama
-
-   all_embeddings = [None] * len(texts)  # Pre-allocate list to maintain order
-   failed_indices = []
-
-   if use_concurrent:
-       logger.info(
-           f"Using concurrent processing with {max_workers} workers for {len(texts)} texts"
-       )
-       with ThreadPoolExecutor(max_workers=max_workers) as executor:
-           # Submit all tasks
-           future_to_idx = {
-               executor.submit(get_single_embedding, (text, idx)): idx
-               for idx, text in enumerate(texts)
-           }
-
-           # Add progress bar for concurrent processing
-           try:
-               if is_build or len(texts) > 10:
-                   from tqdm import tqdm
-
-                   futures_iterator = tqdm(
-                       as_completed(future_to_idx),
-                       total=len(texts),
-                       desc="Computing Ollama embeddings",
-                   )
-               else:
-                   futures_iterator = as_completed(future_to_idx)
-           except ImportError:
-               futures_iterator = as_completed(future_to_idx)
-
-           # Collect results as they complete
-           for future in futures_iterator:
-               try:
-                   idx, embedding = future.result()
-                   if embedding is not None:
-                       all_embeddings[idx] = embedding
-                   else:
-                       failed_indices.append(idx)
-               except Exception as e:
-                   idx = future_to_idx[future]
-                   logger.error(f"Exception for text {idx}: {e}")
-                   failed_indices.append(idx)
-   else:
-       # Sequential processing with progress bar
-       show_progress = is_build or len(texts) > 10
-       try:
-           if show_progress:
-               from tqdm import tqdm
-
-               iterator = tqdm(
-                   enumerate(texts), total=len(texts), desc="Computing Ollama embeddings"
-               )
-           else:
-               iterator = enumerate(texts)
-       except ImportError:
-           iterator = enumerate(texts)
-
-       for idx, text in iterator:
-           result_idx, embedding = get_single_embedding((text, idx))
-           if embedding is not None:
-               all_embeddings[idx] = embedding
-           else:
-               failed_indices.append(idx)
+   # Process texts in batches
+   all_embeddings = []
+   all_failed_indices = []
+
+   # Setup progress bar if needed
+   show_progress = is_build or len(texts) > 10
+   try:
+       if show_progress:
+           from tqdm import tqdm
+   except ImportError:
+       show_progress = False
+
+   # Process batches
+   num_batches = (len(texts) + batch_size - 1) // batch_size
+
+   if show_progress:
+       batch_iterator = tqdm(range(num_batches), desc="Computing Ollama embeddings")
+   else:
+       batch_iterator = range(num_batches)
+
+   for batch_idx in batch_iterator:
+       start_idx = batch_idx * batch_size
+       end_idx = min(start_idx + batch_size, len(texts))
+       batch_texts = texts[start_idx:end_idx]
+
+       batch_embeddings, batch_failed = get_batch_embeddings(batch_texts)
+
+       # Adjust failed indices to global indices
+       global_failed = [start_idx + idx for idx in batch_failed]
+       all_failed_indices.extend(global_failed)
+       all_embeddings.extend(batch_embeddings)

    # Handle failed embeddings
-   if failed_indices:
-       if len(failed_indices) == len(texts):
+   if all_failed_indices:
+       if len(all_failed_indices) == len(texts):
            raise RuntimeError("Failed to compute any embeddings")
-       logger.warning(f"Failed to compute embeddings for {len(failed_indices)}/{len(texts)} texts")
+       logger.warning(
+           f"Failed to compute embeddings for {len(all_failed_indices)}/{len(texts)} texts"
+       )
        # Use zero embeddings as fallback for failed ones
        valid_embedding = next((e for e in all_embeddings if e is not None), None)
        if valid_embedding:
            embedding_dim = len(valid_embedding)
-           for idx in failed_indices:
-               all_embeddings[idx] = [0.0] * embedding_dim
+           for i, embedding in enumerate(all_embeddings):
+               if embedding is None:
+                   all_embeddings[i] = [0.0] * embedding_dim

-   # Remove None values and convert to numpy array
+   # Remove None values
    all_embeddings = [e for e in all_embeddings if e is not None]

-   # Validate embedding dimensions before creating numpy array
-   if all_embeddings:
-       expected_dim = len(all_embeddings[0])
-       inconsistent_dims = []
-       for i, embedding in enumerate(all_embeddings):
+   if not all_embeddings:
+       raise RuntimeError("No valid embeddings were computed")
+
+   # Validate embedding dimensions
+   expected_dim = len(all_embeddings[0])
+   inconsistent_dims = []
+   for i, embedding in enumerate(all_embeddings):
@@ -631,9 +631,7 @@ def compute_embeddings_ollama(
error_msg += f" - Text {idx}: {dim} dimensions\n" error_msg += f" - Text {idx}: {dim} dimensions\n"
if len(inconsistent_dims) > 10: if len(inconsistent_dims) > 10:
error_msg += f" ... and {len(inconsistent_dims) - 10} more\n" error_msg += f" ... and {len(inconsistent_dims) - 10} more\n"
error_msg += ( error_msg += f"\nThis is likely an Ollama API bug with model '{model_name}'. Please try:\n"
f"\nThis is likely an Ollama API bug with model '{model_name}'. Please try:\n"
)
error_msg += "1. Restart Ollama service: 'ollama serve'\n" error_msg += "1. Restart Ollama service: 'ollama serve'\n"
error_msg += f"2. Re-pull the model: 'ollama pull {model_name}'\n" error_msg += f"2. Re-pull the model: 'ollama pull {model_name}'\n"
error_msg += ( error_msg += (
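As a sanity check on the batching arithmetic introduced in this file: `(len(texts) + batch_size - 1) // batch_size` is integer ceiling division, so every text lands in exactly one batch and the final batch is simply shorter. A minimal sketch with made-up sizes:

```python
# Ceiling-division batching as used in the new Ollama code path.
texts = [f"text {i}" for i in range(70)]  # hypothetical corpus size
batch_size = 32  # MPS/CPU default above; 128 when CUDA is available

num_batches = (len(texts) + batch_size - 1) // batch_size  # ceil(70 / 32) = 3
for batch_idx in range(num_batches):
    start = batch_idx * batch_size
    end = min(start + batch_size, len(texts))
    print(batch_idx, end - start)  # batches of 32, 32, and 6 texts
```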

View File

@@ -45,6 +45,42 @@ leann build my-project --docs ./
claude
```

+## 🚀 Advanced Usage Examples
+
+### Index Entire Git Repository
+
+```bash
+# Index all tracked files in your git repository (submodules are currently skipped, but support for them is easy to add back)
+leann build my-repo --docs $(git ls-files) --embedding-mode sentence-transformers --embedding-model all-MiniLM-L6-v2 --backend hnsw
+
+# Index only specific file types from git
+leann build my-python-code --docs $(git ls-files "*.py") --embedding-mode sentence-transformers --embedding-model all-MiniLM-L6-v2 --backend hnsw
+```
+
+### Multiple Directories and Files
+
+```bash
+# Index multiple directories
+leann build my-codebase --docs ./src ./tests ./docs ./config --embedding-mode sentence-transformers --embedding-model all-MiniLM-L6-v2 --backend hnsw
+
+# Mix files and directories
+leann build my-project --docs ./README.md ./src/ ./package.json ./docs/ --embedding-mode sentence-transformers --embedding-model all-MiniLM-L6-v2 --backend hnsw
+
+# Specific files only
+leann build my-configs --docs ./tsconfig.json ./package.json ./webpack.config.js --embedding-mode sentence-transformers --embedding-model all-MiniLM-L6-v2 --backend hnsw
+```
+
+### Advanced Git Integration
+
+```bash
+# Index recently modified files
+leann build recent-changes --docs $(git diff --name-only HEAD~10..HEAD) --embedding-mode sentence-transformers --embedding-model all-MiniLM-L6-v2 --backend hnsw
+
+# Index files matching a pattern
+leann build frontend --docs $(git ls-files "*.tsx" "*.ts" "*.jsx" "*.js") --embedding-mode sentence-transformers --embedding-model all-MiniLM-L6-v2 --backend hnsw
+
+# Index documentation and config files
+leann build docs-and-configs --docs $(git ls-files "*.md" "*.yml" "*.yaml" "*.json" "*.toml") --embedding-mode sentence-transformers --embedding-model all-MiniLM-L6-v2 --backend hnsw
+```
+
**Try this in Claude Code:**
```
Help me understand this codebase. List available indexes and search for authentication patterns.