Compare commits
2 Commits
add-macos1
...
feat/multi
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
2c3824e7b6 | ||
|
|
b2390ccc14 |
@@ -468,7 +468,7 @@ leann --help
|
|||||||
### Usage Examples
|
### Usage Examples
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# build from a specific directory, and my_docs is the index name
|
# Build from a specific directory; my-docs is the index name (you can also pass multiple directories and/or files)
|
||||||
leann build my-docs --docs ./your_documents
|
leann build my-docs --docs ./your_documents
|
||||||
|
|
||||||
# Search your documents
|
# Search your documents
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ from typing import Union
|
|||||||
|
|
||||||
from llama_index.core import SimpleDirectoryReader
|
from llama_index.core import SimpleDirectoryReader
|
||||||
from llama_index.core.node_parser import SentenceSplitter
|
from llama_index.core.node_parser import SentenceSplitter
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
from .api import LeannBuilder, LeannChat, LeannSearcher
|
from .api import LeannBuilder, LeannChat, LeannSearcher
|
||||||
|
|
||||||
@@ -75,11 +76,14 @@ class LeannCLI:
|
|||||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||||
epilog="""
|
epilog="""
|
||||||
Examples:
|
Examples:
|
||||||
leann build my-docs --docs ./documents # Build index named my-docs
|
leann build my-docs --docs ./documents # Build index from directory
|
||||||
leann build my-ppts --docs ./ --file-types .pptx,.pdf # Index only PowerPoint and PDF files
|
leann build my-code --docs ./src ./tests ./config # Build index from multiple directories
|
||||||
leann search my-docs "query" # Search in my-docs index
|
leann build my-files --docs ./file1.py ./file2.txt ./docs/ # Build index from files and directories
|
||||||
leann ask my-docs "question" # Ask my-docs index
|
leann build my-mixed --docs ./readme.md ./src/ ./config.json # Build index from mixed files/dirs
|
||||||
leann list # List all stored indexes
|
leann build my-ppts --docs ./ --file-types .pptx,.pdf # Index only PowerPoint and PDF files
|
||||||
|
leann search my-docs "query" # Search in my-docs index
|
||||||
|
leann ask my-docs "question" # Ask my-docs index
|
||||||
|
leann list # List all stored indexes
|
||||||
""",
|
""",
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -91,7 +95,11 @@ Examples:
|
|||||||
"index_name", nargs="?", help="Index name (default: current directory name)"
|
"index_name", nargs="?", help="Index name (default: current directory name)"
|
||||||
)
|
)
|
||||||
build_parser.add_argument(
|
build_parser.add_argument(
|
||||||
"--docs", type=str, default=".", help="Documents directory (default: current directory)"
|
"--docs",
|
||||||
|
type=str,
|
||||||
|
nargs="+",
|
||||||
|
default=["."],
|
||||||
|
help="Documents directories and/or files (default: current directory)",
|
||||||
)
|
)
|
||||||
build_parser.add_argument(
|
build_parser.add_argument(
|
||||||
"--backend", type=str, default="hnsw", choices=["hnsw", "diskann"]
|
"--backend", type=str, default="hnsw", choices=["hnsw", "diskann"]
|
||||||
@@ -235,6 +243,32 @@ Examples:
|
|||||||
"""Check if a file should be excluded using gitignore parser."""
|
"""Check if a file should be excluded using gitignore parser."""
|
||||||
return gitignore_matches(str(relative_path))
|
return gitignore_matches(str(relative_path))
|
||||||
|
|
||||||
|
def _is_git_submodule(self, path: Path) -> bool:
    """Return True if *path* is registered as a submodule of the enclosing git repo.

    The repository root is located by walking upward from the current working
    directory until a ``.git`` entry is found. *path* is then compared against
    the ``path = ...`` entries of that repository's ``.gitmodules`` file.

    Args:
        path: Directory to test; may be relative or absolute.

    Returns:
        True only when a ``.gitmodules`` entry names exactly this path
        (relative to the repository root). Every failure mode -- no repository
        above the working directory, no ``.gitmodules``, *path* outside the
        repository, unreadable file -- yields False, so detection stays
        best-effort and never aborts document loading.
    """
    try:
        cwd = Path.cwd()
        # First ancestor (cwd included) that holds a .git entry is the repo root.
        repo_root = next(
            (candidate for candidate in (cwd, *cwd.parents) if (candidate / ".git").exists()),
            None,
        )
        if repo_root is None:
            return False

        gitmodules_path = repo_root / ".gitmodules"
        if not gitmodules_path.exists():
            return False

        # relative_to() raises ValueError when path is not under the repo root.
        # as_posix() gives forward slashes, which is how .gitmodules stores paths
        # on every platform.
        target = path.resolve().relative_to(repo_root).as_posix()

        for raw_line in gitmodules_path.read_text(encoding="utf-8").splitlines():
            key, separator, value = raw_line.partition("=")
            if not separator or key.strip().lower() != "path":
                continue
            # Compare whole values rather than substrings, so "libs/foo" does
            # not match an entry for "libs/foobar" and "path=libs/foo" (no
            # spaces) is still recognised.
            declared = value.strip().strip('"').rstrip("/")
            if declared == target:
                return True
        return False
    except (OSError, RuntimeError, ValueError):
        # Unreadable/undecodable file, unresolvable path, or path outside the
        # repository: assume it is not a submodule.
        return False
|
||||||
|
|
||||||
def list_indexes(self):
|
def list_indexes(self):
|
||||||
print("Stored LEANN indexes:")
|
print("Stored LEANN indexes:")
|
||||||
|
|
||||||
@@ -264,7 +298,9 @@ Examples:
|
|||||||
valid_projects.append(current_path)
|
valid_projects.append(current_path)
|
||||||
|
|
||||||
if not valid_projects:
|
if not valid_projects:
|
||||||
print("No indexes found. Use 'leann build <name> --docs <dir>' to create one.")
|
print(
|
||||||
|
"No indexes found. Use 'leann build <name> --docs <dir> [<dir2> ...]' to create one."
|
||||||
|
)
|
||||||
return
|
return
|
||||||
|
|
||||||
total_indexes = 0
|
total_indexes = 0
|
||||||
@@ -311,56 +347,88 @@ Examples:
|
|||||||
print(f' leann search {example_name} "your query"')
|
print(f' leann search {example_name} "your query"')
|
||||||
print(f" leann ask {example_name} --interactive")
|
print(f" leann ask {example_name} --interactive")
|
||||||
|
|
||||||
def load_documents(self, docs_dir: str, custom_file_types: Union[str, None] = None):
|
def load_documents(
|
||||||
print(f"Loading documents from {docs_dir}...")
|
self, docs_paths: Union[str, list], custom_file_types: Union[str, None] = None
|
||||||
|
):
|
||||||
|
# Handle both single path (string) and multiple paths (list) for backward compatibility
|
||||||
|
if isinstance(docs_paths, str):
|
||||||
|
docs_paths = [docs_paths]
|
||||||
|
|
||||||
|
# Separate files and directories
|
||||||
|
files = []
|
||||||
|
directories = []
|
||||||
|
for path in docs_paths:
|
||||||
|
path_obj = Path(path)
|
||||||
|
if path_obj.is_file():
|
||||||
|
files.append(str(path_obj))
|
||||||
|
elif path_obj.is_dir():
|
||||||
|
# Check if this is a git submodule - if so, skip it
|
||||||
|
if self._is_git_submodule(path_obj):
|
||||||
|
print(f"⚠️ Skipping git submodule: {path}")
|
||||||
|
continue
|
||||||
|
directories.append(str(path_obj))
|
||||||
|
else:
|
||||||
|
print(f"⚠️ Warning: Path '{path}' does not exist, skipping...")
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Print summary of what we're processing
|
||||||
|
total_items = len(files) + len(directories)
|
||||||
|
items_desc = []
|
||||||
|
if files:
|
||||||
|
items_desc.append(f"{len(files)} file{'s' if len(files) > 1 else ''}")
|
||||||
|
if directories:
|
||||||
|
items_desc.append(
|
||||||
|
f"{len(directories)} director{'ies' if len(directories) > 1 else 'y'}"
|
||||||
|
)
|
||||||
|
|
||||||
|
print(f"Loading documents from {' and '.join(items_desc)} ({total_items} total):")
|
||||||
|
if files:
|
||||||
|
print(f" 📄 Files: {', '.join([Path(f).name for f in files])}")
|
||||||
|
if directories:
|
||||||
|
print(f" 📁 Directories: {', '.join(directories)}")
|
||||||
|
|
||||||
if custom_file_types:
|
if custom_file_types:
|
||||||
print(f"Using custom file types: {custom_file_types}")
|
print(f"Using custom file types: {custom_file_types}")
|
||||||
|
|
||||||
# Build gitignore parser
|
all_documents = []
|
||||||
gitignore_matches = self._build_gitignore_parser(docs_dir)
|
|
||||||
|
|
||||||
# Try to use better PDF parsers first, but only if PDFs are requested
|
# First, process individual files if any
|
||||||
documents = []
|
if files:
|
||||||
docs_path = Path(docs_dir)
|
print(f"\n🔄 Processing {len(files)} individual file{'s' if len(files) > 1 else ''}...")
|
||||||
|
|
||||||
# Check if we should process PDFs
|
# Load individual files using SimpleDirectoryReader with input_files
|
||||||
should_process_pdfs = custom_file_types is None or ".pdf" in custom_file_types
|
# Note: We skip gitignore filtering for explicitly specified files
|
||||||
|
try:
|
||||||
|
# Group files by their parent directory for efficient loading
|
||||||
|
from collections import defaultdict
|
||||||
|
|
||||||
if should_process_pdfs:
|
files_by_dir = defaultdict(list)
|
||||||
for file_path in docs_path.rglob("*.pdf"):
|
for file_path in files:
|
||||||
# Check if file matches any exclude pattern
|
parent_dir = str(Path(file_path).parent)
|
||||||
relative_path = file_path.relative_to(docs_path)
|
files_by_dir[parent_dir].append(file_path)
|
||||||
if self._should_exclude_file(relative_path, gitignore_matches):
|
|
||||||
continue
|
|
||||||
|
|
||||||
print(f"Processing PDF: {file_path}")
|
# Load files from each parent directory
|
||||||
|
for parent_dir, file_list in files_by_dir.items():
|
||||||
# Try PyMuPDF first (best quality)
|
print(
|
||||||
text = extract_pdf_text_with_pymupdf(str(file_path))
|
f" Loading {len(file_list)} file{'s' if len(file_list) > 1 else ''} from {parent_dir}"
|
||||||
if text is None:
|
)
|
||||||
# Try pdfplumber
|
|
||||||
text = extract_pdf_text_with_pdfplumber(str(file_path))
|
|
||||||
|
|
||||||
if text:
|
|
||||||
# Create a simple document structure
|
|
||||||
from llama_index.core import Document
|
|
||||||
|
|
||||||
doc = Document(text=text, metadata={"source": str(file_path)})
|
|
||||||
documents.append(doc)
|
|
||||||
else:
|
|
||||||
# Fallback to default reader
|
|
||||||
print(f"Using default reader for {file_path}")
|
|
||||||
try:
|
try:
|
||||||
default_docs = SimpleDirectoryReader(
|
file_docs = SimpleDirectoryReader(
|
||||||
str(file_path.parent),
|
parent_dir,
|
||||||
|
input_files=file_list,
|
||||||
filename_as_id=True,
|
filename_as_id=True,
|
||||||
required_exts=[file_path.suffix],
|
|
||||||
).load_data()
|
).load_data()
|
||||||
documents.extend(default_docs)
|
all_documents.extend(file_docs)
|
||||||
|
print(
|
||||||
|
f" ✅ Loaded {len(file_docs)} document{'s' if len(file_docs) > 1 else ''}"
|
||||||
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Warning: Could not process {file_path}: {e}")
|
print(f" ❌ Warning: Could not load files from {parent_dir}: {e}")
|
||||||
|
|
||||||
# Load other file types with default reader
|
except Exception as e:
|
||||||
|
print(f"❌ Error processing individual files: {e}")
|
||||||
|
|
||||||
|
# Define file extensions to process
|
||||||
if custom_file_types:
|
if custom_file_types:
|
||||||
# Parse custom file types from comma-separated string
|
# Parse custom file types from comma-separated string
|
||||||
code_extensions = [ext.strip() for ext in custom_file_types.split(",") if ext.strip()]
|
code_extensions = [ext.strip() for ext in custom_file_types.split(",") if ext.strip()]
|
||||||
@@ -422,41 +490,106 @@ Examples:
|
|||||||
".py",
|
".py",
|
||||||
".jl",
|
".jl",
|
||||||
]
|
]
|
||||||
# Try to load other file types, but don't fail if none are found
|
|
||||||
try:
|
|
||||||
# Create a custom file filter function using our PathSpec
|
|
||||||
def file_filter(file_path: str) -> bool:
|
|
||||||
"""Return True if file should be included (not excluded)"""
|
|
||||||
try:
|
|
||||||
docs_path_obj = Path(docs_dir)
|
|
||||||
file_path_obj = Path(file_path)
|
|
||||||
relative_path = file_path_obj.relative_to(docs_path_obj)
|
|
||||||
return not self._should_exclude_file(relative_path, gitignore_matches)
|
|
||||||
except (ValueError, OSError):
|
|
||||||
return True # Include files that can't be processed
|
|
||||||
|
|
||||||
other_docs = SimpleDirectoryReader(
|
# Process each directory
|
||||||
docs_dir,
|
if directories:
|
||||||
recursive=True,
|
print(
|
||||||
encoding="utf-8",
|
f"\n🔄 Processing {len(directories)} director{'ies' if len(directories) > 1 else 'y'}..."
|
||||||
required_exts=code_extensions,
|
)
|
||||||
file_extractor={}, # Use default extractors
|
|
||||||
filename_as_id=True,
|
|
||||||
).load_data(show_progress=True)
|
|
||||||
|
|
||||||
# Filter documents after loading based on gitignore rules
|
for docs_dir in directories:
|
||||||
filtered_docs = []
|
print(f"Processing directory: {docs_dir}")
|
||||||
for doc in other_docs:
|
# Build gitignore parser for each directory
|
||||||
file_path = doc.metadata.get("file_path", "")
|
gitignore_matches = self._build_gitignore_parser(docs_dir)
|
||||||
if file_filter(file_path):
|
|
||||||
filtered_docs.append(doc)
|
|
||||||
|
|
||||||
documents.extend(filtered_docs)
|
# Try to use better PDF parsers first, but only if PDFs are requested
|
||||||
except ValueError as e:
|
documents = []
|
||||||
if "No files found" in str(e):
|
docs_path = Path(docs_dir)
|
||||||
print("No additional files found for other supported types.")
|
|
||||||
else:
|
# Check if we should process PDFs
|
||||||
raise e
|
should_process_pdfs = custom_file_types is None or ".pdf" in custom_file_types
|
||||||
|
|
||||||
|
if should_process_pdfs:
|
||||||
|
for file_path in docs_path.rglob("*.pdf"):
|
||||||
|
# Check if file matches any exclude pattern
|
||||||
|
try:
|
||||||
|
relative_path = file_path.relative_to(docs_path)
|
||||||
|
if self._should_exclude_file(relative_path, gitignore_matches):
|
||||||
|
continue
|
||||||
|
except ValueError:
|
||||||
|
# Skip files that can't be made relative to docs_path
|
||||||
|
print(f"⚠️ Skipping file outside directory scope: {file_path}")
|
||||||
|
continue
|
||||||
|
|
||||||
|
print(f"Processing PDF: {file_path}")
|
||||||
|
|
||||||
|
# Try PyMuPDF first (best quality)
|
||||||
|
text = extract_pdf_text_with_pymupdf(str(file_path))
|
||||||
|
if text is None:
|
||||||
|
# Try pdfplumber
|
||||||
|
text = extract_pdf_text_with_pdfplumber(str(file_path))
|
||||||
|
|
||||||
|
if text:
|
||||||
|
# Create a simple document structure
|
||||||
|
from llama_index.core import Document
|
||||||
|
|
||||||
|
doc = Document(text=text, metadata={"source": str(file_path)})
|
||||||
|
documents.append(doc)
|
||||||
|
else:
|
||||||
|
# Fallback to default reader
|
||||||
|
print(f"Using default reader for {file_path}")
|
||||||
|
try:
|
||||||
|
default_docs = SimpleDirectoryReader(
|
||||||
|
str(file_path.parent),
|
||||||
|
filename_as_id=True,
|
||||||
|
required_exts=[file_path.suffix],
|
||||||
|
).load_data()
|
||||||
|
documents.extend(default_docs)
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Warning: Could not process {file_path}: {e}")
|
||||||
|
|
||||||
|
# Load other file types with default reader
|
||||||
|
try:
|
||||||
|
# Create a custom file filter function using our PathSpec
|
||||||
|
def file_filter(
|
||||||
|
file_path: str, docs_dir=docs_dir, gitignore_matches=gitignore_matches
|
||||||
|
) -> bool:
|
||||||
|
"""Return True if file should be included (not excluded)"""
|
||||||
|
try:
|
||||||
|
docs_path_obj = Path(docs_dir)
|
||||||
|
file_path_obj = Path(file_path)
|
||||||
|
relative_path = file_path_obj.relative_to(docs_path_obj)
|
||||||
|
return not self._should_exclude_file(relative_path, gitignore_matches)
|
||||||
|
except (ValueError, OSError):
|
||||||
|
return True # Include files that can't be processed
|
||||||
|
|
||||||
|
other_docs = SimpleDirectoryReader(
|
||||||
|
docs_dir,
|
||||||
|
recursive=True,
|
||||||
|
encoding="utf-8",
|
||||||
|
required_exts=code_extensions,
|
||||||
|
file_extractor={}, # Use default extractors
|
||||||
|
filename_as_id=True,
|
||||||
|
).load_data(show_progress=True)
|
||||||
|
|
||||||
|
# Filter documents after loading based on gitignore rules
|
||||||
|
filtered_docs = []
|
||||||
|
for doc in other_docs:
|
||||||
|
file_path = doc.metadata.get("file_path", "")
|
||||||
|
if file_filter(file_path):
|
||||||
|
filtered_docs.append(doc)
|
||||||
|
|
||||||
|
documents.extend(filtered_docs)
|
||||||
|
except ValueError as e:
|
||||||
|
if "No files found" in str(e):
|
||||||
|
print(f"No additional files found for other supported types in {docs_dir}.")
|
||||||
|
else:
|
||||||
|
raise e
|
||||||
|
|
||||||
|
all_documents.extend(documents)
|
||||||
|
print(f"Loaded {len(documents)} documents from {docs_dir}")
|
||||||
|
|
||||||
|
documents = all_documents
|
||||||
|
|
||||||
all_texts = []
|
all_texts = []
|
||||||
|
|
||||||
@@ -507,7 +640,9 @@ Examples:
|
|||||||
".jl",
|
".jl",
|
||||||
}
|
}
|
||||||
|
|
||||||
for doc in documents:
|
print("start chunking documents")
|
||||||
|
# Add progress bar for document chunking
|
||||||
|
for doc in tqdm(documents, desc="Chunking documents", unit="doc"):
|
||||||
# Check if this is a code file based on source path
|
# Check if this is a code file based on source path
|
||||||
source_path = doc.metadata.get("source", "")
|
source_path = doc.metadata.get("source", "")
|
||||||
is_code_file = any(source_path.endswith(ext) for ext in code_file_exts)
|
is_code_file = any(source_path.endswith(ext) for ext in code_file_exts)
|
||||||
@@ -523,7 +658,7 @@ Examples:
|
|||||||
return all_texts
|
return all_texts
|
||||||
|
|
||||||
async def build_index(self, args):
|
async def build_index(self, args):
|
||||||
docs_dir = args.docs
|
docs_paths = args.docs
|
||||||
# Use current directory name if index_name not provided
|
# Use current directory name if index_name not provided
|
||||||
if args.index_name:
|
if args.index_name:
|
||||||
index_name = args.index_name
|
index_name = args.index_name
|
||||||
@@ -534,13 +669,25 @@ Examples:
|
|||||||
index_dir = self.indexes_dir / index_name
|
index_dir = self.indexes_dir / index_name
|
||||||
index_path = self.get_index_path(index_name)
|
index_path = self.get_index_path(index_name)
|
||||||
|
|
||||||
print(f"📂 Indexing: {Path(docs_dir).resolve()}")
|
# Display all paths being indexed with file/directory distinction
|
||||||
|
files = [p for p in docs_paths if Path(p).is_file()]
|
||||||
|
directories = [p for p in docs_paths if Path(p).is_dir()]
|
||||||
|
|
||||||
|
print(f"📂 Indexing {len(docs_paths)} path{'s' if len(docs_paths) > 1 else ''}:")
|
||||||
|
if files:
|
||||||
|
print(f" 📄 Files ({len(files)}):")
|
||||||
|
for i, file_path in enumerate(files, 1):
|
||||||
|
print(f" {i}. {Path(file_path).resolve()}")
|
||||||
|
if directories:
|
||||||
|
print(f" 📁 Directories ({len(directories)}):")
|
||||||
|
for i, dir_path in enumerate(directories, 1):
|
||||||
|
print(f" {i}. {Path(dir_path).resolve()}")
|
||||||
|
|
||||||
if index_dir.exists() and not args.force:
|
if index_dir.exists() and not args.force:
|
||||||
print(f"Index '{index_name}' already exists. Use --force to rebuild.")
|
print(f"Index '{index_name}' already exists. Use --force to rebuild.")
|
||||||
return
|
return
|
||||||
|
|
||||||
all_texts = self.load_documents(docs_dir, args.file_types)
|
all_texts = self.load_documents(docs_paths, args.file_types)
|
||||||
if not all_texts:
|
if not all_texts:
|
||||||
print("No documents found")
|
print("No documents found")
|
||||||
return
|
return
|
||||||
@@ -576,7 +723,7 @@ Examples:
|
|||||||
|
|
||||||
if not self.index_exists(index_name):
|
if not self.index_exists(index_name):
|
||||||
print(
|
print(
|
||||||
f"Index '{index_name}' not found. Use 'leann build {index_name} --docs <dir>' to create it."
|
f"Index '{index_name}' not found. Use 'leann build {index_name} --docs <dir> [<dir2> ...]' to create it."
|
||||||
)
|
)
|
||||||
return
|
return
|
||||||
|
|
||||||
@@ -603,7 +750,7 @@ Examples:
|
|||||||
|
|
||||||
if not self.index_exists(index_name):
|
if not self.index_exists(index_name):
|
||||||
print(
|
print(
|
||||||
f"Index '{index_name}' not found. Use 'leann build {index_name} --docs <dir>' to create it."
|
f"Index '{index_name}' not found. Use 'leann build {index_name} --docs <dir> [<dir2> ...]' to create it."
|
||||||
)
|
)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|||||||
@@ -6,7 +6,6 @@ Preserves all optimization parameters to ensure performance
|
|||||||
|
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
@@ -374,7 +373,9 @@ def compute_embeddings_ollama(
|
|||||||
texts: list[str], model_name: str, is_build: bool = False, host: str = "http://localhost:11434"
|
texts: list[str], model_name: str, is_build: bool = False, host: str = "http://localhost:11434"
|
||||||
) -> np.ndarray:
|
) -> np.ndarray:
|
||||||
"""
|
"""
|
||||||
Compute embeddings using Ollama API.
|
Compute embeddings using Ollama API with simplified batch processing.
|
||||||
|
|
||||||
|
Uses batch size of 32 for MPS/CPU and 128 for CUDA to optimize performance.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
texts: List of texts to compute embeddings for
|
texts: List of texts to compute embeddings for
|
||||||
@@ -438,12 +439,19 @@ def compute_embeddings_ollama(
|
|||||||
if any(emb in base_name for emb in ["embed", "bge", "minilm", "e5"]):
|
if any(emb in base_name for emb in ["embed", "bge", "minilm", "e5"]):
|
||||||
embedding_models.append(model)
|
embedding_models.append(model)
|
||||||
|
|
||||||
# Check if model exists (handle versioned names)
|
# Check if model exists (handle versioned names) and resolve to full name
|
||||||
model_found = any(
|
resolved_model_name = None
|
||||||
model_name == name.split(":")[0] or model_name == name for name in model_names
|
for name in model_names:
|
||||||
)
|
# Exact match
|
||||||
|
if model_name == name:
|
||||||
|
resolved_model_name = name
|
||||||
|
break
|
||||||
|
# Match without version tag (use the versioned name)
|
||||||
|
elif model_name == name.split(":")[0]:
|
||||||
|
resolved_model_name = name
|
||||||
|
break
|
||||||
|
|
||||||
if not model_found:
|
if not resolved_model_name:
|
||||||
error_msg = f"❌ Model '{model_name}' not found in local Ollama.\n\n"
|
error_msg = f"❌ Model '{model_name}' not found in local Ollama.\n\n"
|
||||||
|
|
||||||
# Suggest pulling the model
|
# Suggest pulling the model
|
||||||
@@ -465,6 +473,11 @@ def compute_embeddings_ollama(
|
|||||||
error_msg += "\n📚 Browse more: https://ollama.com/library"
|
error_msg += "\n📚 Browse more: https://ollama.com/library"
|
||||||
raise ValueError(error_msg)
|
raise ValueError(error_msg)
|
||||||
|
|
||||||
|
# Use the resolved model name for all subsequent operations
|
||||||
|
if resolved_model_name != model_name:
|
||||||
|
logger.info(f"Resolved model name '{model_name}' to '{resolved_model_name}'")
|
||||||
|
model_name = resolved_model_name
|
||||||
|
|
||||||
# Verify the model supports embeddings by testing it
|
# Verify the model supports embeddings by testing it
|
||||||
try:
|
try:
|
||||||
test_response = requests.post(
|
test_response = requests.post(
|
||||||
@@ -485,162 +498,147 @@ def compute_embeddings_ollama(
|
|||||||
except requests.exceptions.RequestException as e:
|
except requests.exceptions.RequestException as e:
|
||||||
logger.warning(f"Could not verify model existence: {e}")
|
logger.warning(f"Could not verify model existence: {e}")
|
||||||
|
|
||||||
# Process embeddings with optimized concurrent processing
|
# Determine batch size based on device availability
|
||||||
import requests
|
# Check for CUDA/MPS availability using torch if available
|
||||||
|
batch_size = 32 # Default for MPS/CPU
|
||||||
|
try:
|
||||||
|
import torch
|
||||||
|
|
||||||
def get_single_embedding(text_idx_tuple):
|
if torch.cuda.is_available():
|
||||||
"""Helper function to get embedding for a single text."""
|
batch_size = 128 # CUDA gets larger batch size
|
||||||
text, idx = text_idx_tuple
|
elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
|
||||||
max_retries = 3
|
batch_size = 32 # MPS gets smaller batch size
|
||||||
retry_count = 0
|
except ImportError:
|
||||||
|
# If torch is not available, use conservative batch size
|
||||||
|
batch_size = 32
|
||||||
|
|
||||||
# Truncate very long texts to avoid API issues
|
logger.info(f"Using batch size: {batch_size}")
|
||||||
truncated_text = text[:8000] if len(text) > 8000 else text
|
|
||||||
|
|
||||||
while retry_count < max_retries:
|
def get_batch_embeddings(batch_texts):
|
||||||
try:
|
"""Get embeddings for a batch of texts."""
|
||||||
response = requests.post(
|
all_embeddings = []
|
||||||
f"{host}/api/embeddings",
|
failed_indices = []
|
||||||
json={"model": model_name, "prompt": truncated_text},
|
|
||||||
timeout=30,
|
|
||||||
)
|
|
||||||
response.raise_for_status()
|
|
||||||
|
|
||||||
result = response.json()
|
for i, text in enumerate(batch_texts):
|
||||||
embedding = result.get("embedding")
|
max_retries = 3
|
||||||
|
retry_count = 0
|
||||||
|
|
||||||
if embedding is None:
|
# Truncate very long texts to avoid API issues
|
||||||
raise ValueError(f"No embedding returned for text {idx}")
|
truncated_text = text[:8000] if len(text) > 8000 else text
|
||||||
|
while retry_count < max_retries:
|
||||||
return idx, embedding
|
|
||||||
|
|
||||||
except requests.exceptions.Timeout:
|
|
||||||
retry_count += 1
|
|
||||||
if retry_count >= max_retries:
|
|
||||||
logger.warning(f"Timeout for text {idx} after {max_retries} retries")
|
|
||||||
return idx, None
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
if retry_count >= max_retries - 1:
|
|
||||||
logger.error(f"Failed to get embedding for text {idx}: {e}")
|
|
||||||
return idx, None
|
|
||||||
retry_count += 1
|
|
||||||
|
|
||||||
return idx, None
|
|
||||||
|
|
||||||
# Determine if we should use concurrent processing
|
|
||||||
use_concurrent = (
|
|
||||||
len(texts) > 5 and not is_build
|
|
||||||
) # Don't use concurrent in build mode to avoid overwhelming
|
|
||||||
max_workers = min(4, len(texts)) # Limit concurrent requests to avoid overwhelming Ollama
|
|
||||||
|
|
||||||
all_embeddings = [None] * len(texts) # Pre-allocate list to maintain order
|
|
||||||
failed_indices = []
|
|
||||||
|
|
||||||
if use_concurrent:
|
|
||||||
logger.info(
|
|
||||||
f"Using concurrent processing with {max_workers} workers for {len(texts)} texts"
|
|
||||||
)
|
|
||||||
|
|
||||||
with ThreadPoolExecutor(max_workers=max_workers) as executor:
|
|
||||||
# Submit all tasks
|
|
||||||
future_to_idx = {
|
|
||||||
executor.submit(get_single_embedding, (text, idx)): idx
|
|
||||||
for idx, text in enumerate(texts)
|
|
||||||
}
|
|
||||||
|
|
||||||
# Add progress bar for concurrent processing
|
|
||||||
try:
|
|
||||||
if is_build or len(texts) > 10:
|
|
||||||
from tqdm import tqdm
|
|
||||||
|
|
||||||
futures_iterator = tqdm(
|
|
||||||
as_completed(future_to_idx),
|
|
||||||
total=len(texts),
|
|
||||||
desc="Computing Ollama embeddings",
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
futures_iterator = as_completed(future_to_idx)
|
|
||||||
except ImportError:
|
|
||||||
futures_iterator = as_completed(future_to_idx)
|
|
||||||
|
|
||||||
# Collect results as they complete
|
|
||||||
for future in futures_iterator:
|
|
||||||
try:
|
try:
|
||||||
idx, embedding = future.result()
|
response = requests.post(
|
||||||
if embedding is not None:
|
f"{host}/api/embeddings",
|
||||||
all_embeddings[idx] = embedding
|
json={"model": model_name, "prompt": truncated_text},
|
||||||
else:
|
timeout=30,
|
||||||
failed_indices.append(idx)
|
)
|
||||||
|
response.raise_for_status()
|
||||||
|
|
||||||
|
result = response.json()
|
||||||
|
embedding = result.get("embedding")
|
||||||
|
|
||||||
|
if embedding is None:
|
||||||
|
raise ValueError(f"No embedding returned for text {i}")
|
||||||
|
|
||||||
|
if not isinstance(embedding, list) or len(embedding) == 0:
|
||||||
|
raise ValueError(f"Invalid embedding format for text {i}")
|
||||||
|
|
||||||
|
all_embeddings.append(embedding)
|
||||||
|
break
|
||||||
|
|
||||||
|
except requests.exceptions.Timeout:
|
||||||
|
retry_count += 1
|
||||||
|
if retry_count >= max_retries:
|
||||||
|
logger.warning(f"Timeout for text {i} after {max_retries} retries")
|
||||||
|
failed_indices.append(i)
|
||||||
|
all_embeddings.append(None)
|
||||||
|
break
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
idx = future_to_idx[future]
|
retry_count += 1
|
||||||
logger.error(f"Exception for text {idx}: {e}")
|
if retry_count >= max_retries:
|
||||||
failed_indices.append(idx)
|
logger.error(f"Failed to get embedding for text {i}: {e}")
|
||||||
|
failed_indices.append(i)
|
||||||
|
all_embeddings.append(None)
|
||||||
|
break
|
||||||
|
return all_embeddings, failed_indices
|
||||||
|
|
||||||
|
# Process texts in batches
|
||||||
|
all_embeddings = []
|
||||||
|
all_failed_indices = []
|
||||||
|
|
||||||
|
# Setup progress bar if needed
|
||||||
|
show_progress = is_build or len(texts) > 10
|
||||||
|
try:
|
||||||
|
if show_progress:
|
||||||
|
from tqdm import tqdm
|
||||||
|
except ImportError:
|
||||||
|
show_progress = False
|
||||||
|
|
||||||
|
# Process batches
|
||||||
|
num_batches = (len(texts) + batch_size - 1) // batch_size
|
||||||
|
|
||||||
|
if show_progress:
|
||||||
|
batch_iterator = tqdm(range(num_batches), desc="Computing Ollama embeddings")
|
||||||
else:
|
else:
|
||||||
# Sequential processing with progress bar
|
batch_iterator = range(num_batches)
|
||||||
show_progress = is_build or len(texts) > 10
|
|
||||||
|
|
||||||
try:
|
for batch_idx in batch_iterator:
|
||||||
if show_progress:
|
start_idx = batch_idx * batch_size
|
||||||
from tqdm import tqdm
|
end_idx = min(start_idx + batch_size, len(texts))
|
||||||
|
batch_texts = texts[start_idx:end_idx]
|
||||||
|
|
||||||
iterator = tqdm(
|
batch_embeddings, batch_failed = get_batch_embeddings(batch_texts)
|
||||||
enumerate(texts), total=len(texts), desc="Computing Ollama embeddings"
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
iterator = enumerate(texts)
|
|
||||||
except ImportError:
|
|
||||||
iterator = enumerate(texts)
|
|
||||||
|
|
||||||
for idx, text in iterator:
|
# Adjust failed indices to global indices
|
||||||
result_idx, embedding = get_single_embedding((text, idx))
|
global_failed = [start_idx + idx for idx in batch_failed]
|
||||||
if embedding is not None:
|
all_failed_indices.extend(global_failed)
|
||||||
all_embeddings[idx] = embedding
|
all_embeddings.extend(batch_embeddings)
|
||||||
else:
|
|
||||||
failed_indices.append(idx)
|
|
||||||
|
|
||||||
# Handle failed embeddings
|
# Handle failed embeddings
|
||||||
if failed_indices:
|
if all_failed_indices:
|
||||||
if len(failed_indices) == len(texts):
|
if len(all_failed_indices) == len(texts):
|
||||||
raise RuntimeError("Failed to compute any embeddings")
|
raise RuntimeError("Failed to compute any embeddings")
|
||||||
|
|
||||||
logger.warning(f"Failed to compute embeddings for {len(failed_indices)}/{len(texts)} texts")
|
logger.warning(
|
||||||
|
f"Failed to compute embeddings for {len(all_failed_indices)}/{len(texts)} texts"
|
||||||
|
)
|
||||||
|
|
||||||
# Use zero embeddings as fallback for failed ones
|
# Use zero embeddings as fallback for failed ones
|
||||||
valid_embedding = next((e for e in all_embeddings if e is not None), None)
|
valid_embedding = next((e for e in all_embeddings if e is not None), None)
|
||||||
if valid_embedding:
|
if valid_embedding:
|
||||||
embedding_dim = len(valid_embedding)
|
embedding_dim = len(valid_embedding)
|
||||||
for idx in failed_indices:
|
for i, embedding in enumerate(all_embeddings):
|
||||||
all_embeddings[idx] = [0.0] * embedding_dim
|
if embedding is None:
|
||||||
|
all_embeddings[i] = [0.0] * embedding_dim
|
||||||
|
|
||||||
# Remove None values and convert to numpy array
|
# Remove None values
|
||||||
all_embeddings = [e for e in all_embeddings if e is not None]
|
all_embeddings = [e for e in all_embeddings if e is not None]
|
||||||
|
|
||||||
# Validate embedding dimensions before creating numpy array
|
if not all_embeddings:
|
||||||
if all_embeddings:
|
raise RuntimeError("No valid embeddings were computed")
|
||||||
expected_dim = len(all_embeddings[0])
|
|
||||||
inconsistent_dims = []
|
|
||||||
for i, embedding in enumerate(all_embeddings):
|
|
||||||
if len(embedding) != expected_dim:
|
|
||||||
inconsistent_dims.append((i, len(embedding)))
|
|
||||||
|
|
||||||
if inconsistent_dims:
|
# Validate embedding dimensions
|
||||||
error_msg = f"Ollama returned inconsistent embedding dimensions. Expected {expected_dim}, but got:\n"
|
expected_dim = len(all_embeddings[0])
|
||||||
for idx, dim in inconsistent_dims[:10]: # Show first 10 inconsistent ones
|
inconsistent_dims = []
|
||||||
error_msg += f" - Text {idx}: {dim} dimensions\n"
|
for i, embedding in enumerate(all_embeddings):
|
||||||
if len(inconsistent_dims) > 10:
|
if len(embedding) != expected_dim:
|
||||||
error_msg += f" ... and {len(inconsistent_dims) - 10} more\n"
|
inconsistent_dims.append((i, len(embedding)))
|
||||||
error_msg += (
|
|
||||||
f"\nThis is likely an Ollama API bug with model '{model_name}'. Please try:\n"
|
if inconsistent_dims:
|
||||||
)
|
error_msg = f"Ollama returned inconsistent embedding dimensions. Expected {expected_dim}, but got:\n"
|
||||||
error_msg += "1. Restart Ollama service: 'ollama serve'\n"
|
for idx, dim in inconsistent_dims[:10]: # Show first 10 inconsistent ones
|
||||||
error_msg += f"2. Re-pull the model: 'ollama pull {model_name}'\n"
|
error_msg += f" - Text {idx}: {dim} dimensions\n"
|
||||||
error_msg += (
|
if len(inconsistent_dims) > 10:
|
||||||
"3. Use sentence-transformers instead: --embedding-mode sentence-transformers\n"
|
error_msg += f" ... and {len(inconsistent_dims) - 10} more\n"
|
||||||
)
|
error_msg += f"\nThis is likely an Ollama API bug with model '{model_name}'. Please try:\n"
|
||||||
error_msg += "4. Report this issue to Ollama: https://github.com/ollama/ollama/issues"
|
error_msg += "1. Restart Ollama service: 'ollama serve'\n"
|
||||||
raise ValueError(error_msg)
|
error_msg += f"2. Re-pull the model: 'ollama pull {model_name}'\n"
|
||||||
|
error_msg += (
|
||||||
|
"3. Use sentence-transformers instead: --embedding-mode sentence-transformers\n"
|
||||||
|
)
|
||||||
|
error_msg += "4. Report this issue to Ollama: https://github.com/ollama/ollama/issues"
|
||||||
|
raise ValueError(error_msg)
|
||||||
|
|
||||||
# Convert to numpy array and normalize
|
# Convert to numpy array and normalize
|
||||||
embeddings = np.array(all_embeddings, dtype=np.float32)
|
embeddings = np.array(all_embeddings, dtype=np.float32)
|
||||||
|
|||||||
@@ -45,6 +45,42 @@ leann build my-project --docs ./
|
|||||||
claude
|
claude
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## 🚀 Advanced Usage Examples
|
||||||
|
|
||||||
|
### Index Entire Git Repository
|
||||||
|
```bash
|
||||||
|
# Index all tracked files in your git repository. Note: submodules are currently skipped; support for them can easily be added back if needed.
|
||||||
|
leann build my-repo --docs $(git ls-files) --embedding-mode sentence-transformers --embedding-model all-MiniLM-L6-v2 --backend hnsw
|
||||||
|
|
||||||
|
# Index only specific file types from git
|
||||||
|
leann build my-python-code --docs $(git ls-files "*.py") --embedding-mode sentence-transformers --embedding-model all-MiniLM-L6-v2 --backend hnsw
|
||||||
|
```
|
||||||
|
|
||||||
|
### Multiple Directories and Files
|
||||||
|
```bash
|
||||||
|
# Index multiple directories
|
||||||
|
leann build my-codebase --docs ./src ./tests ./docs ./config --embedding-mode sentence-transformers --embedding-model all-MiniLM-L6-v2 --backend hnsw
|
||||||
|
|
||||||
|
# Mix files and directories
|
||||||
|
leann build my-project --docs ./README.md ./src/ ./package.json ./docs/ --embedding-mode sentence-transformers --embedding-model all-MiniLM-L6-v2 --backend hnsw
|
||||||
|
|
||||||
|
# Specific files only
|
||||||
|
leann build my-configs --docs ./tsconfig.json ./package.json ./webpack.config.js --embedding-mode sentence-transformers --embedding-model all-MiniLM-L6-v2 --backend hnsw
|
||||||
|
```
|
||||||
|
|
||||||
|
### Advanced Git Integration
|
||||||
|
```bash
|
||||||
|
# Index recently modified files
|
||||||
|
leann build recent-changes --docs $(git diff --name-only HEAD~10..HEAD) --embedding-mode sentence-transformers --embedding-model all-MiniLM-L6-v2 --backend hnsw
|
||||||
|
|
||||||
|
# Index files matching specific patterns
|
||||||
|
leann build frontend --docs $(git ls-files "*.tsx" "*.ts" "*.jsx" "*.js") --embedding-mode sentence-transformers --embedding-model all-MiniLM-L6-v2 --backend hnsw
|
||||||
|
|
||||||
|
# Index documentation and config files
|
||||||
|
leann build docs-and-configs --docs $(git ls-files "*.md" "*.yml" "*.yaml" "*.json" "*.toml") --embedding-mode sentence-transformers --embedding-model all-MiniLM-L6-v2 --backend hnsw
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
**Try this in Claude Code:**
|
**Try this in Claude Code:**
|
||||||
```
|
```
|
||||||
Help me understand this codebase. List available indexes and search for authentication patterns.
|
Help me understand this codebase. List available indexes and search for authentication patterns.
|
||||||
|
|||||||
Reference in New Issue
Block a user