feat: support multiple input formats for --docs argument

- Add support for multiple directories: --docs ./src ./tests ./config - Add support for individual files: --docs ./file1.py ./file2.txt - Add support for mixed files and directories: --docs ./README.md ./src/ ./config.json - Add git ls-files integration: --docs $(git ls-files) - Add git submodule detection and skip logic to avoid indexing third-party dependencies - Add comprehensive error handling for path resolution issues - Update MCP README with advanced usage examples including git integration - Fix ruff linting issues with closure variable binding Breaking changes: None - fully backward compatible with existing single directory usage Examples: leann build my-repo --docs $(git ls-files) --embedding-mode sentence-transformers --embedding-model all-MiniLM-L6-v2 --backend hnsw leann build my-code --docs ./src ./tests ./docs --embedding-mode sentence-transformers --embedding-model all-MiniLM-L6-v2 --backend hnsw leann build my-configs --docs ./package.json ./tsconfig.json ./webpack.config.js --embedding-mode sentence-transformers --embedding-model all-MiniLM-L6-v2 --backend hnsw
2025-08-12 02:04:44 -07:00
3 changed files with 267 additions and 84 deletions
@@ -468,7 +468,7 @@ leann --help
 ### Usage Examples
 ```bash
-# build from a specific directory, and my_docs is the index name
+# Build from a specific directory; my-docs is the index name (you can also pass multiple directories and/or individual files)
 leann build my-docs --docs ./your_documents
 # Search your documents
@@ -5,6 +5,7 @@ from typing import Union
 from llama_index.core import SimpleDirectoryReader
 from llama_index.core.node_parser import SentenceSplitter
 from tqdm import tqdm
 from .api import LeannBuilder, LeannChat, LeannSearcher
@@ -75,11 +76,14 @@ class LeannCLI:
            formatter_class=argparse.RawDescriptionHelpFormatter,
            epilog="""
 Examples:
-  leann build my-docs --docs ./documents                    # Build index named my-docs
+  leann build my-docs --docs ./documents                                  # Build index from directory
-  leann build my-ppts --docs ./ --file-types .pptx,.pdf    # Index only PowerPoint and PDF files
+  leann build my-code --docs ./src ./tests ./config                      # Build index from multiple directories
-  leann search my-docs "query"                             # Search in my-docs index
+  leann build my-files --docs ./file1.py ./file2.txt ./docs/             # Build index from files and directories
-  leann ask my-docs "question"                             # Ask my-docs index
+  leann build my-mixed --docs ./readme.md ./src/ ./config.json           # Build index from mixed files/dirs
-  leann list                                              # List all stored indexes
+  leann build my-ppts --docs ./ --file-types .pptx,.pdf                  # Index only PowerPoint and PDF files
  leann search my-docs "query"                                           # Search in my-docs index
  leann ask my-docs "question"                                           # Ask my-docs index
  leann list                                                             # List all stored indexes
            """,
        )
@@ -91,7 +95,11 @@ Examples:
            "index_name", nargs="?", help="Index name (default: current directory name)"
        )
        build_parser.add_argument(
-            "--docs", type=str, default=".", help="Documents directory (default: current directory)"
+            "--docs",
            type=str,
            nargs="+",
            default=["."],
            help="Documents directories and/or files (default: current directory)",
        )
        build_parser.add_argument(
            "--backend", type=str, default="hnsw", choices=["hnsw", "diskann"]
@@ -235,6 +243,32 @@ Examples:
        """Check if a file should be excluded using gitignore parser."""
        return gitignore_matches(str(relative_path))
    def _is_git_submodule(self, path: Path) -> bool:
        """Check whether ``path`` is registered as a git submodule.

        The enclosing repository is located by walking upwards from the
        current working directory until a directory containing ``.git`` is
        found. ``path`` is then resolved, made relative to that repository
        root, and compared against every ``path = ...`` entry in the root's
        ``.gitmodules`` file.

        Args:
            path: Directory (or file) to test; may be relative to the
                current working directory.

        Returns:
            True only when ``path`` exactly equals a submodule path listed
            in ``.gitmodules``. False when no repository or no
            ``.gitmodules`` is found, when ``path`` lies outside the
            repository, or when the lookup fails for filesystem reasons.
        """
        try:
            # Resolve the working directory so it is comparable with the
            # resolved ``path`` below even when reached through a symlink.
            cwd = Path.cwd().resolve()

            # Innermost directory at or above cwd that holds a ``.git`` entry.
            repo_root = next(
                (d for d in (cwd, *cwd.parents) if (d / ".git").exists()),
                None,
            )
            if repo_root is None:
                return False

            gitmodules_path = repo_root / ".gitmodules"
            if not gitmodules_path.exists():
                return False

            try:
                relative_path = path.resolve().relative_to(repo_root)
            except ValueError:
                # Path is not under the git root.
                return False

            # .gitmodules always uses forward slashes, so compare in POSIX form.
            target = relative_path.as_posix()

            # Compare whole values rather than substrings: a substring test
            # would treat "vendor/lib" as matching "path = vendor/lib-extra",
            # and "." as matching any submodule path that starts with a dot.
            for line in gitmodules_path.read_text(encoding="utf-8").splitlines():
                key, sep, value = line.partition("=")
                if not sep or key.strip() != "path":
                    continue
                if value.strip().strip('"').rstrip("/") == target:
                    return True
            return False
        except (OSError, ValueError, RuntimeError):
            # Best effort: if the filesystem or the file contents cannot be
            # read, treat the path as a regular (non-submodule) directory.
            return False
    def list_indexes(self):
        print("Stored LEANN indexes:")
@@ -264,7 +298,9 @@ Examples:
            valid_projects.append(current_path)
        if not valid_projects:
-            print("No indexes found. Use 'leann build <name> --docs <dir>' to create one.")
+            print(
                "No indexes found. Use 'leann build <name> --docs <dir> [<dir2> ...]' to create one."
            )
            return
        total_indexes = 0
@@ -311,56 +347,88 @@ Examples:
                    print(f'  leann search {example_name} "your query"')
                    print(f"  leann ask {example_name} --interactive")
-    def load_documents(self, docs_dir: str, custom_file_types: Union[str, None] = None):
+    def load_documents(
-        print(f"Loading documents from {docs_dir}...")
+        self, docs_paths: Union[str, list], custom_file_types: Union[str, None] = None
    ):
        # Handle both single path (string) and multiple paths (list) for backward compatibility
        if isinstance(docs_paths, str):
            docs_paths = [docs_paths]
        # Separate files and directories
        files = []
        directories = []
        for path in docs_paths:
            path_obj = Path(path)
            if path_obj.is_file():
                files.append(str(path_obj))
            elif path_obj.is_dir():
                # Check if this is a git submodule - if so, skip it
                if self._is_git_submodule(path_obj):
                    print(f"⚠️  Skipping git submodule: {path}")
                    continue
                directories.append(str(path_obj))
            else:
                print(f"⚠️  Warning: Path '{path}' does not exist, skipping...")
                continue
        # Print summary of what we're processing
        total_items = len(files) + len(directories)
        items_desc = []
        if files:
            items_desc.append(f"{len(files)} file{'s' if len(files) > 1 else ''}")
        if directories:
            items_desc.append(
                f"{len(directories)} director{'ies' if len(directories) > 1 else 'y'}"
            )
        print(f"Loading documents from {' and '.join(items_desc)} ({total_items} total):")
        if files:
            print(f"  📄 Files: {', '.join([Path(f).name for f in files])}")
        if directories:
            print(f"  📁 Directories: {', '.join(directories)}")
        if custom_file_types:
            print(f"Using custom file types: {custom_file_types}")
-        # Build gitignore parser
+        all_documents = []
        gitignore_matches = self._build_gitignore_parser(docs_dir)
-        # Try to use better PDF parsers first, but only if PDFs are requested
+        # First, process individual files if any
-        documents = []
+        if files:
-        docs_path = Path(docs_dir)
+            print(f"\n🔄 Processing {len(files)} individual file{'s' if len(files) > 1 else ''}...")
-        # Check if we should process PDFs
+            # Load individual files using SimpleDirectoryReader with input_files
-        should_process_pdfs = custom_file_types is None or ".pdf" in custom_file_types
+            # Note: We skip gitignore filtering for explicitly specified files
            try:
                # Group files by their parent directory for efficient loading
                from collections import defaultdict
-        if should_process_pdfs:
+                files_by_dir = defaultdict(list)
-            for file_path in docs_path.rglob("*.pdf"):
+                for file_path in files:
-                # Check if file matches any exclude pattern
+                    parent_dir = str(Path(file_path).parent)
-                relative_path = file_path.relative_to(docs_path)
+                    files_by_dir[parent_dir].append(file_path)
                if self._should_exclude_file(relative_path, gitignore_matches):
                    continue
-                print(f"Processing PDF: {file_path}")
+                # Load files from each parent directory
-
+                for parent_dir, file_list in files_by_dir.items():
-                # Try PyMuPDF first (best quality)
+                    print(
-                text = extract_pdf_text_with_pymupdf(str(file_path))
+                        f"  Loading {len(file_list)} file{'s' if len(file_list) > 1 else ''} from {parent_dir}"
-                if text is None:
+                    )
                    # Try pdfplumber
                    text = extract_pdf_text_with_pdfplumber(str(file_path))
                if text:
                    # Create a simple document structure
                    from llama_index.core import Document
                    doc = Document(text=text, metadata={"source": str(file_path)})
                    documents.append(doc)
                else:
                    # Fallback to default reader
                    print(f"Using default reader for {file_path}")
                    try:
-                        default_docs = SimpleDirectoryReader(
+                        file_docs = SimpleDirectoryReader(
-                            str(file_path.parent),
+                            parent_dir,
                            input_files=file_list,
                            filename_as_id=True,
                            required_exts=[file_path.suffix],
                        ).load_data()
-                        documents.extend(default_docs)
+                        all_documents.extend(file_docs)
                        print(
                            f"    ✅ Loaded {len(file_docs)} document{'s' if len(file_docs) > 1 else ''}"
                        )
                    except Exception as e:
-                        print(f"Warning: Could not process {file_path}: {e}")
+                        print(f"    ❌ Warning: Could not load files from {parent_dir}: {e}")
-        # Load other file types with default reader
+            except Exception as e:
                print(f"❌ Error processing individual files: {e}")
        # Define file extensions to process
        if custom_file_types:
            # Parse custom file types from comma-separated string
            code_extensions = [ext.strip() for ext in custom_file_types.split(",") if ext.strip()]
@@ -422,41 +490,106 @@ Examples:
                ".py",
                ".jl",
            ]
        # Try to load other file types, but don't fail if none are found
        try:
            # Create a custom file filter function using our PathSpec
            def file_filter(file_path: str) -> bool:
                """Return True if file should be included (not excluded)"""
                try:
                    docs_path_obj = Path(docs_dir)
                    file_path_obj = Path(file_path)
                    relative_path = file_path_obj.relative_to(docs_path_obj)
                    return not self._should_exclude_file(relative_path, gitignore_matches)
                except (ValueError, OSError):
                    return True  # Include files that can't be processed
-            other_docs = SimpleDirectoryReader(
+        # Process each directory
-                docs_dir,
+        if directories:
-                recursive=True,
+            print(
-                encoding="utf-8",
+                f"\n🔄 Processing {len(directories)} director{'ies' if len(directories) > 1 else 'y'}..."
-                required_exts=code_extensions,
+            )
                file_extractor={},  # Use default extractors
                filename_as_id=True,
            ).load_data(show_progress=True)
-            # Filter documents after loading based on gitignore rules
+        for docs_dir in directories:
-            filtered_docs = []
+            print(f"Processing directory: {docs_dir}")
-            for doc in other_docs:
+            # Build gitignore parser for each directory
-                file_path = doc.metadata.get("file_path", "")
+            gitignore_matches = self._build_gitignore_parser(docs_dir)
                if file_filter(file_path):
                    filtered_docs.append(doc)
-            documents.extend(filtered_docs)
+            # Try to use better PDF parsers first, but only if PDFs are requested
-        except ValueError as e:
+            documents = []
-            if "No files found" in str(e):
+            docs_path = Path(docs_dir)
-                print("No additional files found for other supported types.")
+
-            else:
+            # Check if we should process PDFs
-                raise e
+            should_process_pdfs = custom_file_types is None or ".pdf" in custom_file_types
            if should_process_pdfs:
                for file_path in docs_path.rglob("*.pdf"):
                    # Check if file matches any exclude pattern
                    try:
                        relative_path = file_path.relative_to(docs_path)
                        if self._should_exclude_file(relative_path, gitignore_matches):
                            continue
                    except ValueError:
                        # Skip files that can't be made relative to docs_path
                        print(f"⚠️  Skipping file outside directory scope: {file_path}")
                        continue
                    print(f"Processing PDF: {file_path}")
                    # Try PyMuPDF first (best quality)
                    text = extract_pdf_text_with_pymupdf(str(file_path))
                    if text is None:
                        # Try pdfplumber
                        text = extract_pdf_text_with_pdfplumber(str(file_path))
                    if text:
                        # Create a simple document structure
                        from llama_index.core import Document
                        doc = Document(text=text, metadata={"source": str(file_path)})
                        documents.append(doc)
                    else:
                        # Fallback to default reader
                        print(f"Using default reader for {file_path}")
                        try:
                            default_docs = SimpleDirectoryReader(
                                str(file_path.parent),
                                filename_as_id=True,
                                required_exts=[file_path.suffix],
                            ).load_data()
                            documents.extend(default_docs)
                        except Exception as e:
                            print(f"Warning: Could not process {file_path}: {e}")
            # Load other file types with default reader
            try:
                # Create a custom file filter function using our PathSpec
                def file_filter(
                    file_path: str, docs_dir=docs_dir, gitignore_matches=gitignore_matches
                ) -> bool:
                    """Return True if file should be included (not excluded)"""
                    try:
                        docs_path_obj = Path(docs_dir)
                        file_path_obj = Path(file_path)
                        relative_path = file_path_obj.relative_to(docs_path_obj)
                        return not self._should_exclude_file(relative_path, gitignore_matches)
                    except (ValueError, OSError):
                        return True  # Include files that can't be processed
                other_docs = SimpleDirectoryReader(
                    docs_dir,
                    recursive=True,
                    encoding="utf-8",
                    required_exts=code_extensions,
                    file_extractor={},  # Use default extractors
                    filename_as_id=True,
                ).load_data(show_progress=True)
                # Filter documents after loading based on gitignore rules
                filtered_docs = []
                for doc in other_docs:
                    file_path = doc.metadata.get("file_path", "")
                    if file_filter(file_path):
                        filtered_docs.append(doc)
                documents.extend(filtered_docs)
            except ValueError as e:
                if "No files found" in str(e):
                    print(f"No additional files found for other supported types in {docs_dir}.")
                else:
                    raise e
            all_documents.extend(documents)
            print(f"Loaded {len(documents)} documents from {docs_dir}")
        documents = all_documents
        all_texts = []
@@ -507,7 +640,9 @@ Examples:
            ".jl",
        }
-        for doc in documents:
+        print("start chunking documents")
        # Add progress bar for document chunking
        for doc in tqdm(documents, desc="Chunking documents", unit="doc"):
            # Check if this is a code file based on source path
            source_path = doc.metadata.get("source", "")
            is_code_file = any(source_path.endswith(ext) for ext in code_file_exts)
@@ -523,7 +658,7 @@ Examples:
        return all_texts
    async def build_index(self, args):
-        docs_dir = args.docs
+        docs_paths = args.docs
        # Use current directory name if index_name not provided
        if args.index_name:
            index_name = args.index_name
@@ -534,13 +669,25 @@ Examples:
        index_dir = self.indexes_dir / index_name
        index_path = self.get_index_path(index_name)
-        print(f"📂 Indexing: {Path(docs_dir).resolve()}")
+        # Display all paths being indexed with file/directory distinction
        files = [p for p in docs_paths if Path(p).is_file()]
        directories = [p for p in docs_paths if Path(p).is_dir()]
        print(f"📂 Indexing {len(docs_paths)} path{'s' if len(docs_paths) > 1 else ''}:")
        if files:
            print(f"  📄 Files ({len(files)}):")
            for i, file_path in enumerate(files, 1):
                print(f"    {i}. {Path(file_path).resolve()}")
        if directories:
            print(f"  📁 Directories ({len(directories)}):")
            for i, dir_path in enumerate(directories, 1):
                print(f"    {i}. {Path(dir_path).resolve()}")
        if index_dir.exists() and not args.force:
            print(f"Index '{index_name}' already exists. Use --force to rebuild.")
            return
-        all_texts = self.load_documents(docs_dir, args.file_types)
+        all_texts = self.load_documents(docs_paths, args.file_types)
        if not all_texts:
            print("No documents found")
            return
@@ -576,7 +723,7 @@ Examples:
        if not self.index_exists(index_name):
            print(
-                f"Index '{index_name}' not found. Use 'leann build {index_name} --docs <dir>' to create it."
+                f"Index '{index_name}' not found. Use 'leann build {index_name} --docs <dir> [<dir2> ...]' to create it."
            )
            return
@@ -603,7 +750,7 @@ Examples:
        if not self.index_exists(index_name):
            print(
-                f"Index '{index_name}' not found. Use 'leann build {index_name} --docs <dir>' to create it."
+                f"Index '{index_name}' not found. Use 'leann build {index_name} --docs <dir> [<dir2> ...]' to create it."
            )
            return
@@ -45,6 +45,42 @@ leann build my-project --docs ./
 claude
 ```
 ## 🚀 Advanced Usage Examples
 ### Index Entire Git Repository
 ```bash
 # Index all tracked files in your git repository. Note: git submodules are currently skipped; support for indexing them can easily be added back if needed.
 leann build my-repo --docs $(git ls-files) --embedding-mode sentence-transformers --embedding-model all-MiniLM-L6-v2 --backend hnsw
 # Index only specific file types from git
 leann build my-python-code --docs $(git ls-files "*.py") --embedding-mode sentence-transformers --embedding-model all-MiniLM-L6-v2 --backend hnsw
 ```
 ### Multiple Directories and Files
 ```bash
 # Index multiple directories
 leann build my-codebase --docs ./src ./tests ./docs ./config --embedding-mode sentence-transformers --embedding-model all-MiniLM-L6-v2 --backend hnsw
 # Mix files and directories
 leann build my-project --docs ./README.md ./src/ ./package.json ./docs/ --embedding-mode sentence-transformers --embedding-model all-MiniLM-L6-v2 --backend hnsw
 # Specific files only
 leann build my-configs --docs ./tsconfig.json ./package.json ./webpack.config.js --embedding-mode sentence-transformers --embedding-model all-MiniLM-L6-v2 --backend hnsw
 ```
 ### Advanced Git Integration
 ```bash
 # Index recently modified files
 leann build recent-changes --docs $(git diff --name-only HEAD~10..HEAD) --embedding-mode sentence-transformers --embedding-model all-MiniLM-L6-v2 --backend hnsw
 # Index files matching pattern
 leann build frontend --docs $(git ls-files "*.tsx" "*.ts" "*.jsx" "*.js") --embedding-mode sentence-transformers --embedding-model all-MiniLM-L6-v2 --backend hnsw
 # Index documentation and config files
 leann build docs-and-configs --docs $(git ls-files "*.md" "*.yml" "*.yaml" "*.json" "*.toml") --embedding-mode sentence-transformers --embedding-model all-MiniLM-L6-v2 --backend hnsw
 ```
 **Try this in Claude Code:**
 ```
 Help me understand this codebase. List available indexes and search for authentication patterns.