From 38ec6aae113a93d945a1a3b7d6466359302543bf Mon Sep 17 00:00:00 2001
From: Andy Lee <andylizf@outlook.com>
Date: Sat, 9 Aug 2025 19:38:38 -0700
Subject: [PATCH] improve CLI with auto project name and .gitignore support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Make index_name optional, auto-use current directory name
- Read .gitignore patterns and respect them during indexing
- Add _read_gitignore_patterns() to parse .gitignore files
- Add _should_exclude_file() for pattern matching
- Apply exclusion patterns to both PDF and general file processing
- Show helpful messages about gitignore usage

Now users can simply run: leann build
It will use the current directory name as the index name and respect
.gitignore patterns.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
---
 packages/leann-core/src/leann/cli.py | 129 ++++++++++++++++++++++-----
 1 file changed, 105 insertions(+), 24 deletions(-)

diff --git a/packages/leann-core/src/leann/cli.py b/packages/leann-core/src/leann/cli.py
index f307204..5171afa 100644
--- a/packages/leann-core/src/leann/cli.py
+++ b/packages/leann-core/src/leann/cli.py
@@ -86,7 +86,9 @@ Examples:
 
         # Build command
         build_parser = subparsers.add_parser("build", help="Build document index")
-        build_parser.add_argument("index_name", help="Index name")
+        build_parser.add_argument(
+            "index_name", nargs="?", help="Index name (default: current directory name)"
+        )
         build_parser.add_argument(
             "--docs", type=str, default=".", help="Documents directory (default: current directory)"
         )
@@ -201,6 +203,63 @@ Examples:
         with open(global_registry, "w") as f:
             json.dump(projects, f, indent=2)
 
+    def _read_gitignore_patterns(self, docs_dir: str) -> list[str]:
+        """Return exclusion patterns: built-in defaults plus ``docs_dir``/.gitignore.
+
+        Blank lines and "#" comments are skipped, and a leading "/" is dropped so
+        patterns are relative to ``docs_dir``. "!" re-include rules and patterns
+        that end up empty (a bare "/") are ignored: neither can be expressed as
+        an exclusion. A missing or unreadable file yields only the defaults
+        (plus anything read before the failure).
+        """
+        gitignore_path = Path(docs_dir) / ".gitignore"
+        # Always excluded, with or without a .gitignore.
+        essential_patterns = [".git", ".DS_Store"]
+        patterns = list(essential_patterns)
+
+        if not gitignore_path.is_file():
+            print("📋 No .gitignore found, using minimal exclusion patterns")
+            return patterns
+
+        try:
+            with open(gitignore_path, encoding="utf-8") as f:
+                for raw_line in f:
+                    line = raw_line.strip()
+                    if not line or line.startswith(("#", "!")):
+                        continue
+                    pattern = line[1:] if line.startswith("/") else line
+                    if pattern:
+                        patterns.append(pattern)
+            loaded = len(patterns) - len(essential_patterns)
+            print(f"📋 Loaded {loaded} patterns from .gitignore")
+        except (OSError, UnicodeError) as e:
+            print(f"Warning: Could not read .gitignore: {e}")
+        return patterns
+
+    def _should_exclude_file(self, relative_path: Path, exclude_patterns: list[str]) -> bool:
+        """Return True if ``relative_path`` matches any gitignore-style pattern.
+
+        Simplified gitignore subset: a pattern without "/" is matched against each
+        path component (so it applies at any depth); a pattern with "/" is matched
+        against the path and each of its leading directories, relative to the
+        docs root. A trailing "/" is ignored and "!" negations are skipped.
+        """
+        import fnmatch
+
+        parts = Path(relative_path).parts
+        # Leading sub-paths ("a", "a/b", "a/b/c.txt"), always "/"-separated.
+        prefixes = ["/".join(parts[:end]) for end in range(1, len(parts) + 1)]
+
+        for raw_pattern in exclude_patterns:
+            pattern = raw_pattern.rstrip("/")
+            if not pattern or pattern.startswith("!"):
+                continue  # blank, or a re-include rule we cannot honor
+            # Matching a directory component/prefix excludes everything below it.
+            candidates = prefixes if "/" in pattern else parts
+            if any(fnmatch.fnmatch(name, pattern) for name in candidates):
+                return True
+        return False
+
     def list_indexes(self):
         print("Stored LEANN indexes:")
 
@@ -282,34 +341,49 @@ Examples:
         if custom_file_types:
             print(f"Using custom file types: {custom_file_types}")
 
-        # Try to use better PDF parsers first
+        # Read .gitignore patterns first
+        exclude_patterns = self._read_gitignore_patterns(docs_dir)
+
+        # Try to use better PDF parsers first, but only if PDFs are requested
         documents = []
         docs_path = Path(docs_dir)
 
-        for file_path in docs_path.rglob("*.pdf"):
-            print(f"Processing PDF: {file_path}")
+        # Check if we should process PDFs
+        should_process_pdfs = custom_file_types is None or ".pdf" in custom_file_types
 
-            # Try PyMuPDF first (best quality)
-            text = extract_pdf_text_with_pymupdf(str(file_path))
-            if text is None:
-                # Try pdfplumber
-                text = extract_pdf_text_with_pdfplumber(str(file_path))
+        if should_process_pdfs:
+            for file_path in docs_path.rglob("*.pdf"):
+                # Check if file matches any exclude pattern
+                relative_path = file_path.relative_to(docs_path)
+                if self._should_exclude_file(relative_path, exclude_patterns):
+                    continue
 
-            if text:
-                # Create a simple document structure
-                from llama_index.core import Document
+                print(f"Processing PDF: {file_path}")
 
-                doc = Document(text=text, metadata={"source": str(file_path)})
-                documents.append(doc)
-            else:
-                # Fallback to default reader
-                print(f"Using default reader for {file_path}")
-                default_docs = SimpleDirectoryReader(
-                    str(file_path.parent),
-                    filename_as_id=True,
-                    required_exts=[file_path.suffix],
-                ).load_data()
-                documents.extend(default_docs)
+                # Try PyMuPDF first (best quality)
+                text = extract_pdf_text_with_pymupdf(str(file_path))
+                if text is None:
+                    # Try pdfplumber
+                    text = extract_pdf_text_with_pdfplumber(str(file_path))
+
+                if text:
+                    # Create a simple document structure
+                    from llama_index.core import Document
+
+                    doc = Document(text=text, metadata={"source": str(file_path)})
+                    documents.append(doc)
+                else:
+                    # Fallback to default reader
+                    print(f"Using default reader for {file_path}")
+                    try:
+                        default_docs = SimpleDirectoryReader(
+                            str(file_path.parent),
+                            filename_as_id=True,
+                            required_exts=[file_path.suffix],
+                        ).load_data()
+                        documents.extend(default_docs)
+                    except Exception as e:
+                        print(f"Warning: Could not process {file_path}: {e}")
 
         # Load other file types with default reader
         if custom_file_types:
@@ -380,6 +454,7 @@ Examples:
                 recursive=True,
                 encoding="utf-8",
                 required_exts=code_extensions,
+                exclude=exclude_patterns,
             ).load_data(show_progress=True)
             documents.extend(other_docs)
         except ValueError as e:
@@ -454,7 +529,13 @@ Examples:
 
     async def build_index(self, args):
         docs_dir = args.docs
-        index_name = args.index_name
+        # Use current directory name if index_name not provided
+        if args.index_name:
+            index_name = args.index_name
+        else:
+            index_name = Path.cwd().name
+            print(f"Using current directory name as index: '{index_name}'")
+
         index_dir = self.indexes_dir / index_name
         index_path = self.get_index_path(index_name)