improve CLI with auto project name and .gitignore support

- Make index_name optional, auto-use current directory name
- Read .gitignore patterns and respect them during indexing
- Add _read_gitignore_patterns() to parse .gitignore files
- Add _should_exclude_file() for pattern matching
- Apply exclusion patterns to both PDF and general file processing
- Show helpful messages about gitignore usage

Now users can simply run: leann build
And it will use the project name and respect .gitignore patterns.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-08-09 19:38:38 -07:00
parent 1e5d05e36a
commit 38ec6aae11
1 changed files with 105 additions and 24 deletions
--- a/packages/leann-core/src/leann/cli.py
+++ b/packages/leann-core/src/leann/cli.py
@@ -86,7 +86,9 @@ Examples:
        # Build command
        build_parser = subparsers.add_parser("build", help="Build document index")
-        build_parser.add_argument("index_name", help="Index name")
+        build_parser.add_argument(
            "index_name", nargs="?", help="Index name (default: current directory name)"
        )
        build_parser.add_argument(
            "--docs", type=str, default=".", help="Documents directory (default: current directory)"
        )
@@ -201,6 +203,63 @@ Examples:
        with open(global_registry, "w") as f:
            json.dump(projects, f, indent=2)
    def _read_gitignore_patterns(self, docs_dir: str) -> list[str]:
        """Build the list of exclusion patterns for a documents directory.

        The result always starts with the built-in entries ``.git`` and
        ``.DS_Store``. When ``<docs_dir>/.gitignore`` exists, every line that
        is neither blank nor a ``#`` comment is appended after whitespace
        stripping, with a single leading ``/`` removed.

        Args:
            docs_dir: Directory whose ``.gitignore`` should be consulted.

        Returns:
            The built-in patterns followed by those read from ``.gitignore``.
            A read failure is reported as a warning and whatever was collected
            up to that point is returned.
        """
        # Entries that are excluded no matter what .gitignore says.
        always_excluded = [".git", ".DS_Store"]
        collected = list(always_excluded)

        ignore_file = Path(docs_dir) / ".gitignore"
        if not ignore_file.exists():
            print("📋 No .gitignore found, using minimal exclusion patterns")
            return collected

        try:
            with open(ignore_file, encoding="utf-8") as handle:
                for raw_line in handle:
                    entry = raw_line.strip()
                    if not entry or entry.startswith("#"):
                        # Blank line or comment: nothing to record.
                        continue
                    # A leading slash is dropped so the entry is stored
                    # relative to docs_dir.
                    collected.append(entry[1:] if entry.startswith("/") else entry)
            loaded_count = len(collected) - len(always_excluded)
            print(f"📋 Loaded {loaded_count} patterns from .gitignore")
        except Exception as exc:
            print(f"Warning: Could not read .gitignore: {exc}")
        return collected
    def _should_exclude_file(self, relative_path: Path, exclude_patterns: list[str]) -> bool:
        """Check if a file should be excluded based on gitignore-style patterns.

        Implements a practical subset of gitignore matching:

        * A pattern without ``/`` (e.g. ``.DS_Store``, ``__pycache__``,
          ``*.pyc``) matches a file or directory name at any depth.
        * A pattern containing ``/`` (e.g. ``docs/build``) is matched against
          the path from the documents root; a match on any parent directory
          excludes everything beneath it.
        * A trailing ``/`` (e.g. ``node_modules/``) restricts the pattern to
          directories, so it is only tested against the file's parents.
        * ``*``, ``?`` and ``[...]`` are handled by ``fnmatch``.
        * Negation patterns (``!name``) are not supported and never exclude.

        Patterns arrive with any leading ``/`` already stripped by
        ``_read_gitignore_patterns``, so root-anchored single-name patterns
        cannot be told apart from unanchored ones and match at any depth.

        Args:
            relative_path: File path relative to the documents directory.
            exclude_patterns: Exclusion patterns to test against.

        Returns:
            True if any pattern matches the file or one of its parent
            directories, otherwise False.
        """
        import fnmatch

        # Compare with forward slashes so patterns also work on Windows,
        # where str(Path) would use backslashes.
        parts = Path(relative_path).as_posix().split("/")
        # Every ancestor of the file plus the file itself:
        # "a/b/c.pdf" -> ["a", "a/b", "a/b/c.pdf"].
        prefixes = ["/".join(parts[:depth]) for depth in range(1, len(parts) + 1)]

        for raw_pattern in exclude_patterns:
            if raw_pattern.startswith("!"):
                # Re-include rules cannot be expressed as an exclusion.
                continue

            directory_only = raw_pattern.endswith("/")
            pattern = raw_pattern.strip("/")
            if not pattern:
                continue

            # Directory-only patterns must not match the file entry itself.
            anchored = prefixes[:-1] if directory_only else prefixes
            if "/" in pattern:
                candidates = anchored
            else:
                # Slash-less patterns may also match any single path component.
                names = parts[:-1] if directory_only else parts
                candidates = anchored + names

            if any(fnmatch.fnmatch(candidate, pattern) for candidate in candidates):
                return True
        return False
    def list_indexes(self):
        print("Stored LEANN indexes:")
@@ -282,34 +341,49 @@ Examples:
        if custom_file_types:
            print(f"Using custom file types: {custom_file_types}")
-        # Try to use better PDF parsers first
+        # Read .gitignore patterns first
        exclude_patterns = self._read_gitignore_patterns(docs_dir)
        # Try to use better PDF parsers first, but only if PDFs are requested
        documents = []
        docs_path = Path(docs_dir)
-        for file_path in docs_path.rglob("*.pdf"):
+        # Check if we should process PDFs
-            print(f"Processing PDF: {file_path}")
+        should_process_pdfs = custom_file_types is None or ".pdf" in custom_file_types
-            # Try PyMuPDF first (best quality)
+        if should_process_pdfs:
-            text = extract_pdf_text_with_pymupdf(str(file_path))
+            for file_path in docs_path.rglob("*.pdf"):
-            if text is None:
+                # Check if file matches any exclude pattern
-                # Try pdfplumber
+                relative_path = file_path.relative_to(docs_path)
-                text = extract_pdf_text_with_pdfplumber(str(file_path))
+                if self._should_exclude_file(relative_path, exclude_patterns):
                    continue
-            if text:
+                print(f"Processing PDF: {file_path}")
                # Create a simple document structure
                from llama_index.core import Document
-                doc = Document(text=text, metadata={"source": str(file_path)})
+                # Try PyMuPDF first (best quality)
-                documents.append(doc)
+                text = extract_pdf_text_with_pymupdf(str(file_path))
-            else:
+                if text is None:
-                # Fallback to default reader
+                    # Try pdfplumber
-                print(f"Using default reader for {file_path}")
+                    text = extract_pdf_text_with_pdfplumber(str(file_path))
-                default_docs = SimpleDirectoryReader(
+
-                    str(file_path.parent),
+                if text:
-                    filename_as_id=True,
+                    # Create a simple document structure
-                    required_exts=[file_path.suffix],
+                    from llama_index.core import Document
-                ).load_data()
+
-                documents.extend(default_docs)
+                    doc = Document(text=text, metadata={"source": str(file_path)})
                    documents.append(doc)
                else:
                    # Fallback to default reader
                    print(f"Using default reader for {file_path}")
                    try:
                        default_docs = SimpleDirectoryReader(
                            str(file_path.parent),
                            filename_as_id=True,
                            required_exts=[file_path.suffix],
                        ).load_data()
                        documents.extend(default_docs)
                    except Exception as e:
                        print(f"Warning: Could not process {file_path}: {e}")
        # Load other file types with default reader
        if custom_file_types:
@@ -380,6 +454,7 @@ Examples:
                recursive=True,
                encoding="utf-8",
                required_exts=code_extensions,
                exclude=exclude_patterns,
            ).load_data(show_progress=True)
            documents.extend(other_docs)
        except ValueError as e:
@@ -454,7 +529,13 @@ Examples:
    async def build_index(self, args):
        docs_dir = args.docs
-        index_name = args.index_name
+        # Use current directory name if index_name not provided
        if args.index_name:
            index_name = args.index_name
        else:
            index_name = Path.cwd().name
            print(f"Using current directory name as index: '{index_name}'")
        index_dir = self.indexes_dir / index_name
        index_path = self.get_index_path(index_name)