From 38ec6aae113a93d945a1a3b7d6466359302543bf Mon Sep 17 00:00:00 2001
From: Andy Lee
Date: Sat, 9 Aug 2025 19:38:38 -0700
Subject: [PATCH] improve CLI with auto project name and .gitignore support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Make index_name optional, auto-use current directory name
- Read .gitignore patterns and respect them during indexing
- Add _read_gitignore_patterns() to parse .gitignore files
- Add _should_exclude_file() for pattern matching
- Apply exclusion patterns to both PDF and general file processing
- Show helpful messages about gitignore usage

Now users can simply run: leann build
And it will use project name + respect .gitignore patterns.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude
---
 packages/leann-core/src/leann/cli.py | 129 ++++++++++++++++++++++-----
 1 file changed, 105 insertions(+), 24 deletions(-)

diff --git a/packages/leann-core/src/leann/cli.py b/packages/leann-core/src/leann/cli.py
index f307204..5171afa 100644
--- a/packages/leann-core/src/leann/cli.py
+++ b/packages/leann-core/src/leann/cli.py
@@ -86,7 +86,9 @@ Examples:
 
         # Build command
         build_parser = subparsers.add_parser("build", help="Build document index")
-        build_parser.add_argument("index_name", help="Index name")
+        build_parser.add_argument(
+            "index_name", nargs="?", help="Index name (default: current directory name)"
+        )
         build_parser.add_argument(
             "--docs", type=str, default=".", help="Documents directory (default: current directory)"
         )
@@ -201,6 +203,63 @@ Examples:
         with open(global_registry, "w") as f:
             json.dump(projects, f, indent=2)
 
+    def _read_gitignore_patterns(self, docs_dir: str) -> list[str]:
+        """Read .gitignore file and return patterns for exclusion."""
+        gitignore_path = Path(docs_dir) / ".gitignore"
+        patterns = []
+
+        # Add some essential patterns that should always be excluded
+        essential_patterns = [
+            ".git",
+            ".DS_Store",
+        ]
+        patterns.extend(essential_patterns)
+
+        if gitignore_path.exists():
+            try:
+                with open(gitignore_path, encoding="utf-8") as f:
+                    for line in f:
+                        line = line.strip()
+                        # Skip empty lines and comments
+                        if line and not line.startswith("#"):
+                            # Remove leading slash if present (make it relative)
+                            if line.startswith("/"):
+                                line = line[1:]
+                            patterns.append(line)
+                print(
+                    f"📋 Loaded {len(patterns) - len(essential_patterns)} patterns from .gitignore"
+                )
+            except Exception as e:
+                print(f"Warning: Could not read .gitignore: {e}")
+        else:
+            print("📋 No .gitignore found, using minimal exclusion patterns")
+
+        return patterns
+
+    def _should_exclude_file(self, relative_path: Path, exclude_patterns: list[str]) -> bool:
+        """Check if a file should be excluded based on gitignore-style patterns."""
+        path_str = str(relative_path)
+
+        for pattern in exclude_patterns:
+            # Simple pattern matching (could be enhanced with full gitignore syntax)
+            if pattern.endswith("*"):
+                # Wildcard pattern
+                prefix = pattern[:-1]
+                if path_str.startswith(prefix):
+                    return True
+            elif "*" in pattern:
+                # Contains wildcard - simple glob-like matching
+                import fnmatch
+
+                if fnmatch.fnmatch(path_str, pattern):
+                    return True
+            else:
+                # Exact match or directory match
+                if path_str == pattern or path_str.startswith(pattern + "/"):
+                    return True
+
+        return False
+
     def list_indexes(self):
         print("Stored LEANN indexes:")
 
@@ -282,34 +341,49 @@ Examples:
         if custom_file_types:
             print(f"Using custom file types: {custom_file_types}")
 
-        # Try to use better PDF parsers first
+        # Read .gitignore patterns first
+        exclude_patterns = self._read_gitignore_patterns(docs_dir)
+
+        # Try to use better PDF parsers first, but only if PDFs are requested
         documents = []
         docs_path = Path(docs_dir)
 
-        for file_path in docs_path.rglob("*.pdf"):
-            print(f"Processing PDF: {file_path}")
+        # Check if we should process PDFs
+        should_process_pdfs = custom_file_types is None or ".pdf" in custom_file_types
 
-            # Try PyMuPDF first (best quality)
-            text = extract_pdf_text_with_pymupdf(str(file_path))
-            if text is None:
-                # Try pdfplumber
-                text = extract_pdf_text_with_pdfplumber(str(file_path))
+        if should_process_pdfs:
+            for file_path in docs_path.rglob("*.pdf"):
+                # Check if file matches any exclude pattern
+                relative_path = file_path.relative_to(docs_path)
+                if self._should_exclude_file(relative_path, exclude_patterns):
+                    continue
 
-            if text:
-                # Create a simple document structure
-                from llama_index.core import Document
+                print(f"Processing PDF: {file_path}")
 
-                doc = Document(text=text, metadata={"source": str(file_path)})
-                documents.append(doc)
-            else:
-                # Fallback to default reader
-                print(f"Using default reader for {file_path}")
-                default_docs = SimpleDirectoryReader(
-                    str(file_path.parent),
-                    filename_as_id=True,
-                    required_exts=[file_path.suffix],
-                ).load_data()
-                documents.extend(default_docs)
+                # Try PyMuPDF first (best quality)
+                text = extract_pdf_text_with_pymupdf(str(file_path))
+                if text is None:
+                    # Try pdfplumber
+                    text = extract_pdf_text_with_pdfplumber(str(file_path))
+
+                if text:
+                    # Create a simple document structure
+                    from llama_index.core import Document
+
+                    doc = Document(text=text, metadata={"source": str(file_path)})
+                    documents.append(doc)
+                else:
+                    # Fallback to default reader
+                    print(f"Using default reader for {file_path}")
+                    try:
+                        default_docs = SimpleDirectoryReader(
+                            str(file_path.parent),
+                            filename_as_id=True,
+                            required_exts=[file_path.suffix],
+                        ).load_data()
+                        documents.extend(default_docs)
+                    except Exception as e:
+                        print(f"Warning: Could not process {file_path}: {e}")
 
         # Load other file types with default reader
         if custom_file_types:
@@ -380,6 +454,7 @@ Examples:
                 recursive=True,
                 encoding="utf-8",
                 required_exts=code_extensions,
+                exclude=exclude_patterns,
             ).load_data(show_progress=True)
             documents.extend(other_docs)
         except ValueError as e:
@@ -454,7 +529,13 @@ Examples:
 
     async def build_index(self, args):
         docs_dir = args.docs
-        index_name = args.index_name
+        # Use current directory name if index_name not provided
+        if args.index_name:
+            index_name = args.index_name
+        else:
+            index_name = Path.cwd().name
+            print(f"Using current directory name as index: '{index_name}'")
+
         index_dir = self.indexes_dir / index_name
         index_path = self.get_index_path(index_name)
 