improve CLI with auto project name and .gitignore support
Changes:
- Make `index_name` optional; when omitted, the current directory name is used.
- Read `.gitignore` patterns and respect them during indexing.
- Add `_read_gitignore_patterns()` to parse `.gitignore` files.
- Add `_should_exclude_file()` for pattern matching.
- Apply the exclusion patterns to both PDF and general file processing.
- Show helpful messages about `.gitignore` usage.

Users can now simply run `leann build`, and it will use the project (directory) name as the index name and respect `.gitignore` patterns.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -86,7 +86,9 @@ Examples:
|
||||
|
||||
# Build command
|
||||
build_parser = subparsers.add_parser("build", help="Build document index")
|
||||
build_parser.add_argument("index_name", help="Index name")
|
||||
build_parser.add_argument(
|
||||
"index_name", nargs="?", help="Index name (default: current directory name)"
|
||||
)
|
||||
build_parser.add_argument(
|
||||
"--docs", type=str, default=".", help="Documents directory (default: current directory)"
|
||||
)
|
||||
@@ -201,6 +203,63 @@ Examples:
|
||||
with open(global_registry, "w") as f:
|
||||
json.dump(projects, f, indent=2)
|
||||
|
||||
def _read_gitignore_patterns(self, docs_dir: str) -> list[str]:
    """Collect exclusion patterns for indexing ``docs_dir``.

    The result always starts with two built-in patterns (``.git`` and
    ``.DS_Store``). When ``docs_dir/.gitignore`` is a regular file, every
    non-blank, non-comment line is appended after stripping surrounding
    whitespace and one leading ``/``.

    Args:
        docs_dir: Directory whose top-level ``.gitignore`` is consulted.

    Returns:
        The built-in patterns followed by the entries read from the file,
        in file order.

    Read and decode failures are printed as a warning and never raised;
    entries read before the failure are kept.
    """
    # Patterns that should always be excluded, with or without a .gitignore.
    essential_patterns = [".git", ".DS_Store"]
    patterns = list(essential_patterns)

    gitignore_path = Path(docs_dir) / ".gitignore"
    # is_file() rather than exists(): a directory named ".gitignore" is not a
    # pattern file and should be reported as "not found", not as a read error.
    if not gitignore_path.is_file():
        print("📋 No .gitignore found, using minimal exclusion patterns")
        return patterns

    try:
        # "utf-8-sig" decodes plain UTF-8 and also drops a leading byte-order
        # mark, which would otherwise be glued onto the first line and either
        # corrupt the first pattern or defeat the "#" comment check.
        with open(gitignore_path, encoding="utf-8-sig") as f:
            for raw_line in f:
                entry = raw_line.strip()
                # Skip empty lines and comments
                if not entry or entry.startswith("#"):
                    continue
                # Remove leading slash if present (make it relative)
                if entry.startswith("/"):
                    entry = entry[1:]
                # A line consisting only of "/" is empty once the slash is
                # removed; an empty pattern is meaningless, so drop it.
                if entry:
                    patterns.append(entry)
        print(f"📋 Loaded {len(patterns) - len(essential_patterns)} patterns from .gitignore")
    except (OSError, UnicodeDecodeError) as e:
        # Best effort: indexing continues with whatever was collected so far.
        print(f"Warning: Could not read .gitignore: {e}")

    return patterns
|
||||
|
||||
def _should_exclude_file(self, relative_path: Path, exclude_patterns: list[str]) -> bool:
    """Return True when ``relative_path`` is excluded by gitignore-style patterns.

    This is a simplified gitignore matcher built on ``fnmatch``:

    * A pattern without a ``/`` (``*.pyc``, ``.DS_Store``, ``__pycache__``)
      matches a file or directory name at any depth.
    * A pattern containing a ``/`` (``docs/build``) is anchored to the root
      of the tree; a leading ``**/`` lets it match at any depth again.
    * A trailing ``/`` (``build/``) or ``/**`` restricts the match to
      directories, i.e. to the ancestors of ``relative_path``.
    * A leading ``!`` re-includes a path excluded by an earlier pattern;
      the last matching pattern wins.

    Unlike git, ``*`` may also match across ``/`` here (``fnmatch``
    semantics), so ``docs/*.md`` also excludes ``docs/sub/page.md``.

    Args:
        relative_path: Path of a file relative to the indexed root. Its last
            component is treated as a file name, the others as directories.
        exclude_patterns: Patterns in the order they should be applied.

    Returns:
        True if the file should be skipped, False otherwise.
    """
    import fnmatch

    # as_posix() keeps matching "/"-based even where the OS separator is "\".
    parts = Path(relative_path).as_posix().split("/")
    depth = len(parts)

    def sub_paths(starts, last_end):
        # Every "/"-joined run of components beginning at one of `starts` and
        # ending no later than component index `last_end` (exclusive).
        return ["/".join(parts[s:e]) for s in starts for e in range(s + 1, last_end + 1)]

    # Candidate strings keyed by (floating, dir_only). Anchored patterns are
    # tried against root prefixes only; floating ones against runs starting at
    # any component. dir_only drops candidates that end on the file itself.
    candidates = {
        (False, False): sub_paths((0,), depth),
        (False, True): sub_paths((0,), depth - 1),
        (True, False): sub_paths(range(depth), depth),
        (True, True): sub_paths(range(depth), depth - 1),
    }

    excluded = False
    for raw_pattern in exclude_patterns:
        pattern = raw_pattern.strip()

        negated = pattern.startswith("!")
        if negated:
            pattern = pattern[1:]
        elif pattern.startswith(("\\!", "\\#")):
            # Backslash-escaped literal "!" / "#" at the start of a name.
            pattern = pattern[1:]

        dir_only = pattern.endswith("/")
        pattern = pattern.rstrip("/")
        # "dir/**" means "everything inside dir": same as a directory match.
        while pattern.endswith("/**") and len(pattern) > 3:
            dir_only = True
            pattern = pattern[:-3].rstrip("/")

        # A separator at the start or in the middle anchors the pattern.
        floating = "/" not in pattern
        pattern = pattern.lstrip("/")
        while pattern.startswith("**/"):
            floating = True
            pattern = pattern[3:]

        if not pattern:
            continue

        # "a/**/b" must also match "a/b" (zero intermediate directories).
        variants = (pattern, pattern.replace("/**/", "/"))
        pool = candidates[(floating, dir_only)]
        if any(fnmatch.fnmatchcase(item, variant) for item in pool for variant in variants):
            excluded = not negated

    return excluded
|
||||
|
||||
def list_indexes(self):
|
||||
print("Stored LEANN indexes:")
|
||||
|
||||
@@ -282,34 +341,49 @@ Examples:
|
||||
if custom_file_types:
|
||||
print(f"Using custom file types: {custom_file_types}")
|
||||
|
||||
# Try to use better PDF parsers first
|
||||
# Read .gitignore patterns first
|
||||
exclude_patterns = self._read_gitignore_patterns(docs_dir)
|
||||
|
||||
# Try to use better PDF parsers first, but only if PDFs are requested
|
||||
documents = []
|
||||
docs_path = Path(docs_dir)
|
||||
|
||||
for file_path in docs_path.rglob("*.pdf"):
|
||||
print(f"Processing PDF: {file_path}")
|
||||
# Check if we should process PDFs
|
||||
should_process_pdfs = custom_file_types is None or ".pdf" in custom_file_types
|
||||
|
||||
# Try PyMuPDF first (best quality)
|
||||
text = extract_pdf_text_with_pymupdf(str(file_path))
|
||||
if text is None:
|
||||
# Try pdfplumber
|
||||
text = extract_pdf_text_with_pdfplumber(str(file_path))
|
||||
if should_process_pdfs:
|
||||
for file_path in docs_path.rglob("*.pdf"):
|
||||
# Check if file matches any exclude pattern
|
||||
relative_path = file_path.relative_to(docs_path)
|
||||
if self._should_exclude_file(relative_path, exclude_patterns):
|
||||
continue
|
||||
|
||||
if text:
|
||||
# Create a simple document structure
|
||||
from llama_index.core import Document
|
||||
print(f"Processing PDF: {file_path}")
|
||||
|
||||
doc = Document(text=text, metadata={"source": str(file_path)})
|
||||
documents.append(doc)
|
||||
else:
|
||||
# Fallback to default reader
|
||||
print(f"Using default reader for {file_path}")
|
||||
default_docs = SimpleDirectoryReader(
|
||||
str(file_path.parent),
|
||||
filename_as_id=True,
|
||||
required_exts=[file_path.suffix],
|
||||
).load_data()
|
||||
documents.extend(default_docs)
|
||||
# Try PyMuPDF first (best quality)
|
||||
text = extract_pdf_text_with_pymupdf(str(file_path))
|
||||
if text is None:
|
||||
# Try pdfplumber
|
||||
text = extract_pdf_text_with_pdfplumber(str(file_path))
|
||||
|
||||
if text:
|
||||
# Create a simple document structure
|
||||
from llama_index.core import Document
|
||||
|
||||
doc = Document(text=text, metadata={"source": str(file_path)})
|
||||
documents.append(doc)
|
||||
else:
|
||||
# Fallback to default reader
|
||||
print(f"Using default reader for {file_path}")
|
||||
try:
|
||||
default_docs = SimpleDirectoryReader(
|
||||
str(file_path.parent),
|
||||
filename_as_id=True,
|
||||
required_exts=[file_path.suffix],
|
||||
).load_data()
|
||||
documents.extend(default_docs)
|
||||
except Exception as e:
|
||||
print(f"Warning: Could not process {file_path}: {e}")
|
||||
|
||||
# Load other file types with default reader
|
||||
if custom_file_types:
|
||||
@@ -380,6 +454,7 @@ Examples:
|
||||
recursive=True,
|
||||
encoding="utf-8",
|
||||
required_exts=code_extensions,
|
||||
exclude=exclude_patterns,
|
||||
).load_data(show_progress=True)
|
||||
documents.extend(other_docs)
|
||||
except ValueError as e:
|
||||
@@ -454,7 +529,13 @@ Examples:
|
||||
|
||||
async def build_index(self, args):
|
||||
docs_dir = args.docs
|
||||
index_name = args.index_name
|
||||
# Use current directory name if index_name not provided
|
||||
if args.index_name:
|
||||
index_name = args.index_name
|
||||
else:
|
||||
index_name = Path.cwd().name
|
||||
print(f"Using current directory name as index: '{index_name}'")
|
||||
|
||||
index_dir = self.indexes_dir / index_name
|
||||
index_path = self.get_index_path(index_name)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user