Compare commits
1 Commits
mcp-fix
...
fix/claude
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
fe942329d6 |
@@ -31,6 +31,8 @@ dependencies = [
|
|||||||
"PyPDF2>=3.0.0",
|
"PyPDF2>=3.0.0",
|
||||||
"pymupdf>=1.23.0",
|
"pymupdf>=1.23.0",
|
||||||
"pdfplumber>=0.10.0",
|
"pdfplumber>=0.10.0",
|
||||||
|
"nbconvert>=7.0.0", # For .ipynb file support
|
||||||
|
"gitignore-parser>=0.1.12", # For proper .gitignore handling
|
||||||
"mlx>=0.26.3; sys_platform == 'darwin'",
|
"mlx>=0.26.3; sys_platform == 'darwin'",
|
||||||
"mlx-lm>=0.26.0; sys_platform == 'darwin'",
|
"mlx-lm>=0.26.0; sys_platform == 'darwin'",
|
||||||
]
|
]
|
||||||
|
|||||||
@@ -203,62 +203,36 @@ Examples:
|
|||||||
with open(global_registry, "w") as f:
|
with open(global_registry, "w") as f:
|
||||||
json.dump(projects, f, indent=2)
|
json.dump(projects, f, indent=2)
|
||||||
|
|
||||||
def _read_gitignore_patterns(self, docs_dir: str) -> list[str]:
|
def _build_gitignore_parser(self, docs_dir: str):
|
||||||
"""Read .gitignore file and return patterns for exclusion."""
|
"""Build gitignore parser using gitignore-parser library."""
|
||||||
gitignore_path = Path(docs_dir) / ".gitignore"
|
from gitignore_parser import parse_gitignore
|
||||||
patterns = []
|
|
||||||
|
|
||||||
# Add some essential patterns that should always be excluded
|
# Try to parse the root .gitignore
|
||||||
essential_patterns = [
|
gitignore_path = Path(docs_dir) / ".gitignore"
|
||||||
".git",
|
|
||||||
".DS_Store",
|
|
||||||
]
|
|
||||||
patterns.extend(essential_patterns)
|
|
||||||
|
|
||||||
if gitignore_path.exists():
|
if gitignore_path.exists():
|
||||||
try:
|
try:
|
||||||
with open(gitignore_path, encoding="utf-8") as f:
|
# NOTE: parse_gitignore reads only this root .gitignore file; its rules apply to paths in subdirectories, but nested .gitignore files are not loaded
|
||||||
for line in f:
|
matches = parse_gitignore(str(gitignore_path))
|
||||||
line = line.strip()
|
print(f"📋 Loaded .gitignore from {docs_dir} (includes all subdirectories)")
|
||||||
# Skip empty lines and comments
|
return matches
|
||||||
if line and not line.startswith("#"):
|
|
||||||
# Remove leading slash if present (make it relative)
|
|
||||||
if line.startswith("/"):
|
|
||||||
line = line[1:]
|
|
||||||
patterns.append(line)
|
|
||||||
print(
|
|
||||||
f"📋 Loaded {len(patterns) - len(essential_patterns)} patterns from .gitignore"
|
|
||||||
)
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Warning: Could not read .gitignore: {e}")
|
print(f"Warning: Could not parse .gitignore: {e}")
|
||||||
else:
|
else:
|
||||||
print("📋 No .gitignore found, using minimal exclusion patterns")
|
print("📋 No .gitignore found")
|
||||||
|
|
||||||
return patterns
|
# Fallback: basic pattern matching for essential files
|
||||||
|
essential_patterns = {".git", ".DS_Store", "__pycache__", "node_modules", ".venv", "venv"}
|
||||||
|
|
||||||
def _should_exclude_file(self, relative_path: Path, exclude_patterns: list[str]) -> bool:
|
def basic_matches(file_path):
|
||||||
"""Check if a file should be excluded based on gitignore-style patterns."""
|
path_parts = Path(file_path).parts
|
||||||
path_str = str(relative_path)
|
return any(part in essential_patterns for part in path_parts)
|
||||||
|
|
||||||
for pattern in exclude_patterns:
|
return basic_matches
|
||||||
# Simple pattern matching (could be enhanced with full gitignore syntax)
|
|
||||||
if pattern.endswith("*"):
|
|
||||||
# Wildcard pattern
|
|
||||||
prefix = pattern[:-1]
|
|
||||||
if path_str.startswith(prefix):
|
|
||||||
return True
|
|
||||||
elif "*" in pattern:
|
|
||||||
# Contains wildcard - simple glob-like matching
|
|
||||||
import fnmatch
|
|
||||||
|
|
||||||
if fnmatch.fnmatch(path_str, pattern):
|
def _should_exclude_file(self, relative_path: Path, gitignore_matches) -> bool:
|
||||||
return True
|
"""Check if a file should be excluded using gitignore parser."""
|
||||||
else:
|
return gitignore_matches(str(relative_path))
|
||||||
# Exact match or directory match
|
|
||||||
if path_str == pattern or path_str.startswith(pattern + "/"):
|
|
||||||
return True
|
|
||||||
|
|
||||||
return False
|
|
||||||
|
|
||||||
def list_indexes(self):
|
def list_indexes(self):
|
||||||
print("Stored LEANN indexes:")
|
print("Stored LEANN indexes:")
|
||||||
@@ -341,8 +315,8 @@ Examples:
|
|||||||
if custom_file_types:
|
if custom_file_types:
|
||||||
print(f"Using custom file types: {custom_file_types}")
|
print(f"Using custom file types: {custom_file_types}")
|
||||||
|
|
||||||
# Read .gitignore patterns first
|
# Build gitignore parser
|
||||||
exclude_patterns = self._read_gitignore_patterns(docs_dir)
|
gitignore_matches = self._build_gitignore_parser(docs_dir)
|
||||||
|
|
||||||
# Try to use better PDF parsers first, but only if PDFs are requested
|
# Try to use better PDF parsers first, but only if PDFs are requested
|
||||||
documents = []
|
documents = []
|
||||||
@@ -355,7 +329,7 @@ Examples:
|
|||||||
for file_path in docs_path.rglob("*.pdf"):
|
for file_path in docs_path.rglob("*.pdf"):
|
||||||
# Check if file matches any exclude pattern
|
# Check if file matches any exclude pattern
|
||||||
relative_path = file_path.relative_to(docs_path)
|
relative_path = file_path.relative_to(docs_path)
|
||||||
if self._should_exclude_file(relative_path, exclude_patterns):
|
if self._should_exclude_file(relative_path, gitignore_matches):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
print(f"Processing PDF: {file_path}")
|
print(f"Processing PDF: {file_path}")
|
||||||
@@ -449,14 +423,34 @@ Examples:
|
|||||||
]
|
]
|
||||||
# Try to load other file types, but don't fail if none are found
|
# Try to load other file types, but don't fail if none are found
|
||||||
try:
|
try:
|
||||||
|
# Create a custom file filter function using the gitignore matcher (gitignore_matches)
|
||||||
|
def file_filter(file_path: str) -> bool:
|
||||||
|
"""Return True if file should be included (not excluded)"""
|
||||||
|
try:
|
||||||
|
docs_path_obj = Path(docs_dir)
|
||||||
|
file_path_obj = Path(file_path)
|
||||||
|
relative_path = file_path_obj.relative_to(docs_path_obj)
|
||||||
|
return not self._should_exclude_file(relative_path, gitignore_matches)
|
||||||
|
except (ValueError, OSError):
|
||||||
|
return True # Include files that can't be processed
|
||||||
|
|
||||||
other_docs = SimpleDirectoryReader(
|
other_docs = SimpleDirectoryReader(
|
||||||
docs_dir,
|
docs_dir,
|
||||||
recursive=True,
|
recursive=True,
|
||||||
encoding="utf-8",
|
encoding="utf-8",
|
||||||
required_exts=code_extensions,
|
required_exts=code_extensions,
|
||||||
exclude=exclude_patterns,
|
file_extractor={}, # Use default extractors
|
||||||
|
filename_as_id=True,
|
||||||
).load_data(show_progress=True)
|
).load_data(show_progress=True)
|
||||||
documents.extend(other_docs)
|
|
||||||
|
# Filter documents after loading based on gitignore rules
|
||||||
|
filtered_docs = []
|
||||||
|
for doc in other_docs:
|
||||||
|
file_path = doc.metadata.get("file_path", "")
|
||||||
|
if file_filter(file_path):
|
||||||
|
filtered_docs.append(doc)
|
||||||
|
|
||||||
|
documents.extend(filtered_docs)
|
||||||
except ValueError as e:
|
except ValueError as e:
|
||||||
if "No files found" in str(e):
|
if "No files found" in str(e):
|
||||||
print("No additional files found for other supported types.")
|
print("No additional files found for other supported types.")
|
||||||
|
|||||||
@@ -32,7 +32,7 @@ dependencies = [
|
|||||||
"pypdfium2>=4.30.0",
|
"pypdfium2>=4.30.0",
|
||||||
# LlamaIndex core and readers - updated versions
|
# LlamaIndex core and readers - updated versions
|
||||||
"llama-index>=0.12.44",
|
"llama-index>=0.12.44",
|
||||||
"llama-index-readers-file>=0.4.0", # Essential for PDF parsing
|
"llama-index-readers-file>=0.4.0", # Essential for PDF parsing
|
||||||
# "llama-index-readers-docling", # Requires Python >= 3.10
|
# "llama-index-readers-docling", # Requires Python >= 3.10
|
||||||
# "llama-index-node-parser-docling", # Requires Python >= 3.10
|
# "llama-index-node-parser-docling", # Requires Python >= 3.10
|
||||||
"llama-index-vector-stores-faiss>=0.4.0",
|
"llama-index-vector-stores-faiss>=0.4.0",
|
||||||
@@ -43,6 +43,9 @@ dependencies = [
|
|||||||
"mlx>=0.26.3; sys_platform == 'darwin'",
|
"mlx>=0.26.3; sys_platform == 'darwin'",
|
||||||
"mlx-lm>=0.26.0; sys_platform == 'darwin'",
|
"mlx-lm>=0.26.0; sys_platform == 'darwin'",
|
||||||
"psutil>=5.8.0",
|
"psutil>=5.8.0",
|
||||||
|
"pathspec>=0.12.1",
|
||||||
|
"nbconvert>=7.16.6",
|
||||||
|
"gitignore-parser>=0.1.12",
|
||||||
]
|
]
|
||||||
|
|
||||||
[project.optional-dependencies]
|
[project.optional-dependencies]
|
||||||
|
|||||||
Reference in New Issue
Block a user