Compare commits

..

2 Commits

Author SHA1 Message Date
Andy Lee
fe942329d6 fix: improve gitignore and Jupyter notebook support
- Add nbconvert dependency for .ipynb file support
- Replace manual gitignore parsing with gitignore-parser library
- Proper recursive .gitignore handling (all subdirectories)
- Fix compliance with Git gitignore behavior
- Simplify code and improve reliability

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-08-10 18:52:55 -07:00
yichuan520030910320
9801aa581b [Readme]update embedding model config according to reddit feedback 2025-08-09 21:33:33 -07:00
6 changed files with 3894 additions and 3537 deletions

View File

@@ -189,7 +189,7 @@ All RAG examples share these common parameters. **Interactive mode** is availabl
--force-rebuild # Force rebuild index even if it exists --force-rebuild # Force rebuild index even if it exists
# Embedding Parameters # Embedding Parameters
--embedding-model MODEL # e.g., facebook/contriever, text-embedding-3-small, nomic-embed-text, or mlx-community/multilingual-e5-base-mlx --embedding-model MODEL # e.g., facebook/contriever, text-embedding-3-small, mlx-community/Qwen3-Embedding-0.6B-8bit, or nomic-embed-text
--embedding-mode MODE # sentence-transformers, openai, mlx, or ollama --embedding-mode MODE # sentence-transformers, openai, mlx, or ollama
# LLM Parameters (Text generation models) # LLM Parameters (Text generation models)

View File

@@ -222,9 +222,15 @@ python apps/document_rag.py --query "What are the main techniques LEANN explores
3. **Use MLX on Apple Silicon** (optional optimization): 3. **Use MLX on Apple Silicon** (optional optimization):
```bash ```bash
--embedding-mode mlx --embedding-model mlx-community/multilingual-e5-base-mlx --embedding-mode mlx --embedding-model mlx-community/Qwen3-Embedding-0.6B-8bit
``` ```
Note that MLX might not be the best choice: in our tests it offered only about 1.3x acceleration compared to Hugging Face (HF), so Ollama may be a better choice for embedding generation.
4. **Use Ollama**:
```bash
--embedding-mode ollama --embedding-model nomic-embed-text
```
To discover additional embedding models in Ollama, check out https://ollama.com/search?c=embedding or read more about embedding models at https://ollama.com/blog/embedding-models. Be sure to check which model size works best for you.
### If Search Quality is Poor ### If Search Quality is Poor
1. **Increase retrieval count**: 1. **Increase retrieval count**:

View File

@@ -31,6 +31,8 @@ dependencies = [
"PyPDF2>=3.0.0", "PyPDF2>=3.0.0",
"pymupdf>=1.23.0", "pymupdf>=1.23.0",
"pdfplumber>=0.10.0", "pdfplumber>=0.10.0",
"nbconvert>=7.0.0", # For .ipynb file support
"gitignore-parser>=0.1.12", # For proper .gitignore handling
"mlx>=0.26.3; sys_platform == 'darwin'", "mlx>=0.26.3; sys_platform == 'darwin'",
"mlx-lm>=0.26.0; sys_platform == 'darwin'", "mlx-lm>=0.26.0; sys_platform == 'darwin'",
] ]

View File

@@ -203,62 +203,36 @@ Examples:
with open(global_registry, "w") as f: with open(global_registry, "w") as f:
json.dump(projects, f, indent=2) json.dump(projects, f, indent=2)
def _read_gitignore_patterns(self, docs_dir: str) -> list[str]: def _build_gitignore_parser(self, docs_dir: str):
"""Read .gitignore file and return patterns for exclusion.""" """Build gitignore parser using gitignore-parser library."""
gitignore_path = Path(docs_dir) / ".gitignore" from gitignore_parser import parse_gitignore
patterns = []
# Add some essential patterns that should always be excluded # Try to parse the root .gitignore
essential_patterns = [ gitignore_path = Path(docs_dir) / ".gitignore"
".git",
".DS_Store",
]
patterns.extend(essential_patterns)
if gitignore_path.exists(): if gitignore_path.exists():
try: try:
with open(gitignore_path, encoding="utf-8") as f: # gitignore-parser automatically handles all subdirectory .gitignore files!
for line in f: matches = parse_gitignore(str(gitignore_path))
line = line.strip() print(f"📋 Loaded .gitignore from {docs_dir} (includes all subdirectories)")
# Skip empty lines and comments return matches
if line and not line.startswith("#"):
# Remove leading slash if present (make it relative)
if line.startswith("/"):
line = line[1:]
patterns.append(line)
print(
f"📋 Loaded {len(patterns) - len(essential_patterns)} patterns from .gitignore"
)
except Exception as e: except Exception as e:
print(f"Warning: Could not read .gitignore: {e}") print(f"Warning: Could not parse .gitignore: {e}")
else: else:
print("📋 No .gitignore found, using minimal exclusion patterns") print("📋 No .gitignore found")
return patterns # Fallback: basic pattern matching for essential files
essential_patterns = {".git", ".DS_Store", "__pycache__", "node_modules", ".venv", "venv"}
def _should_exclude_file(self, relative_path: Path, exclude_patterns: list[str]) -> bool: def basic_matches(file_path):
"""Check if a file should be excluded based on gitignore-style patterns.""" path_parts = Path(file_path).parts
path_str = str(relative_path) return any(part in essential_patterns for part in path_parts)
for pattern in exclude_patterns: return basic_matches
# Simple pattern matching (could be enhanced with full gitignore syntax)
if pattern.endswith("*"):
# Wildcard pattern
prefix = pattern[:-1]
if path_str.startswith(prefix):
return True
elif "*" in pattern:
# Contains wildcard - simple glob-like matching
import fnmatch
if fnmatch.fnmatch(path_str, pattern): def _should_exclude_file(self, relative_path: Path, gitignore_matches) -> bool:
return True """Check if a file should be excluded using gitignore parser."""
else: return gitignore_matches(str(relative_path))
# Exact match or directory match
if path_str == pattern or path_str.startswith(pattern + "/"):
return True
return False
def list_indexes(self): def list_indexes(self):
print("Stored LEANN indexes:") print("Stored LEANN indexes:")
@@ -341,8 +315,8 @@ Examples:
if custom_file_types: if custom_file_types:
print(f"Using custom file types: {custom_file_types}") print(f"Using custom file types: {custom_file_types}")
# Read .gitignore patterns first # Build gitignore parser
exclude_patterns = self._read_gitignore_patterns(docs_dir) gitignore_matches = self._build_gitignore_parser(docs_dir)
# Try to use better PDF parsers first, but only if PDFs are requested # Try to use better PDF parsers first, but only if PDFs are requested
documents = [] documents = []
@@ -355,7 +329,7 @@ Examples:
for file_path in docs_path.rglob("*.pdf"): for file_path in docs_path.rglob("*.pdf"):
# Check if file matches any exclude pattern # Check if file matches any exclude pattern
relative_path = file_path.relative_to(docs_path) relative_path = file_path.relative_to(docs_path)
if self._should_exclude_file(relative_path, exclude_patterns): if self._should_exclude_file(relative_path, gitignore_matches):
continue continue
print(f"Processing PDF: {file_path}") print(f"Processing PDF: {file_path}")
@@ -449,14 +423,34 @@ Examples:
] ]
# Try to load other file types, but don't fail if none are found # Try to load other file types, but don't fail if none are found
try: try:
# Create a custom file filter function using our PathSpec
def file_filter(file_path: str) -> bool:
"""Return True if file should be included (not excluded)"""
try:
docs_path_obj = Path(docs_dir)
file_path_obj = Path(file_path)
relative_path = file_path_obj.relative_to(docs_path_obj)
return not self._should_exclude_file(relative_path, gitignore_matches)
except (ValueError, OSError):
return True # Include files that can't be processed
other_docs = SimpleDirectoryReader( other_docs = SimpleDirectoryReader(
docs_dir, docs_dir,
recursive=True, recursive=True,
encoding="utf-8", encoding="utf-8",
required_exts=code_extensions, required_exts=code_extensions,
exclude=exclude_patterns, file_extractor={}, # Use default extractors
filename_as_id=True,
).load_data(show_progress=True) ).load_data(show_progress=True)
documents.extend(other_docs)
# Filter documents after loading based on gitignore rules
filtered_docs = []
for doc in other_docs:
file_path = doc.metadata.get("file_path", "")
if file_filter(file_path):
filtered_docs.append(doc)
documents.extend(filtered_docs)
except ValueError as e: except ValueError as e:
if "No files found" in str(e): if "No files found" in str(e):
print("No additional files found for other supported types.") print("No additional files found for other supported types.")

View File

@@ -32,7 +32,7 @@ dependencies = [
"pypdfium2>=4.30.0", "pypdfium2>=4.30.0",
# LlamaIndex core and readers - updated versions # LlamaIndex core and readers - updated versions
"llama-index>=0.12.44", "llama-index>=0.12.44",
"llama-index-readers-file>=0.4.0", # Essential for PDF parsing "llama-index-readers-file>=0.4.0", # Essential for PDF parsing
# "llama-index-readers-docling", # Requires Python >= 3.10 # "llama-index-readers-docling", # Requires Python >= 3.10
# "llama-index-node-parser-docling", # Requires Python >= 3.10 # "llama-index-node-parser-docling", # Requires Python >= 3.10
"llama-index-vector-stores-faiss>=0.4.0", "llama-index-vector-stores-faiss>=0.4.0",
@@ -43,6 +43,9 @@ dependencies = [
"mlx>=0.26.3; sys_platform == 'darwin'", "mlx>=0.26.3; sys_platform == 'darwin'",
"mlx-lm>=0.26.0; sys_platform == 'darwin'", "mlx-lm>=0.26.0; sys_platform == 'darwin'",
"psutil>=5.8.0", "psutil>=5.8.0",
"pathspec>=0.12.1",
"nbconvert>=7.16.6",
"gitignore-parser>=0.1.12",
] ]
[project.optional-dependencies] [project.optional-dependencies]

7318
uv.lock generated
View File

File diff suppressed because it is too large Load Diff