fix: Resolve pre-commit issues and update tests

- Fix import sorting and unused imports
- Update type annotations to use built-in types (list, dict) instead of typing.List/Dict
- Fix trailing whitespace and end-of-file issues
- Replace the Chinese fullwidth comma with a regular (ASCII) comma
- Update test_main_cli.py to test_document_rag.py
- Add backward compatibility test for main_cli_example.py
- Pass all pre-commit hooks (ruff, ruff-format, etc.)
This commit is contained in:
Andy Lee
2025-07-29 10:19:05 -07:00
parent 4e3bcda5fa
commit 3cde4fc7b3
10 changed files with 52 additions and 56 deletions

View File

@@ -61,4 +61,4 @@ This document ensures that the new unified interface maintains exact parameter c
5. **Special Cases**:
- WeChat uses a specific Chinese embedding model
- Email reader includes HTML processing option
- Email reader includes HTML processing option

View File

@@ -4,14 +4,12 @@ Provides common parameters and functionality for all RAG examples.
"""
import argparse
import asyncio
import os
from pathlib import Path
from typing import Optional, List, Dict, Any
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Any
import dotenv
from leann.api import LeannBuilder, LeannSearcher, LeannChat
from leann.api import LeannBuilder, LeannChat
from llama_index.core.node_parser import SentenceSplitter
dotenv.load_dotenv()
@@ -129,11 +127,11 @@ class BaseRAGExample(ABC):
pass
@abstractmethod
async def load_data(self, args) -> List[str]:
async def load_data(self, args) -> list[str]:
"""Load data from the source. Returns list of text chunks."""
pass
def get_llm_config(self, args) -> Dict[str, Any]:
def get_llm_config(self, args) -> dict[str, Any]:
"""Get LLM configuration based on arguments."""
config = {"type": args.llm}
@@ -147,7 +145,7 @@ class BaseRAGExample(ABC):
return config
async def build_index(self, args, texts: List[str]) -> str:
async def build_index(self, args, texts: list[str]) -> str:
"""Build LEANN index from texts."""
index_path = str(Path(args.index_dir) / f"{self.default_index_name}.leann")
@@ -256,7 +254,7 @@ class BaseRAGExample(ABC):
await self.run_interactive_chat(args, index_path)
def create_text_chunks(documents, chunk_size=256, chunk_overlap=25) -> List[str]:
def create_text_chunks(documents, chunk_size=256, chunk_overlap=25) -> list[str]:
"""Helper function to create text chunks from documents."""
node_parser = SentenceSplitter(
chunk_size=chunk_size,

View File

@@ -6,7 +6,6 @@ Supports Chrome browser history.
import os
import sys
from pathlib import Path
from typing import List
# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent))
@@ -52,7 +51,7 @@ class BrowserRAG(BaseRAGExample):
else:
raise ValueError(f"Unsupported platform: {sys.platform}")
def _find_chrome_profiles(self) -> List[Path]:
def _find_chrome_profiles(self) -> list[Path]:
"""Auto-detect all Chrome profiles."""
base_path = self._get_chrome_base_path()
if not base_path.exists():
@@ -73,7 +72,7 @@ class BrowserRAG(BaseRAGExample):
return profiles
async def load_data(self, args) -> List[str]:
async def load_data(self, args) -> list[str]:
"""Load browser history and convert to text chunks."""
# Determine Chrome profiles
if args.chrome_profile and not args.auto_find_profiles:

View File

@@ -5,7 +5,6 @@ Supports PDF, TXT, MD, and other document formats.
import sys
from pathlib import Path
from typing import List
# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent))
@@ -16,52 +15,46 @@ from llama_index.core import SimpleDirectoryReader
class DocumentRAG(BaseRAGExample):
"""RAG example for document processing (PDF, TXT, MD, etc.)."""
def __init__(self):
super().__init__(
name="Document",
description="Process and query documents (PDF, TXT, MD, etc.) with LEANN",
default_index_name="test_doc_files" # Match original main_cli_example.py default
default_index_name="test_doc_files", # Match original main_cli_example.py default
)
def _add_specific_arguments(self, parser):
"""Add document-specific arguments."""
doc_group = parser.add_argument_group('Document Parameters')
doc_group = parser.add_argument_group("Document Parameters")
doc_group.add_argument(
"--data-dir",
type=str,
default="examples/data",
help="Directory containing documents to index (default: examples/data)"
help="Directory containing documents to index (default: examples/data)",
)
doc_group.add_argument(
"--file-types",
nargs="+",
default=[".pdf", ".txt", ".md"],
help="File types to process (default: .pdf .txt .md)"
help="File types to process (default: .pdf .txt .md)",
)
doc_group.add_argument(
"--chunk-size",
type=int,
default=256,
help="Text chunk size (default: 256)"
"--chunk-size", type=int, default=256, help="Text chunk size (default: 256)"
)
doc_group.add_argument(
"--chunk-overlap",
type=int,
default=128,
help="Text chunk overlap (default: 128)"
"--chunk-overlap", type=int, default=128, help="Text chunk overlap (default: 128)"
)
async def load_data(self, args) -> List[str]:
async def load_data(self, args) -> list[str]:
"""Load documents and convert to text chunks."""
print(f"Loading documents from: {args.data_dir}")
print(f"File types: {args.file_types}")
# Check if data directory exists
data_path = Path(args.data_dir)
if not data_path.exists():
raise ValueError(f"Data directory not found: {args.data_dir}")
# Load documents
documents = SimpleDirectoryReader(
args.data_dir,
@@ -69,31 +62,29 @@ class DocumentRAG(BaseRAGExample):
encoding="utf-8",
required_exts=args.file_types,
).load_data(show_progress=True)
if not documents:
print(f"No documents found in {args.data_dir} with extensions {args.file_types}")
return []
print(f"Loaded {len(documents)} documents")
# Convert to text chunks
all_texts = create_text_chunks(
documents,
chunk_size=args.chunk_size,
chunk_overlap=args.chunk_overlap
documents, chunk_size=args.chunk_size, chunk_overlap=args.chunk_overlap
)
# Apply max_items limit if specified
if args.max_items > 0 and len(all_texts) > args.max_items:
print(f"Limiting to {args.max_items} chunks (from {len(all_texts)})")
all_texts = all_texts[:args.max_items]
all_texts = all_texts[: args.max_items]
return all_texts
if __name__ == "__main__":
import asyncio
# Example queries for document RAG
print("\n📄 Document RAG Example")
print("=" * 50)
@@ -102,6 +93,6 @@ if __name__ == "__main__":
print("- 'Summarize the key findings in these papers'")
print("- 'What is the storage reduction achieved by LEANN?'")
print("\nOr run without --query for interactive mode\n")
rag = DocumentRAG()
asyncio.run(rag.run())
asyncio.run(rag.run())

View File

@@ -3,10 +3,8 @@ Email RAG example using the unified interface.
Supports Apple Mail on macOS.
"""
import os
import sys
from pathlib import Path
from typing import List
# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent))
@@ -39,7 +37,7 @@ class EmailRAG(BaseRAGExample):
"--include-html", action="store_true", help="Include HTML content in email processing"
)
def _find_mail_directories(self) -> List[Path]:
def _find_mail_directories(self) -> list[Path]:
"""Auto-detect all Apple Mail directories."""
mail_base = Path.home() / "Library" / "Mail"
if not mail_base.exists():
@@ -53,7 +51,7 @@ class EmailRAG(BaseRAGExample):
return messages_dirs
async def load_data(self, args) -> List[str]:
async def load_data(self, args) -> list[str]:
"""Load emails and convert to text chunks."""
# Determine mail directories
if args.mail_path:

View File

@@ -5,7 +5,6 @@ This file is kept for backward compatibility.
"""
import sys
import os
print("=" * 70)
print("NOTICE: This script has been replaced!")

View File

@@ -6,7 +6,6 @@ Supports WeChat chat history export and search.
import subprocess
import sys
from pathlib import Path
from typing import List
# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent))
@@ -84,7 +83,7 @@ class WeChatRAG(BaseRAGExample):
print(f"Export error: {e}")
return False
async def load_data(self, args) -> List[str]:
async def load_data(self, args) -> list[str]:
"""Load WeChat history and convert to text chunks."""
export_path = Path(args.export_dir)
@@ -145,7 +144,7 @@ if __name__ == "__main__":
print("\nExample queries you can try:")
print("- 'Show me conversations about travel plans'")
print("- 'Find group chats about weekend activities'")
print("- '我想买魔术师约翰逊的球衣给我一些对应聊天记录?'")
print("- '我想买魔术师约翰逊的球衣,给我一些对应聊天记录?'")
print("- 'What did we discuss about the project last month?'")
print("\nNote: WeChat must be running for export to work\n")