From 3cde4fc7b3ecdfd094814a377e14d46e312d2b27 Mon Sep 17 00:00:00 2001
From: Andy Lee
Date: Tue, 29 Jul 2025 10:19:05 -0700
Subject: [PATCH] fix: Fix pre-commit issues and update tests

- Fix import sorting and unused imports
- Update type annotations to use built-in types (list, dict) instead of typing.List/Dict
- Fix trailing whitespace and end-of-file issues
- Fix Chinese fullwidth comma to regular comma
- Update test_main_cli.py to test_document_rag.py
- Add backward compatibility test for main_cli_example.py
- Pass all pre-commit hooks (ruff, ruff-format, etc.)
---
 README.md                                     |  2 +-
 examples/PARAMETER_CONSISTENCY.md             |  2 +-
 examples/base_rag_example.py                  | 16 +++---
 examples/browser_rag.py                       |  5 +-
 examples/document_rag.py                      | 53 ++++++++-----------
 examples/email_rag.py                         |  6 +--
 examples/main_cli_example.py                  |  1 -
 examples/wechat_rag.py                        |  5 +-
 .../leann-backend-diskann/third_party/DiskANN |  2 +-
 tests/test_document_rag.py                    | 16 +++++-
 10 files changed, 52 insertions(+), 56 deletions(-)

diff --git a/README.md b/README.md
index df7cff9..3bc8445 100755
--- a/README.md
+++ b/README.md
@@ -195,7 +195,7 @@ python ./examples/document_rag.py --query "What are the main techniques LEANN ex
 --embedding-model MODEL          # e.g., facebook/contriever, text-embedding-3-small
 --embedding-mode MODE            # sentence-transformers, openai, or mlx
 
-# LLM Parameters 
+# LLM Parameters
 --llm TYPE                       # openai, ollama, or hf
 --llm-model MODEL                # e.g., gpt-4o, llama3.2:1b
 --top-k N                        # Number of results to retrieve (default: 20)
diff --git a/examples/PARAMETER_CONSISTENCY.md b/examples/PARAMETER_CONSISTENCY.md
index 12152ee..794fe2c 100644
--- a/examples/PARAMETER_CONSISTENCY.md
+++ b/examples/PARAMETER_CONSISTENCY.md
@@ -61,4 +61,4 @@ This document ensures that the new unified interface maintains exact parameter c
 
 5. **Special Cases**:
    - WeChat uses a specific Chinese embedding model
-   - Email reader includes HTML processing option
\ No newline at end of file
+   - Email reader includes HTML processing option
diff --git a/examples/base_rag_example.py b/examples/base_rag_example.py
index 6be5717..a307193 100644
--- a/examples/base_rag_example.py
+++ b/examples/base_rag_example.py
@@ -4,14 +4,12 @@ Provides common parameters and functionality for all RAG examples.
 """
 
 import argparse
-import asyncio
-import os
-from pathlib import Path
-from typing import Optional, List, Dict, Any
 from abc import ABC, abstractmethod
+from pathlib import Path
+from typing import Any
 
 import dotenv
-from leann.api import LeannBuilder, LeannSearcher, LeannChat
+from leann.api import LeannBuilder, LeannChat
 from llama_index.core.node_parser import SentenceSplitter
 
 dotenv.load_dotenv()
@@ -129,11 +127,11 @@ class BaseRAGExample(ABC):
         pass
 
     @abstractmethod
-    async def load_data(self, args) -> List[str]:
+    async def load_data(self, args) -> list[str]:
         """Load data from the source. Returns list of text chunks."""
         pass
 
-    def get_llm_config(self, args) -> Dict[str, Any]:
+    def get_llm_config(self, args) -> dict[str, Any]:
         """Get LLM configuration based on arguments."""
         config = {"type": args.llm}
 
@@ -147,7 +145,7 @@ class BaseRAGExample(ABC):
 
         return config
 
-    async def build_index(self, args, texts: List[str]) -> str:
+    async def build_index(self, args, texts: list[str]) -> str:
         """Build LEANN index from texts."""
         index_path = str(Path(args.index_dir) / f"{self.default_index_name}.leann")
 
@@ -256,7 +254,7 @@ class BaseRAGExample(ABC):
             await self.run_interactive_chat(args, index_path)
 
 
-def create_text_chunks(documents, chunk_size=256, chunk_overlap=25) -> List[str]:
+def create_text_chunks(documents, chunk_size=256, chunk_overlap=25) -> list[str]:
     """Helper function to create text chunks from documents."""
     node_parser = SentenceSplitter(
         chunk_size=chunk_size,
diff --git a/examples/browser_rag.py b/examples/browser_rag.py
index 59d181b..fde0367 100644
--- a/examples/browser_rag.py
+++ b/examples/browser_rag.py
@@ -6,7 +6,6 @@ Supports Chrome browser history.
 import os
 import sys
 from pathlib import Path
-from typing import List
 
 # Add parent directory to path for imports
 sys.path.insert(0, str(Path(__file__).parent))
@@ -52,7 +51,7 @@ class BrowserRAG(BaseRAGExample):
         else:
             raise ValueError(f"Unsupported platform: {sys.platform}")
 
-    def _find_chrome_profiles(self) -> List[Path]:
+    def _find_chrome_profiles(self) -> list[Path]:
         """Auto-detect all Chrome profiles."""
         base_path = self._get_chrome_base_path()
         if not base_path.exists():
@@ -73,7 +72,7 @@ class BrowserRAG(BaseRAGExample):
 
         return profiles
 
-    async def load_data(self, args) -> List[str]:
+    async def load_data(self, args) -> list[str]:
         """Load browser history and convert to text chunks."""
         # Determine Chrome profiles
         if args.chrome_profile and not args.auto_find_profiles:
diff --git a/examples/document_rag.py b/examples/document_rag.py
index ea73b94..7cff9b9 100644
--- a/examples/document_rag.py
+++ b/examples/document_rag.py
@@ -5,7 +5,6 @@ Supports PDF, TXT, MD, and other document formats.
 
 import sys
 from pathlib import Path
-from typing import List
 
 # Add parent directory to path for imports
 sys.path.insert(0, str(Path(__file__).parent))
@@ -16,52 +15,46 @@ from llama_index.core import SimpleDirectoryReader
 
 class DocumentRAG(BaseRAGExample):
     """RAG example for document processing (PDF, TXT, MD, etc.)."""
-    
+
     def __init__(self):
         super().__init__(
             name="Document",
             description="Process and query documents (PDF, TXT, MD, etc.) with LEANN",
-            default_index_name="test_doc_files"  # Match original main_cli_example.py default
+            default_index_name="test_doc_files",  # Match original main_cli_example.py default
         )
-    
+
     def _add_specific_arguments(self, parser):
         """Add document-specific arguments."""
-        doc_group = parser.add_argument_group('Document Parameters')
+        doc_group = parser.add_argument_group("Document Parameters")
         doc_group.add_argument(
             "--data-dir",
             type=str,
             default="examples/data",
-            help="Directory containing documents to index (default: examples/data)"
+            help="Directory containing documents to index (default: examples/data)",
         )
         doc_group.add_argument(
             "--file-types",
             nargs="+",
             default=[".pdf", ".txt", ".md"],
-            help="File types to process (default: .pdf .txt .md)"
+            help="File types to process (default: .pdf .txt .md)",
        )
         doc_group.add_argument(
-            "--chunk-size",
-            type=int,
-            default=256,
-            help="Text chunk size (default: 256)"
+            "--chunk-size", type=int, default=256, help="Text chunk size (default: 256)"
         )
         doc_group.add_argument(
-            "--chunk-overlap",
-            type=int,
-            default=128,
-            help="Text chunk overlap (default: 128)"
+            "--chunk-overlap", type=int, default=128, help="Text chunk overlap (default: 128)"
         )
-    
-    async def load_data(self, args) -> List[str]:
+
+    async def load_data(self, args) -> list[str]:
         """Load documents and convert to text chunks."""
         print(f"Loading documents from: {args.data_dir}")
         print(f"File types: {args.file_types}")
-        
+
         # Check if data directory exists
         data_path = Path(args.data_dir)
         if not data_path.exists():
             raise ValueError(f"Data directory not found: {args.data_dir}")
-        
+
         # Load documents
         documents = SimpleDirectoryReader(
             args.data_dir,
@@ -69,31 +62,29 @@ class DocumentRAG(BaseRAGExample):
             encoding="utf-8",
             required_exts=args.file_types,
         ).load_data(show_progress=True)
-        
+
         if not documents:
             print(f"No documents found in {args.data_dir} with extensions {args.file_types}")
             return []
-        
+
         print(f"Loaded {len(documents)} documents")
-        
+
         # Convert to text chunks
         all_texts = create_text_chunks(
-            documents,
-            chunk_size=args.chunk_size,
-            chunk_overlap=args.chunk_overlap
+            documents, chunk_size=args.chunk_size, chunk_overlap=args.chunk_overlap
         )
-        
+
         # Apply max_items limit if specified
         if args.max_items > 0 and len(all_texts) > args.max_items:
             print(f"Limiting to {args.max_items} chunks (from {len(all_texts)})")
-            all_texts = all_texts[:args.max_items]
+            all_texts = all_texts[: args.max_items]
-        
+
         return all_texts
 
 
 if __name__ == "__main__":
     import asyncio
-    
+
     # Example queries for document RAG
     print("\n📄 Document RAG Example")
     print("=" * 50)
@@ -102,6 +93,6 @@ if __name__ == "__main__":
     print("- 'Summarize the key findings in these papers'")
     print("- 'What is the storage reduction achieved by LEANN?'")
     print("\nOr run without --query for interactive mode\n")
-    
+
     rag = DocumentRAG()
-    asyncio.run(rag.run())
\ No newline at end of file
+    asyncio.run(rag.run())
diff --git a/examples/email_rag.py b/examples/email_rag.py
index d29aced..450d73c 100644
--- a/examples/email_rag.py
+++ b/examples/email_rag.py
@@ -3,10 +3,8 @@ Email RAG example using the unified interface.
 Supports Apple Mail on macOS.
 """
 
-import os
 import sys
 from pathlib import Path
-from typing import List
 
 # Add parent directory to path for imports
 sys.path.insert(0, str(Path(__file__).parent))
@@ -39,7 +37,7 @@ class EmailRAG(BaseRAGExample):
             "--include-html", action="store_true", help="Include HTML content in email processing"
         )
 
-    def _find_mail_directories(self) -> List[Path]:
+    def _find_mail_directories(self) -> list[Path]:
         """Auto-detect all Apple Mail directories."""
         mail_base = Path.home() / "Library" / "Mail"
         if not mail_base.exists():
@@ -53,7 +51,7 @@ class EmailRAG(BaseRAGExample):
 
         return messages_dirs
 
-    async def load_data(self, args) -> List[str]:
+    async def load_data(self, args) -> list[str]:
         """Load emails and convert to text chunks."""
         # Determine mail directories
         if args.mail_path:
diff --git a/examples/main_cli_example.py b/examples/main_cli_example.py
index 1438c84..8bdd83e 100644
--- a/examples/main_cli_example.py
+++ b/examples/main_cli_example.py
@@ -5,7 +5,6 @@ This file is kept for backward compatibility.
 """
 
 import sys
-import os
 
 print("=" * 70)
 print("NOTICE: This script has been replaced!")
diff --git a/examples/wechat_rag.py b/examples/wechat_rag.py
index 7dad52a..b554929 100644
--- a/examples/wechat_rag.py
+++ b/examples/wechat_rag.py
@@ -6,7 +6,6 @@ Supports WeChat chat history export and search.
 import subprocess
 import sys
 from pathlib import Path
-from typing import List
 
 # Add parent directory to path for imports
 sys.path.insert(0, str(Path(__file__).parent))
@@ -84,7 +83,7 @@ class WeChatRAG(BaseRAGExample):
             print(f"Export error: {e}")
             return False
 
-    async def load_data(self, args) -> List[str]:
+    async def load_data(self, args) -> list[str]:
         """Load WeChat history and convert to text chunks."""
         export_path = Path(args.export_dir)
 
@@ -145,7 +144,7 @@ if __name__ == "__main__":
     print("\nExample queries you can try:")
     print("- 'Show me conversations about travel plans'")
     print("- 'Find group chats about weekend activities'")
-    print("- '我想买魔术师约翰逊的球衣,给我一些对应聊天记录?'")
+    print("- '我想买魔术师约翰逊的球衣,给我一些对应聊天记录?'")
     print("- 'What did we discuss about the project last month?'")
     print("\nNote: WeChat must be running for export to work\n")
 
diff --git a/packages/leann-backend-diskann/third_party/DiskANN b/packages/leann-backend-diskann/third_party/DiskANN
index af2a264..25339b0 160000
--- a/packages/leann-backend-diskann/third_party/DiskANN
+++ b/packages/leann-backend-diskann/third_party/DiskANN
@@ -1 +1 @@
-Subproject commit af2a26481e65232b57b82d96e68833cdee9f7635
+Subproject commit 25339b03413b5067c25b6092ea3e0f77ef8515c8
diff --git a/tests/test_document_rag.py b/tests/test_document_rag.py
index 30d5a9e..3ce64f4 100644
--- a/tests/test_document_rag.py
+++ b/tests/test_document_rag.py
@@ -53,7 +53,7 @@ def test_document_rag_simulated(test_data_dir):
 
     # Verify output
     output = result.stdout + result.stderr
-    assert "Leann index built at" in output or "Using existing index" in output
+    assert "Index saved to" in output or "Using existing index" in output
 
     assert "This is a simulated answer" in output
 
@@ -117,4 +117,16 @@ def test_document_rag_error_handling(test_data_dir):
 
     # Should fail with invalid LLM type
     assert result.returncode != 0
-    assert "Unknown LLM type" in result.stderr or "invalid_llm_type" in result.stderr
+    assert "invalid choice" in result.stderr or "invalid_llm_type" in result.stderr
+
+
+def test_main_cli_backward_compatibility():
+    """Test that main_cli_example.py shows migration message."""
+    cmd = [sys.executable, "examples/main_cli_example.py", "--help"]
+
+    result = subprocess.run(cmd, capture_output=True, text=True, timeout=60)
+
+    # Should exit with error code and show migration message
+    assert result.returncode != 0
+    assert "This script has been replaced" in result.stdout
+    assert "document_rag.py" in result.stdout
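
Note on the updated assertion in test_document_rag_error_handling: the expected error text changes from "Unknown LLM type" to "invalid choice", which is the message argparse itself prints when a value falls outside an argument's choices list, so the invalid --llm value is presumably rejected by the parser before any application code runs. A minimal standalone sketch of that behavior (not project code; the choices list below is taken from the README excerpt above, not from base_rag_example.py):

# Sketch only: how argparse reports a value outside `choices`.
import argparse
import sys

parser = argparse.ArgumentParser(prog="document_rag.py")
parser.add_argument("--llm", choices=["openai", "ollama", "hf"], default="openai")

try:
    parser.parse_args(["--llm", "invalid_llm_type"])
except SystemExit as exc:
    # argparse has already written a line like
    #   error: argument --llm: invalid choice: 'invalid_llm_type' (choose from 'openai', 'ollama', 'hf')
    # to stderr and requested exit code 2, which is why the test only checks
    # returncode != 0 and searches stderr for "invalid choice".
    print(f"argparse exit code: {exc.code}", file=sys.stderr)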