From 3cde4fc7b3ecdfd094814a377e14d46e312d2b27 Mon Sep 17 00:00:00 2001
From: Andy Lee
Date: Tue, 29 Jul 2025 10:19:05 -0700
Subject: [PATCH] fix: Fix pre-commit issues and update tests

- Fix import sorting and unused imports
- Update type annotations to use built-in types (list, dict) instead of typing.List/Dict
- Fix trailing whitespace and end-of-file issues
- Fix Chinese fullwidth comma to regular comma
- Update test_main_cli.py to test_document_rag.py
- Add backward compatibility test for main_cli_example.py
- Pass all pre-commit hooks (ruff, ruff-format, etc.)
---
 README.md                                     |  2 +-
 examples/PARAMETER_CONSISTENCY.md             |  2 +-
 examples/base_rag_example.py                  | 16 +++---
 examples/browser_rag.py                       |  5 +-
 examples/document_rag.py                      | 53 ++++++++-----------
 examples/email_rag.py                         |  6 +--
 examples/main_cli_example.py                  |  1 -
 examples/wechat_rag.py                        |  5 +-
 .../leann-backend-diskann/third_party/DiskANN |  2 +-
 tests/test_document_rag.py                    | 16 +++++-
 10 files changed, 52 insertions(+), 56 deletions(-)

diff --git a/README.md b/README.md
index df7cff9..3bc8445 100755
--- a/README.md
+++ b/README.md
@@ -195,7 +195,7 @@ python ./examples/document_rag.py --query "What are the main techniques LEANN ex
 --embedding-model MODEL          # e.g., facebook/contriever, text-embedding-3-small
 --embedding-mode MODE            # sentence-transformers, openai, or mlx
 
-# LLM Parameters 
+# LLM Parameters
 --llm TYPE                       # openai, ollama, or hf
 --llm-model MODEL                # e.g., gpt-4o, llama3.2:1b
 --top-k N                        # Number of results to retrieve (default: 20)
diff --git a/examples/PARAMETER_CONSISTENCY.md b/examples/PARAMETER_CONSISTENCY.md
index 12152ee..794fe2c 100644
--- a/examples/PARAMETER_CONSISTENCY.md
+++ b/examples/PARAMETER_CONSISTENCY.md
@@ -61,4 +61,4 @@ This document ensures that the new unified interface maintains exact parameter c
 
 5. **Special Cases**:
    - WeChat uses a specific Chinese embedding model
-   - Email reader includes HTML processing option
\ No newline at end of file
+   - Email reader includes HTML processing option
diff --git a/examples/base_rag_example.py b/examples/base_rag_example.py
index 6be5717..a307193 100644
--- a/examples/base_rag_example.py
+++ b/examples/base_rag_example.py
@@ -4,14 +4,12 @@ Provides common parameters and functionality for all RAG examples.
 """
 
 import argparse
-import asyncio
-import os
-from pathlib import Path
-from typing import Optional, List, Dict, Any
 from abc import ABC, abstractmethod
+from pathlib import Path
+from typing import Any
 
 import dotenv
-from leann.api import LeannBuilder, LeannSearcher, LeannChat
+from leann.api import LeannBuilder, LeannChat
 from llama_index.core.node_parser import SentenceSplitter
 
 dotenv.load_dotenv()
@@ -129,11 +127,11 @@ class BaseRAGExample(ABC):
         pass
 
     @abstractmethod
-    async def load_data(self, args) -> List[str]:
+    async def load_data(self, args) -> list[str]:
         """Load data from the source. Returns list of text chunks."""
         pass
 
-    def get_llm_config(self, args) -> Dict[str, Any]:
+    def get_llm_config(self, args) -> dict[str, Any]:
         """Get LLM configuration based on arguments."""
         config = {"type": args.llm}
 
@@ -147,7 +145,7 @@ class BaseRAGExample(ABC):
 
         return config
 
-    async def build_index(self, args, texts: List[str]) -> str:
+    async def build_index(self, args, texts: list[str]) -> str:
         """Build LEANN index from texts."""
         index_path = str(Path(args.index_dir) / f"{self.default_index_name}.leann")
 
@@ -256,7 +254,7 @@ class BaseRAGExample(ABC):
             await self.run_interactive_chat(args, index_path)
 
 
-def create_text_chunks(documents, chunk_size=256, chunk_overlap=25) -> List[str]:
+def create_text_chunks(documents, chunk_size=256, chunk_overlap=25) -> list[str]:
     """Helper function to create text chunks from documents."""
     node_parser = SentenceSplitter(
         chunk_size=chunk_size,
diff --git a/examples/browser_rag.py b/examples/browser_rag.py
index 59d181b..fde0367 100644
--- a/examples/browser_rag.py
+++ b/examples/browser_rag.py
@@ -6,7 +6,6 @@ Supports Chrome browser history.
 import os
 import sys
 from pathlib import Path
-from typing import List
 
 # Add parent directory to path for imports
 sys.path.insert(0, str(Path(__file__).parent))
@@ -52,7 +51,7 @@ class BrowserRAG(BaseRAGExample):
         else:
             raise ValueError(f"Unsupported platform: {sys.platform}")
 
-    def _find_chrome_profiles(self) -> List[Path]:
+    def _find_chrome_profiles(self) -> list[Path]:
         """Auto-detect all Chrome profiles."""
         base_path = self._get_chrome_base_path()
         if not base_path.exists():
@@ -73,7 +72,7 @@ class BrowserRAG(BaseRAGExample):
 
         return profiles
 
-    async def load_data(self, args) -> List[str]:
+    async def load_data(self, args) -> list[str]:
         """Load browser history and convert to text chunks."""
         # Determine Chrome profiles
         if args.chrome_profile and not args.auto_find_profiles:
diff --git a/examples/document_rag.py b/examples/document_rag.py
index ea73b94..7cff9b9 100644
--- a/examples/document_rag.py
+++ b/examples/document_rag.py
@@ -5,7 +5,6 @@ Supports PDF, TXT, MD, and other document formats.
 
 import sys
 from pathlib import Path
-from typing import List
 
 # Add parent directory to path for imports
 sys.path.insert(0, str(Path(__file__).parent))
@@ -16,52 +15,46 @@ from llama_index.core import SimpleDirectoryReader
 
 class DocumentRAG(BaseRAGExample):
     """RAG example for document processing (PDF, TXT, MD, etc.)."""
-    
+
     def __init__(self):
         super().__init__(
             name="Document",
             description="Process and query documents (PDF, TXT, MD, etc.) with LEANN",
-            default_index_name="test_doc_files"  # Match original main_cli_example.py default
+            default_index_name="test_doc_files",  # Match original main_cli_example.py default
         )
-    
+
     def _add_specific_arguments(self, parser):
         """Add document-specific arguments."""
-        doc_group = parser.add_argument_group('Document Parameters')
+        doc_group = parser.add_argument_group("Document Parameters")
         doc_group.add_argument(
             "--data-dir",
             type=str,
             default="examples/data",
-            help="Directory containing documents to index (default: examples/data)"
+            help="Directory containing documents to index (default: examples/data)",
         )
         doc_group.add_argument(
             "--file-types",
             nargs="+",
             default=[".pdf", ".txt", ".md"],
-            help="File types to process (default: .pdf .txt .md)"
+            help="File types to process (default: .pdf .txt .md)",
        )
         doc_group.add_argument(
-            "--chunk-size",
-            type=int,
-            default=256,
-            help="Text chunk size (default: 256)"
+            "--chunk-size", type=int, default=256, help="Text chunk size (default: 256)"
         )
         doc_group.add_argument(
-            "--chunk-overlap",
-            type=int,
-            default=128,
-            help="Text chunk overlap (default: 128)"
+            "--chunk-overlap", type=int, default=128, help="Text chunk overlap (default: 128)"
         )
-    
-    async def load_data(self, args) -> List[str]:
+
+    async def load_data(self, args) -> list[str]:
         """Load documents and convert to text chunks."""
         print(f"Loading documents from: {args.data_dir}")
         print(f"File types: {args.file_types}")
-        
+
         # Check if data directory exists
         data_path = Path(args.data_dir)
         if not data_path.exists():
             raise ValueError(f"Data directory not found: {args.data_dir}")
-        
+
         # Load documents
         documents = SimpleDirectoryReader(
             args.data_dir,
@@ -69,31 +62,29 @@ class DocumentRAG(BaseRAGExample):
             encoding="utf-8",
             required_exts=args.file_types,
         ).load_data(show_progress=True)
-        
+
         if not documents:
             print(f"No documents found in {args.data_dir} with extensions {args.file_types}")
             return []
-        
+
         print(f"Loaded {len(documents)} documents")
-        
+
         # Convert to text chunks
         all_texts = create_text_chunks(
-            documents,
-            chunk_size=args.chunk_size,
-            chunk_overlap=args.chunk_overlap
+            documents, chunk_size=args.chunk_size, chunk_overlap=args.chunk_overlap
         )
-        
+
         # Apply max_items limit if specified
         if args.max_items > 0 and len(all_texts) > args.max_items:
             print(f"Limiting to {args.max_items} chunks (from {len(all_texts)})")
-            all_texts = all_texts[:args.max_items]
+            all_texts = all_texts[: args.max_items]
-        
+
         return all_texts
 
 
 if __name__ == "__main__":
     import asyncio
-    
+
     # Example queries for document RAG
     print("\n📄 Document RAG Example")
     print("=" * 50)
@@ -102,6 +93,6 @@ if __name__ == "__main__":
     print("- 'Summarize the key findings in these papers'")
     print("- 'What is the storage reduction achieved by LEANN?'")
     print("\nOr run without --query for interactive mode\n")
-    
+
     rag = DocumentRAG()
-    asyncio.run(rag.run())
\ No newline at end of file
+    asyncio.run(rag.run())
diff --git a/examples/email_rag.py b/examples/email_rag.py
index d29aced..450d73c 100644
--- a/examples/email_rag.py
+++ b/examples/email_rag.py
@@ -3,10 +3,8 @@ Email RAG example using the unified interface.
 Supports Apple Mail on macOS.
 """
 
-import os
 import sys
 from pathlib import Path
-from typing import List
 
 # Add parent directory to path for imports
 sys.path.insert(0, str(Path(__file__).parent))
@@ -39,7 +37,7 @@ class EmailRAG(BaseRAGExample):
             "--include-html", action="store_true", help="Include HTML content in email processing"
         )
 
-    def _find_mail_directories(self) -> List[Path]:
+    def _find_mail_directories(self) -> list[Path]:
         """Auto-detect all Apple Mail directories."""
         mail_base = Path.home() / "Library" / "Mail"
         if not mail_base.exists():
@@ -53,7 +51,7 @@ class EmailRAG(BaseRAGExample):
 
         return messages_dirs
 
-    async def load_data(self, args) -> List[str]:
+    async def load_data(self, args) -> list[str]:
         """Load emails and convert to text chunks."""
         # Determine mail directories
         if args.mail_path:
diff --git a/examples/main_cli_example.py b/examples/main_cli_example.py
index 1438c84..8bdd83e 100644
--- a/examples/main_cli_example.py
+++ b/examples/main_cli_example.py
@@ -5,7 +5,6 @@ This file is kept for backward compatibility.
 """
 
 import sys
-import os
 
 print("=" * 70)
 print("NOTICE: This script has been replaced!")
diff --git a/examples/wechat_rag.py b/examples/wechat_rag.py
index 7dad52a..b554929 100644
--- a/examples/wechat_rag.py
+++ b/examples/wechat_rag.py
@@ -6,7 +6,6 @@ Supports WeChat chat history export and search.
 import subprocess
 import sys
 from pathlib import Path
-from typing import List
 
 # Add parent directory to path for imports
 sys.path.insert(0, str(Path(__file__).parent))
@@ -84,7 +83,7 @@ class WeChatRAG(BaseRAGExample):
             print(f"Export error: {e}")
             return False
 
-    async def load_data(self, args) -> List[str]:
+    async def load_data(self, args) -> list[str]:
         """Load WeChat history and convert to text chunks."""
         export_path = Path(args.export_dir)
 
@@ -145,7 +144,7 @@ if __name__ == "__main__":
     print("\nExample queries you can try:")
     print("- 'Show me conversations about travel plans'")
     print("- 'Find group chats about weekend activities'")
-    print("- '我想买魔术师约翰逊的球衣,给我一些对应聊天记录?'")
+    print("- '我想买魔术师约翰逊的球衣,给我一些对应聊天记录?'")
     print("- 'What did we discuss about the project last month?'")
     print("\nNote: WeChat must be running for export to work\n")
 
diff --git a/packages/leann-backend-diskann/third_party/DiskANN b/packages/leann-backend-diskann/third_party/DiskANN
index af2a264..25339b0 160000
--- a/packages/leann-backend-diskann/third_party/DiskANN
+++ b/packages/leann-backend-diskann/third_party/DiskANN
@@ -1 +1 @@
-Subproject commit af2a26481e65232b57b82d96e68833cdee9f7635
+Subproject commit 25339b03413b5067c25b6092ea3e0f77ef8515c8
diff --git a/tests/test_document_rag.py b/tests/test_document_rag.py
index 30d5a9e..3ce64f4 100644
--- a/tests/test_document_rag.py
+++ b/tests/test_document_rag.py
@@ -53,7 +53,7 @@ def test_document_rag_simulated(test_data_dir):
 
     # Verify output
     output = result.stdout + result.stderr
-    assert "Leann index built at" in output or "Using existing index" in output
+    assert "Index saved to" in output or "Using existing index" in output
 
     assert "This is a simulated answer" in output
 
@@ -117,4 +117,16 @@ def test_document_rag_error_handling(test_data_dir):
 
     # Should fail with invalid LLM type
     assert result.returncode != 0
-    assert "Unknown LLM type" in result.stderr or "invalid_llm_type" in result.stderr
+    assert "invalid choice" in result.stderr or "invalid_llm_type" in result.stderr
+
+
+def test_main_cli_backward_compatibility():
+    """Test that main_cli_example.py shows migration message."""
+    cmd = [sys.executable, "examples/main_cli_example.py", "--help"]
+
+    result = subprocess.run(cmd, capture_output=True, text=True, timeout=60)
+
+    # Should exit with error code and show migration message
+    assert result.returncode != 0
+    assert "This script has been replaced" in result.stdout
+    assert "document_rag.py" in result.stdout
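
Note on the updated assertion in test_document_rag_error_handling: the expected error text changes from "Unknown LLM type" to "invalid choice", which is the message argparse itself prints when a value falls outside an argument's choices list, so the invalid --llm value is presumably rejected by the parser before any application code runs. A minimal standalone sketch of that behavior (not project code; the choices list below is taken from the README excerpt above, not from base_rag_example.py):

# Sketch only: how argparse reports a value outside `choices`.
import argparse
import sys

parser = argparse.ArgumentParser(prog="document_rag.py")
parser.add_argument("--llm", choices=["openai", "ollama", "hf"], default="openai")

try:
    parser.parse_args(["--llm", "invalid_llm_type"])
except SystemExit as exc:
    # argparse has already written a line like
    #   error: argument --llm: invalid choice: 'invalid_llm_type' (choose from 'openai', 'ollama', 'hf')
    # to stderr and requested exit code 2, which is why the test only checks
    # returncode != 0 and searches stderr for "invalid choice".
    print(f"argparse exit code: {exc.code}", file=sys.stderr)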