refactor: Unify examples interface with BaseRAGExample

- Create BaseRAGExample base class for all RAG examples
- Refactor 4 examples to use the unified interface:
  - document_rag.py (replaces main_cli_example.py)
  - email_rag.py (replaces mail_reader_leann.py)
  - browser_rag.py (replaces google_history_reader_leann.py)
  - wechat_rag.py (replaces wechat_history_reader_leann.py)
- Maintain 100% parameter compatibility with original files
- Add interactive mode support for all examples
- Unify parameter names (--max-items replaces --max-emails/--max-entries)
- Update README.md with new examples usage
- Add PARAMETER_CONSISTENCY.md documenting all parameter mappings
- Keep main_cli_example.py for backward compatibility with migration notice

All default values, LeannBuilder parameters, and chunking settings remain identical to ensure full compatibility with existing indexes.
2025-07-28 23:11:16 -07:00
parent 19bcc07814
commit 46f6f76fc3
8 changed files with 988 additions and 180 deletions
--- a/examples/document_rag.py
+++ b/examples/document_rag.py
@@ -0,0 +1,107 @@
+"""
+Document RAG example using the unified interface.
+Supports PDF, TXT, MD, and other document formats.
+"""
+
+import sys
+from pathlib import Path
+from typing import List
+
+# Add this script's own directory to sys.path so base_rag_example can be imported
+sys.path.insert(0, str(Path(__file__).parent))
+
+from base_rag_example import BaseRAGExample, create_text_chunks
+from llama_index.core import SimpleDirectoryReader
+
+
+class DocumentRAG(BaseRAGExample):
+    """RAG example for document processing (PDF, TXT, MD, etc.)."""
+    
+    def __init__(self):
+        super().__init__(
+            name="Document",
+            description="Process and query documents (PDF, TXT, MD, etc.) with LEANN",
+            default_index_name="test_doc_files"  # Match original main_cli_example.py default
+        )
+    
+    def _add_specific_arguments(self, parser):
+        """Add document-specific arguments."""
+        doc_group = parser.add_argument_group('Document Parameters')
+        doc_group.add_argument(
+            "--data-dir",
+            type=str,
+            default="examples/data",
+            help="Directory containing documents to index (default: examples/data)"
+        )
+        doc_group.add_argument(
+            "--file-types",
+            nargs="+",
+            default=[".pdf", ".txt", ".md"],
+            help="File types to process (default: .pdf .txt .md)"
+        )
+        doc_group.add_argument(
+            "--chunk-size",
+            type=int,
+            default=256,
+            help="Text chunk size (default: 256)"
+        )
+        doc_group.add_argument(
+            "--chunk-overlap",
+            type=int,
+            default=128,
+            help="Text chunk overlap (default: 128)"
+        )
+    
+    async def load_data(self, args) -> List[str]:
+        """Load documents and convert to text chunks."""
+        print(f"Loading documents from: {args.data_dir}")
+        print(f"File types: {args.file_types}")
+        
+        # Check if data directory exists
+        data_path = Path(args.data_dir)
+        if not data_path.exists():
+            raise ValueError(f"Data directory not found: {args.data_dir}")
+        
+        # Load documents
+        documents = SimpleDirectoryReader(
+            args.data_dir,
+            recursive=True,
+            encoding="utf-8",
+            required_exts=args.file_types,
+        ).load_data(show_progress=True)
+        
+        if not documents:
+            print(f"No documents found in {args.data_dir} with extensions {args.file_types}")
+            return []
+        
+        print(f"Loaded {len(documents)} documents")
+        
+        # Convert to text chunks
+        all_texts = create_text_chunks(
+            documents,
+            chunk_size=args.chunk_size,
+            chunk_overlap=args.chunk_overlap
+        )
+        
+        # Apply max_items limit if specified
+        if args.max_items > 0 and len(all_texts) > args.max_items:
+            print(f"Limiting to {args.max_items} chunks (from {len(all_texts)})")
+            all_texts = all_texts[:args.max_items]
+        
+        return all_texts
+
+
+if __name__ == "__main__":
+    import asyncio
+    
+    # Example queries for document RAG
+    print("\n📄 Document RAG Example")
+    print("=" * 50)
+    print("\nExample queries you can try:")
+    print("- 'What are the main techniques LEANN uses?'")
+    print("- 'Summarize the key findings in these papers'")
+    print("- 'What is the storage reduction achieved by LEANN?'")
+    print("\nOr run without --query for interactive mode\n")
+    
+    rag = DocumentRAG()
+    asyncio.run(rag.run())