"""
Document RAG example using the unified interface.
Supports PDF, TXT, MD, and other document formats.
"""

import sys
from pathlib import Path

# Put this file's own directory on sys.path so that the `base_rag_example`
# import below resolves when the script is launched directly.
sys.path.insert(0, str(Path(__file__).parent))

from base_rag_example import BaseRAGExample, create_text_chunks
from llama_index.core import SimpleDirectoryReader


def _normalize_extensions(file_types):
    """Return *file_types* with a leading dot on every entry.

    ``None`` or an empty list is returned as ``None`` (meaning "no filter").
    Entries that already start with a dot are passed through untouched, so
    ``[".pdf", "txt"]`` becomes ``[".pdf", ".txt"]``.
    """
    if not file_types:
        return None
    return [ext if ext.startswith(".") else f".{ext}" for ext in file_types]


class DocumentRAG(BaseRAGExample):
    """RAG example for document processing (PDF, TXT, MD, etc.)."""

    def __init__(self):
        """Configure the example's name, description and default index name."""
        super().__init__(
            name="Document",
            description="Process and query documents (PDF, TXT, MD, etc.) with LEANN",
            default_index_name="test_doc_files",
        )

    def _add_specific_arguments(self, parser):
        """Add document-specific arguments.

        Registers ``--data-dir``, ``--file-types``, ``--chunk-size`` and
        ``--chunk-overlap`` on *parser* under a "Document Parameters" group.
        """
        doc_group = parser.add_argument_group("Document Parameters")
        doc_group.add_argument(
            "--data-dir",
            type=str,
            default="data",
            help="Directory containing documents to index (default: data)",
        )
        doc_group.add_argument(
            "--file-types",
            nargs="+",
            default=None,
            help="Filter by file types (e.g., .pdf .txt .md). If not specified, all supported types are processed",
        )
        doc_group.add_argument(
            "--chunk-size", type=int, default=256, help="Text chunk size (default: 256)"
        )
        doc_group.add_argument(
            "--chunk-overlap", type=int, default=128, help="Text chunk overlap (default: 128)"
        )

    async def load_data(self, args) -> list[str]:
        """Load documents and convert to text chunks.

        Args:
            args: Parsed command-line namespace. Reads ``data_dir``,
                ``file_types``, ``chunk_size``, ``chunk_overlap`` and
                ``max_items`` (the latter is not registered in this class;
                presumably it comes from the base class's common arguments).

        Returns:
            The text chunks produced by ``create_text_chunks``, truncated to
            ``args.max_items`` entries when that value is positive. An empty
            list if the reader returned no documents.

        Raises:
            ValueError: If ``args.data_dir`` does not exist or is not a
                directory.
        """
        # Accept both "pdf" and ".pdf" on the command line; the reader's
        # extension filter is given dotted suffixes.
        file_types = _normalize_extensions(args.file_types)

        print(f"Loading documents from: {args.data_dir}")
        if file_types:
            print(f"Filtering by file types: {file_types}")
        else:
            print("Processing all supported file types")

        # Validate the input location up front so the user gets a clear error
        # before any reader is constructed.
        data_path = Path(args.data_dir)
        if not data_path.exists():
            raise ValueError(f"Data directory not found: {args.data_dir}")
        if not data_path.is_dir():
            raise ValueError(f"Data path is not a directory: {args.data_dir}")

        # Load documents
        reader_kwargs = {
            "recursive": True,
            "encoding": "utf-8",
        }
        if file_types:
            reader_kwargs["required_exts"] = file_types

        documents = SimpleDirectoryReader(args.data_dir, **reader_kwargs).load_data(
            show_progress=True
        )

        if not documents:
            # Only mention extensions when a filter was actually applied.
            if file_types:
                print(f"No documents found in {args.data_dir} with extensions {file_types}")
            else:
                print(f"No documents found in {args.data_dir}")
            return []

        print(f"Loaded {len(documents)} documents")

        # Convert to text chunks
        all_texts = create_text_chunks(
            documents, chunk_size=args.chunk_size, chunk_overlap=args.chunk_overlap
        )

        # Apply max_items limit if specified (non-positive values mean "no limit")
        if args.max_items > 0 and len(all_texts) > args.max_items:
            print(f"Limiting to {args.max_items} chunks (from {len(all_texts)})")
            all_texts = all_texts[: args.max_items]

        return all_texts


if __name__ == "__main__":
    import asyncio

    # Example queries for document RAG
    print("\n📄 Document RAG Example")
    print("=" * 50)
    print("\nExample queries you can try:")
    print("- 'What are the main techniques LEANN uses?'")
    print("- 'What is the technique DLPM?'")
    print("- 'Who does Elizabeth Bennet marry?'")
    print(
        "- 'What is the problem of developing pan gu model Huawei meets? (盘古大模型开发中遇到什么问题?)'"
    )
    print("\nOr run without --query for interactive mode\n")

    rag = DocumentRAG()
    asyncio.run(rag.run())