- Create BaseRAGExample base class for all RAG examples
- Refactor 4 examples to use unified interface:
  - document_rag.py (replaces main_cli_example.py)
  - email_rag.py (replaces mail_reader_leann.py)
  - browser_rag.py (replaces google_history_reader_leann.py)
  - wechat_rag.py (replaces wechat_history_reader_leann.py)
- Maintain 100% parameter compatibility with original files
- Add interactive mode support for all examples
- Unify parameter names (--max-items replaces --max-emails/--max-entries)
- Update README.md with new examples usage
- Add PARAMETER_CONSISTENCY.md documenting all parameter mappings
- Keep main_cli_example.py for backward compatibility with migration notice

All default values, LeannBuilder parameters, and chunking settings remain identical to ensure full compatibility with existing indexes.
107 lines
3.5 KiB
Python
107 lines
3.5 KiB
Python
"""
|
|
Document RAG example using the unified interface.
|
|
Supports PDF, TXT, MD, and other document formats.
|
|
"""
|
|
|
|
import sys
|
|
from pathlib import Path
|
|
from typing import List
|
|
|
|
# Add parent directory to path for imports
|
|
sys.path.insert(0, str(Path(__file__).parent))
|
|
|
|
from base_rag_example import BaseRAGExample, create_text_chunks
|
|
from llama_index.core import SimpleDirectoryReader
|
|
|
|
|
|
class DocumentRAG(BaseRAGExample):
    """RAG example for document processing (PDF, TXT, MD, etc.).

    Extends ``BaseRAGExample`` with document-specific command-line options
    and a ``load_data`` implementation that reads files from a directory and
    turns them into text chunks.
    """

    def __init__(self):
        """Set the example's display name, description and default index name."""
        super().__init__(
            name="Document",
            description="Process and query documents (PDF, TXT, MD, etc.) with LEANN",
            default_index_name="test_doc_files",  # Match original main_cli_example.py default
        )

    def _add_specific_arguments(self, parser):
        """Add document-specific arguments.

        Registers ``--data-dir``, ``--file-types``, ``--chunk-size`` and
        ``--chunk-overlap`` in a dedicated "Document Parameters" group.

        Args:
            parser: The argparse parser (or compatible object) to extend.
        """
        doc_group = parser.add_argument_group("Document Parameters")
        doc_group.add_argument(
            "--data-dir",
            type=str,
            default="examples/data",
            help="Directory containing documents to index (default: examples/data)",
        )
        doc_group.add_argument(
            "--file-types",
            nargs="+",
            default=[".pdf", ".txt", ".md"],
            help="File types to process (default: .pdf .txt .md)",
        )
        doc_group.add_argument(
            "--chunk-size",
            type=int,
            default=256,
            help="Text chunk size (default: 256)",
        )
        doc_group.add_argument(
            "--chunk-overlap",
            type=int,
            default=128,
            help="Text chunk overlap (default: 128)",
        )

    @staticmethod
    def _normalize_extensions(file_types):
        """Return ``file_types`` with a leading dot on every entry.

        ``pdf`` becomes ``.pdf`` so that users may pass extensions with or
        without the dot; entries that already start with a dot are kept
        as-is. ``None`` is passed through unchanged.
        """
        if file_types is None:
            return None
        return [ext if ext.startswith(".") else f".{ext}" for ext in file_types]

    async def load_data(self, args) -> List[str]:
        """Load documents and convert to text chunks.

        Args:
            args: Parsed command-line arguments. Reads ``data_dir``,
                ``file_types``, ``chunk_size``, ``chunk_overlap`` and
                ``max_items``.

        Returns:
            The text chunks produced by ``create_text_chunks``, truncated to
            ``args.max_items`` entries when that value is positive. An empty
            list when the reader yields no documents.

        Raises:
            ValueError: If ``args.data_dir`` is not an existing directory.
        """
        print(f"Loading documents from: {args.data_dir}")
        print(f"File types: {args.file_types}")

        # The reader needs a directory; reject missing paths and plain files
        # up front so the error message names the offending argument.
        data_path = Path(args.data_dir)
        if not data_path.is_dir():
            raise ValueError(f"Data directory not found: {args.data_dir}")

        # Load documents
        documents = SimpleDirectoryReader(
            args.data_dir,
            recursive=True,
            encoding="utf-8",
            # Extensions are matched against file suffixes, which include the dot.
            required_exts=self._normalize_extensions(args.file_types),
        ).load_data(show_progress=True)

        if not documents:
            print(f"No documents found in {args.data_dir} with extensions {args.file_types}")
            return []

        print(f"Loaded {len(documents)} documents")

        # Convert to text chunks
        all_texts = create_text_chunks(
            documents,
            chunk_size=args.chunk_size,
            chunk_overlap=args.chunk_overlap,
        )

        # Apply max_items limit if specified (non-positive values mean "no limit")
        if args.max_items > 0 and len(all_texts) > args.max_items:
            print(f"Limiting to {args.max_items} chunks (from {len(all_texts)})")
            all_texts = all_texts[: args.max_items]

        return all_texts
|
|
|
|
|
|
if __name__ == "__main__":
    import asyncio

    # Banner with example queries for document RAG; joined with newlines so
    # the output matches printing each entry on its own line.
    banner = [
        "\n📄 Document RAG Example",
        "=" * 50,
        "\nExample queries you can try:",
        "- 'What are the main techniques LEANN uses?'",
        "- 'Summarize the key findings in these papers'",
        "- 'What is the storage reduction achieved by LEANN?'",
        "\nOr run without --query for interactive mode\n",
    ]
    print("\n".join(banner))

    # Build the example and drive its async entry point to completion.
    rag = DocumentRAG()
    asyncio.run(rag.run())