From 274bbb19eaff317178cd7c3c8118a2ebc3737cc0 Mon Sep 17 00:00:00 2001 From: Andy Lee Date: Tue, 29 Jul 2025 18:31:56 -0700 Subject: [PATCH] feat: Add chunk-size parameters and improve file type filtering - Add --chunk-size and --chunk-overlap parameters to all RAG examples - Preserve original default values for each data source: - Document: 256/128 (optimized for general documents) - Email: 256/25 (smaller overlap for email threads) - Browser: 256/128 (standard for web content) - WeChat: 192/64 (smaller chunks for chat messages) - Make --file-types optional filter instead of restriction in document_rag - Update README to clarify interactive mode and parameter usage - Fix LLM default model documentation (gpt-4o, not gpt-4o-mini) --- README.md | 40 ++++++++++++++++++++-------------------- examples/browser_rag.py | 10 +++++++++- examples/document_rag.py | 25 ++++++++++++++++--------- examples/email_rag.py | 10 +++++++++- examples/wechat_rag.py | 10 +++++++++- 5 files changed, 63 insertions(+), 32 deletions(-) diff --git a/README.md b/README.md index 1a683a9..cf67920 100755 --- a/README.md +++ b/README.md @@ -173,22 +173,22 @@ LEANN provides flexible parameters for embedding models, search strategies, and
📋 Click to expand: Common Parameters (Available in All Examples) -All RAG examples share these common parameters: +All RAG examples share these common parameters. **Interactive mode** is available in all examples - simply run without `--query` to start a continuous Q&A session where you can ask multiple questions. Type 'quit' to exit. ```bash -# Core Parameters +# Core Parameters (General preprocessing for all examples) --index-dir DIR # Directory to store the index (default: current directory) ---query "YOUR QUESTION" # Single query to run (interactive mode if omitted) ---max-items N # Max items to process (default: 1000, -1 for all) +--query "YOUR QUESTION" # Single query mode. Omit for interactive chat (type 'quit' to exit) +--max-items N # Limit data preprocessing (default: 1000 items, use -1 to process all data) --force-rebuild # Force rebuild index even if it exists # Embedding Parameters --embedding-model MODEL # e.g., facebook/contriever, text-embedding-3-small --embedding-mode MODE # sentence-transformers, openai, or mlx -# LLM Parameters ---llm TYPE # openai, ollama, or hf ---llm-model MODEL # e.g., gpt-4o, llama3.2:1b, Qwen/Qwen2.5-1.5B-Instruct +# LLM Parameters (Text generation models) +--llm TYPE # LLM backend: openai, ollama, or hf (default: openai) +--llm-model MODEL # Model name (default: gpt-4o) e.g., gpt-4o-mini, llama3.2:1b, Qwen/Qwen2.5-1.5B-Instruct # Search Parameters --top-k N # Number of results to retrieve (default: 20) @@ -198,8 +198,8 @@ All RAG examples share these common parameters: --backend-name NAME # Backend to use: hnsw or diskann (default: hnsw) --graph-degree N # Graph degree for index construction (default: 32) --build-complexity N # Build complexity for index construction (default: 64) ---no-compact # Disable compact index storage ---no-recompute # Disable embedding recomputation +--no-compact # Disable compact index storage (compact storage IS enabled to save storage by default) +--no-recompute # Disable embedding 
recomputation (recomputation IS enabled by default to save storage) ```
@@ -225,18 +225,18 @@ python ./examples/document_rag.py --query "What are the main techniques LEANN ex #### Parameters ```bash --data-dir DIR # Directory containing documents to process (default: examples/data) ---file-types .ext .ext # File extensions to process (default: .pdf .txt .md) ---chunk-size N # Size of text chunks (default: 256) ---chunk-overlap N # Overlap between chunks (default: 25) +--file-types .ext .ext # Filter by specific file types (optional - all LlamaIndex supported types if omitted) +--chunk-size N # Size of text chunks (default: 256) - larger for papers, smaller for code +--chunk-overlap N # Overlap between chunks (default: 128) ``` #### Example Commands ```bash -# Process your research papers folder -python examples/document_rag.py --data-dir "~/Documents/Papers" --file-types .pdf +# Process all documents with larger chunks for academic papers +python examples/document_rag.py --data-dir "~/Documents/Papers" --chunk-size 1024 -# Process code documentation with smaller chunks -python examples/document_rag.py --data-dir "./docs" --chunk-size 512 --file-types .md .rst +# Filter only markdown and Python files with smaller chunks +python examples/document_rag.py --data-dir "./docs" --chunk-size 256 --file-types .md .py ``` @@ -307,11 +307,11 @@ python examples/browser_rag.py --query "Tell me my browser history about machine #### Example Commands ```bash -# Search work-related browsing in your work profile -python examples/browser_rag.py --chrome-profile "~/Library/Application Support/Google/Chrome/Profile 1" +# Search academic research from your browsing history +python examples/browser_rag.py --query "arxiv papers machine learning transformer architecture" -# Interactive mode to explore your research history -python examples/browser_rag.py --query "machine learning papers arxiv" +# Track competitor analysis across work profile +python examples/browser_rag.py --chrome-profile "~/Library/Application Support/Google/Chrome/Work Profile" --max-items 
5000 ``` diff --git a/examples/browser_rag.py b/examples/browser_rag.py index 5697d49..6003bd1 100644 --- a/examples/browser_rag.py +++ b/examples/browser_rag.py @@ -39,6 +39,12 @@ class BrowserRAG(BaseRAGExample): default=True, help="Automatically find all Chrome profiles (default: True)", ) + browser_group.add_argument( + "--chunk-size", type=int, default=256, help="Text chunk size (default: 256)" + ) + browser_group.add_argument( + "--chunk-overlap", type=int, default=128, help="Text chunk overlap (default: 128)" + ) def _get_chrome_base_path(self) -> Path: """Get the base Chrome profile path based on OS.""" @@ -134,7 +140,9 @@ class BrowserRAG(BaseRAGExample): print(f"\nTotal history entries processed: {len(all_documents)}") # Convert to text chunks - all_texts = create_text_chunks(all_documents) + all_texts = create_text_chunks( + all_documents, chunk_size=args.chunk_size, chunk_overlap=args.chunk_overlap + ) return all_texts diff --git a/examples/document_rag.py b/examples/document_rag.py index 3497698..7c7ad57 100644 --- a/examples/document_rag.py +++ b/examples/document_rag.py @@ -35,8 +35,8 @@ class DocumentRAG(BaseRAGExample): doc_group.add_argument( "--file-types", nargs="+", - default=[".pdf", ".txt", ".md"], - help="File types to process (default: .pdf .txt .md)", + default=None, + help="Filter by file types (e.g., .pdf .txt .md). 
If not specified, all supported types are processed", ) doc_group.add_argument( "--chunk-size", type=int, default=256, help="Text chunk size (default: 256)" @@ -48,7 +48,10 @@ class DocumentRAG(BaseRAGExample): async def load_data(self, args) -> list[str]: """Load documents and convert to text chunks.""" print(f"Loading documents from: {args.data_dir}") - print(f"File types: {args.file_types}") + if args.file_types: + print(f"Filtering by file types: {args.file_types}") + else: + print("Processing all supported file types") # Check if data directory exists data_path = Path(args.data_dir) @@ -56,12 +59,16 @@ class DocumentRAG(BaseRAGExample): raise ValueError(f"Data directory not found: {args.data_dir}") # Load documents - documents = SimpleDirectoryReader( - args.data_dir, - recursive=True, - encoding="utf-8", - required_exts=args.file_types, - ).load_data(show_progress=True) + reader_kwargs = { + "recursive": True, + "encoding": "utf-8", + } + if args.file_types: + reader_kwargs["required_exts"] = args.file_types + + documents = SimpleDirectoryReader(args.data_dir, **reader_kwargs).load_data( + show_progress=True + ) if not documents: print(f"No documents found in {args.data_dir} with extensions {args.file_types}") diff --git a/examples/email_rag.py b/examples/email_rag.py index 10ec202..36fdc3f 100644 --- a/examples/email_rag.py +++ b/examples/email_rag.py @@ -35,6 +35,12 @@ class EmailRAG(BaseRAGExample): email_group.add_argument( "--include-html", action="store_true", help="Include HTML content in email processing" ) + email_group.add_argument( + "--chunk-size", type=int, default=256, help="Text chunk size (default: 256)" + ) + email_group.add_argument( + "--chunk-overlap", type=int, default=25, help="Text chunk overlap (default: 25)" + ) def _find_mail_directories(self) -> list[Path]: """Auto-detect all Apple Mail directories.""" @@ -113,7 +119,9 @@ class EmailRAG(BaseRAGExample): # Convert to text chunks # Email reader uses chunk_overlap=25 as in original - 
all_texts = create_text_chunks(all_documents, chunk_overlap=25) + all_texts = create_text_chunks( + all_documents, chunk_size=args.chunk_size, chunk_overlap=args.chunk_overlap + ) return all_texts diff --git a/examples/wechat_rag.py b/examples/wechat_rag.py index 590c61a..aa8b987 100644 --- a/examples/wechat_rag.py +++ b/examples/wechat_rag.py @@ -42,6 +42,12 @@ class WeChatRAG(BaseRAGExample): action="store_true", help="Force re-export of WeChat data even if exports exist", ) + wechat_group.add_argument( + "--chunk-size", type=int, default=192, help="Text chunk size (default: 192)" + ) + wechat_group.add_argument( + "--chunk-overlap", type=int, default=64, help="Text chunk overlap (default: 64)" + ) def _export_wechat_data(self, export_dir: Path) -> bool: """Export WeChat data using wechattweak-cli.""" @@ -120,7 +126,9 @@ class WeChatRAG(BaseRAGExample): print(f"Loaded {len(documents)} chat entries") # Convert to text chunks - all_texts = create_text_chunks(documents) + all_texts = create_text_chunks( + documents, chunk_size=args.chunk_size, chunk_overlap=args.chunk_overlap + ) return all_texts