From 274bbb19eaff317178cd7c3c8118a2ebc3737cc0 Mon Sep 17 00:00:00 2001 From: Andy Lee Date: Tue, 29 Jul 2025 18:31:56 -0700 Subject: [PATCH] feat: Add chunk-size parameters and improve file type filtering - Add --chunk-size and --chunk-overlap parameters to all RAG examples - Preserve original default values for each data source: - Document: 256/128 (optimized for general documents) - Email: 256/25 (smaller overlap for email threads) - Browser: 256/128 (standard for web content) - WeChat: 192/64 (smaller chunks for chat messages) - Make --file-types optional filter instead of restriction in document_rag - Update README to clarify interactive mode and parameter usage - Fix LLM default model documentation (gpt-4o, not gpt-4o-mini) --- README.md | 40 ++++++++++++++++++++-------------------- examples/browser_rag.py | 10 +++++++++- examples/document_rag.py | 25 ++++++++++++++++--------- examples/email_rag.py | 10 +++++++++- examples/wechat_rag.py | 10 +++++++++- 5 files changed, 63 insertions(+), 32 deletions(-) diff --git a/README.md b/README.md index 1a683a9..cf67920 100755 --- a/README.md +++ b/README.md @@ -173,22 +173,22 @@ LEANN provides flexible parameters for embedding models, search strategies, and
📋 Click to expand: Common Parameters (Available in All Examples) -All RAG examples share these common parameters: +All RAG examples share these common parameters. **Interactive mode** is available in all examples - simply run without `--query` to start a continuous Q&A session where you can ask multiple questions. Type 'quit' to exit. ```bash -# Core Parameters +# Core Parameters (General preprocessing for all examples) --index-dir DIR # Directory to store the index (default: current directory) ---query "YOUR QUESTION" # Single query to run (interactive mode if omitted) ---max-items N # Max items to process (default: 1000, -1 for all) +--query "YOUR QUESTION" # Single query mode. Omit for interactive chat (type 'quit' to exit) +--max-items N # Limit data preprocessing (default: 1000 items, use -1 to process all data) --force-rebuild # Force rebuild index even if it exists # Embedding Parameters --embedding-model MODEL # e.g., facebook/contriever, text-embedding-3-small --embedding-mode MODE # sentence-transformers, openai, or mlx -# LLM Parameters ---llm TYPE # openai, ollama, or hf ---llm-model MODEL # e.g., gpt-4o, llama3.2:1b, Qwen/Qwen2.5-1.5B-Instruct +# LLM Parameters (Text generation models) +--llm TYPE # LLM backend: openai, ollama, or hf (default: openai) +--llm-model MODEL # Model name (default: gpt-4o) e.g., gpt-4o-mini, llama3.2:1b, Qwen/Qwen2.5-1.5B-Instruct # Search Parameters --top-k N # Number of results to retrieve (default: 20) @@ -198,8 +198,8 @@ All RAG examples share these common parameters: --backend-name NAME # Backend to use: hnsw or diskann (default: hnsw) --graph-degree N # Graph degree for index construction (default: 32) --build-complexity N # Build complexity for index construction (default: 64) ---no-compact # Disable compact index storage ---no-recompute # Disable embedding recomputation +--no-compact # Disable compact index storage (compact storage IS enabled to save storage by default) +--no-recompute # Disable embedding 
recomputation (recomputation IS enabled by default to save storage) ```
@@ -225,18 +225,18 @@ python ./examples/document_rag.py --query "What are the main techniques LEANN ex #### Parameters ```bash --data-dir DIR # Directory containing documents to process (default: examples/data) ---file-types .ext .ext # File extensions to process (default: .pdf .txt .md) ---chunk-size N # Size of text chunks (default: 256) ---chunk-overlap N # Overlap between chunks (default: 25) +--file-types .ext .ext # Filter by specific file types (optional - all LlamaIndex supported types if omitted) +--chunk-size N # Size of text chunks (default: 256) - larger for papers, smaller for code +--chunk-overlap N # Overlap between chunks (default: 128) ``` #### Example Commands ```bash -# Process your research papers folder -python examples/document_rag.py --data-dir "~/Documents/Papers" --file-types .pdf +# Process all documents with larger chunks for academic papers +python examples/document_rag.py --data-dir "~/Documents/Papers" --chunk-size 1024 -# Process code documentation with smaller chunks -python examples/document_rag.py --data-dir "./docs" --chunk-size 512 --file-types .md .rst +# Filter only markdown and Python files with smaller chunks +python examples/document_rag.py --data-dir "./docs" --chunk-size 256 --file-types .md .py ``` @@ -307,11 +307,11 @@ python examples/browser_rag.py --query "Tell me my browser history about machine #### Example Commands ```bash -# Search work-related browsing in your work profile -python examples/browser_rag.py --chrome-profile "~/Library/Application Support/Google/Chrome/Profile 1" +# Search academic research from your browsing history +python examples/browser_rag.py --query "arxiv papers machine learning transformer architecture" -# Interactive mode to explore your research history -python examples/browser_rag.py --query "machine learning papers arxiv" +# Track competitor analysis across work profile +python examples/browser_rag.py --chrome-profile "~/Library/Application Support/Google/Chrome/Work Profile" --max-items 
5000 ``` diff --git a/examples/browser_rag.py b/examples/browser_rag.py index 5697d49..6003bd1 100644 --- a/examples/browser_rag.py +++ b/examples/browser_rag.py @@ -39,6 +39,12 @@ class BrowserRAG(BaseRAGExample): default=True, help="Automatically find all Chrome profiles (default: True)", ) + browser_group.add_argument( + "--chunk-size", type=int, default=256, help="Text chunk size (default: 256)" + ) + browser_group.add_argument( + "--chunk-overlap", type=int, default=128, help="Text chunk overlap (default: 128)" + ) def _get_chrome_base_path(self) -> Path: """Get the base Chrome profile path based on OS.""" @@ -134,7 +140,9 @@ class BrowserRAG(BaseRAGExample): print(f"\nTotal history entries processed: {len(all_documents)}") # Convert to text chunks - all_texts = create_text_chunks(all_documents) + all_texts = create_text_chunks( + all_documents, chunk_size=args.chunk_size, chunk_overlap=args.chunk_overlap + ) return all_texts diff --git a/examples/document_rag.py b/examples/document_rag.py index 3497698..7c7ad57 100644 --- a/examples/document_rag.py +++ b/examples/document_rag.py @@ -35,8 +35,8 @@ class DocumentRAG(BaseRAGExample): doc_group.add_argument( "--file-types", nargs="+", - default=[".pdf", ".txt", ".md"], - help="File types to process (default: .pdf .txt .md)", + default=None, + help="Filter by file types (e.g., .pdf .txt .md). 
If not specified, all supported types are processed", ) doc_group.add_argument( "--chunk-size", type=int, default=256, help="Text chunk size (default: 256)" @@ -48,7 +48,10 @@ class DocumentRAG(BaseRAGExample): async def load_data(self, args) -> list[str]: """Load documents and convert to text chunks.""" print(f"Loading documents from: {args.data_dir}") - print(f"File types: {args.file_types}") + if args.file_types: + print(f"Filtering by file types: {args.file_types}") + else: + print("Processing all supported file types") # Check if data directory exists data_path = Path(args.data_dir) @@ -56,12 +59,16 @@ class DocumentRAG(BaseRAGExample): raise ValueError(f"Data directory not found: {args.data_dir}") # Load documents - documents = SimpleDirectoryReader( - args.data_dir, - recursive=True, - encoding="utf-8", - required_exts=args.file_types, - ).load_data(show_progress=True) + reader_kwargs = { + "recursive": True, + "encoding": "utf-8", + } + if args.file_types: + reader_kwargs["required_exts"] = args.file_types + + documents = SimpleDirectoryReader(args.data_dir, **reader_kwargs).load_data( + show_progress=True + ) if not documents: print(f"No documents found in {args.data_dir} with extensions {args.file_types}") diff --git a/examples/email_rag.py b/examples/email_rag.py index 10ec202..36fdc3f 100644 --- a/examples/email_rag.py +++ b/examples/email_rag.py @@ -35,6 +35,12 @@ class EmailRAG(BaseRAGExample): email_group.add_argument( "--include-html", action="store_true", help="Include HTML content in email processing" ) + email_group.add_argument( + "--chunk-size", type=int, default=256, help="Text chunk size (default: 256)" + ) + email_group.add_argument( + "--chunk-overlap", type=int, default=25, help="Text chunk overlap (default: 25)" + ) def _find_mail_directories(self) -> list[Path]: """Auto-detect all Apple Mail directories.""" @@ -113,7 +119,9 @@ class EmailRAG(BaseRAGExample): # Convert to text chunks # Email reader uses chunk_overlap=25 as in original - 
all_texts = create_text_chunks(all_documents, chunk_overlap=25) + all_texts = create_text_chunks( + all_documents, chunk_size=args.chunk_size, chunk_overlap=args.chunk_overlap + ) return all_texts diff --git a/examples/wechat_rag.py b/examples/wechat_rag.py index 590c61a..aa8b987 100644 --- a/examples/wechat_rag.py +++ b/examples/wechat_rag.py @@ -42,6 +42,12 @@ class WeChatRAG(BaseRAGExample): action="store_true", help="Force re-export of WeChat data even if exports exist", ) + wechat_group.add_argument( + "--chunk-size", type=int, default=192, help="Text chunk size (default: 192)" + ) + wechat_group.add_argument( + "--chunk-overlap", type=int, default=64, help="Text chunk overlap (default: 64)" + ) def _export_wechat_data(self, export_dir: Path) -> bool: """Export WeChat data using wechattweak-cli.""" @@ -120,7 +126,9 @@ class WeChatRAG(BaseRAGExample): print(f"Loaded {len(documents)} chat entries") # Convert to text chunks - all_texts = create_text_chunks(documents) + all_texts = create_text_chunks( + documents, chunk_size=args.chunk_size, chunk_overlap=args.chunk_overlap + ) return all_texts