feat: Add chunk-size parameters and improve file type filtering
- Add --chunk-size and --chunk-overlap parameters to all RAG examples
- Preserve original default values for each data source:
  - Document: 256/128 (optimized for general documents)
  - Email: 256/25 (smaller overlap for email threads)
  - Browser: 256/128 (standard for web content)
  - WeChat: 192/64 (smaller chunks for chat messages)
- Make --file-types an optional filter instead of a restriction in document_rag
- Update README to clarify interactive mode and parameter usage
- Fix LLM default model documentation (gpt-4o, not gpt-4o-mini)
This commit is contained in:
40
README.md
40
README.md
@@ -173,22 +173,22 @@ LEANN provides flexible parameters for embedding models, search strategies, and
|
|||||||
<details>
|
<details>
|
||||||
<summary><strong>📋 Click to expand: Common Parameters (Available in All Examples)</strong></summary>
|
<summary><strong>📋 Click to expand: Common Parameters (Available in All Examples)</strong></summary>
|
||||||
|
|
||||||
All RAG examples share these common parameters:
|
All RAG examples share these common parameters. **Interactive mode** is available in all examples - simply run without `--query` to start a continuous Q&A session where you can ask multiple questions. Type 'quit' to exit.
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# Core Parameters
|
# Core Parameters (General preprocessing for all examples)
|
||||||
--index-dir DIR # Directory to store the index (default: current directory)
|
--index-dir DIR # Directory to store the index (default: current directory)
|
||||||
--query "YOUR QUESTION" # Single query to run (interactive mode if omitted)
|
--query "YOUR QUESTION" # Single query mode. Omit for interactive chat (type 'quit' to exit)
|
||||||
--max-items N # Max items to process (default: 1000, -1 for all)
|
--max-items N # Limit data preprocessing (default: 1000 items, use -1 to process all data)
|
||||||
--force-rebuild # Force rebuild index even if it exists
|
--force-rebuild # Force rebuild index even if it exists
|
||||||
|
|
||||||
# Embedding Parameters
|
# Embedding Parameters
|
||||||
--embedding-model MODEL # e.g., facebook/contriever, text-embedding-3-small
|
--embedding-model MODEL # e.g., facebook/contriever, text-embedding-3-small
|
||||||
--embedding-mode MODE # sentence-transformers, openai, or mlx
|
--embedding-mode MODE # sentence-transformers, openai, or mlx
|
||||||
|
|
||||||
# LLM Parameters
|
# LLM Parameters (Text generation models)
|
||||||
--llm TYPE # openai, ollama, or hf
|
--llm TYPE # LLM backend: openai, ollama, or hf (default: openai)
|
||||||
--llm-model MODEL # e.g., gpt-4o, llama3.2:1b, Qwen/Qwen2.5-1.5B-Instruct
|
--llm-model MODEL # Model name (default: gpt-4o) e.g., gpt-4o-mini, llama3.2:1b, Qwen/Qwen2.5-1.5B-Instruct
|
||||||
|
|
||||||
# Search Parameters
|
# Search Parameters
|
||||||
--top-k N # Number of results to retrieve (default: 20)
|
--top-k N # Number of results to retrieve (default: 20)
|
||||||
@@ -198,8 +198,8 @@ All RAG examples share these common parameters:
|
|||||||
--backend-name NAME # Backend to use: hnsw or diskann (default: hnsw)
|
--backend-name NAME # Backend to use: hnsw or diskann (default: hnsw)
|
||||||
--graph-degree N # Graph degree for index construction (default: 32)
|
--graph-degree N # Graph degree for index construction (default: 32)
|
||||||
--build-complexity N # Build complexity for index construction (default: 64)
|
--build-complexity N # Build complexity for index construction (default: 64)
|
||||||
--no-compact # Disable compact index storage
|
--no-compact # Disable compact index storage (compact storage is enabled by default to save space)
|
||||||
--no-recompute # Disable embedding recomputation
|
--no-recompute # Disable embedding recomputation (recomputation is enabled by default to save storage)
|
||||||
```
|
```
|
||||||
|
|
||||||
</details>
|
</details>
|
||||||
@@ -225,18 +225,18 @@ python ./examples/document_rag.py --query "What are the main techniques LEANN ex
|
|||||||
#### Parameters
|
#### Parameters
|
||||||
```bash
|
```bash
|
||||||
--data-dir DIR # Directory containing documents to process (default: examples/data)
|
--data-dir DIR # Directory containing documents to process (default: examples/data)
|
||||||
--file-types .ext .ext # File extensions to process (default: .pdf .txt .md)
|
--file-types .ext .ext # Filter by specific file types (optional - all LlamaIndex supported types if omitted)
|
||||||
--chunk-size N # Size of text chunks (default: 256)
|
--chunk-size N # Size of text chunks (default: 256) - larger for papers, smaller for code
|
||||||
--chunk-overlap N # Overlap between chunks (default: 25)
|
--chunk-overlap N # Overlap between chunks (default: 128)
|
||||||
```
|
```
|
||||||
|
|
||||||
#### Example Commands
|
#### Example Commands
|
||||||
```bash
|
```bash
|
||||||
# Process your research papers folder
|
# Process all documents with larger chunks for academic papers
|
||||||
python examples/document_rag.py --data-dir "~/Documents/Papers" --file-types .pdf
|
python examples/document_rag.py --data-dir "~/Documents/Papers" --chunk-size 1024
|
||||||
|
|
||||||
# Process code documentation with smaller chunks
|
# Filter only markdown and Python files with smaller chunks
|
||||||
python examples/document_rag.py --data-dir "./docs" --chunk-size 512 --file-types .md .rst
|
python examples/document_rag.py --data-dir "./docs" --chunk-size 256 --file-types .md .py
|
||||||
```
|
```
|
||||||
|
|
||||||
</details>
|
</details>
|
||||||
@@ -307,11 +307,11 @@ python examples/browser_rag.py --query "Tell me my browser history about machine
|
|||||||
|
|
||||||
#### Example Commands
|
#### Example Commands
|
||||||
```bash
|
```bash
|
||||||
# Search work-related browsing in your work profile
|
# Search academic research from your browsing history
|
||||||
python examples/browser_rag.py --chrome-profile "~/Library/Application Support/Google/Chrome/Profile 1"
|
python examples/browser_rag.py --query "arxiv papers machine learning transformer architecture"
|
||||||
|
|
||||||
# Interactive mode to explore your research history
|
# Track competitor analysis across work profile
|
||||||
python examples/browser_rag.py --query "machine learning papers arxiv"
|
python examples/browser_rag.py --chrome-profile "~/Library/Application Support/Google/Chrome/Work Profile" --max-items 5000
|
||||||
```
|
```
|
||||||
|
|
||||||
</details>
|
</details>
|
||||||
|
|||||||
@@ -39,6 +39,12 @@ class BrowserRAG(BaseRAGExample):
|
|||||||
default=True,
|
default=True,
|
||||||
help="Automatically find all Chrome profiles (default: True)",
|
help="Automatically find all Chrome profiles (default: True)",
|
||||||
)
|
)
|
||||||
|
browser_group.add_argument(
|
||||||
|
"--chunk-size", type=int, default=256, help="Text chunk size (default: 256)"
|
||||||
|
)
|
||||||
|
browser_group.add_argument(
|
||||||
|
"--chunk-overlap", type=int, default=128, help="Text chunk overlap (default: 128)"
|
||||||
|
)
|
||||||
|
|
||||||
def _get_chrome_base_path(self) -> Path:
|
def _get_chrome_base_path(self) -> Path:
|
||||||
"""Get the base Chrome profile path based on OS."""
|
"""Get the base Chrome profile path based on OS."""
|
||||||
@@ -134,7 +140,9 @@ class BrowserRAG(BaseRAGExample):
|
|||||||
print(f"\nTotal history entries processed: {len(all_documents)}")
|
print(f"\nTotal history entries processed: {len(all_documents)}")
|
||||||
|
|
||||||
# Convert to text chunks
|
# Convert to text chunks
|
||||||
all_texts = create_text_chunks(all_documents)
|
all_texts = create_text_chunks(
|
||||||
|
all_documents, chunk_size=args.chunk_size, chunk_overlap=args.chunk_overlap
|
||||||
|
)
|
||||||
|
|
||||||
return all_texts
|
return all_texts
|
||||||
|
|
||||||
|
|||||||
@@ -35,8 +35,8 @@ class DocumentRAG(BaseRAGExample):
|
|||||||
doc_group.add_argument(
|
doc_group.add_argument(
|
||||||
"--file-types",
|
"--file-types",
|
||||||
nargs="+",
|
nargs="+",
|
||||||
default=[".pdf", ".txt", ".md"],
|
default=None,
|
||||||
help="File types to process (default: .pdf .txt .md)",
|
help="Filter by file types (e.g., .pdf .txt .md). If not specified, all supported types are processed",
|
||||||
)
|
)
|
||||||
doc_group.add_argument(
|
doc_group.add_argument(
|
||||||
"--chunk-size", type=int, default=256, help="Text chunk size (default: 256)"
|
"--chunk-size", type=int, default=256, help="Text chunk size (default: 256)"
|
||||||
@@ -48,7 +48,10 @@ class DocumentRAG(BaseRAGExample):
|
|||||||
async def load_data(self, args) -> list[str]:
|
async def load_data(self, args) -> list[str]:
|
||||||
"""Load documents and convert to text chunks."""
|
"""Load documents and convert to text chunks."""
|
||||||
print(f"Loading documents from: {args.data_dir}")
|
print(f"Loading documents from: {args.data_dir}")
|
||||||
print(f"File types: {args.file_types}")
|
if args.file_types:
|
||||||
|
print(f"Filtering by file types: {args.file_types}")
|
||||||
|
else:
|
||||||
|
print("Processing all supported file types")
|
||||||
|
|
||||||
# Check if data directory exists
|
# Check if data directory exists
|
||||||
data_path = Path(args.data_dir)
|
data_path = Path(args.data_dir)
|
||||||
@@ -56,12 +59,16 @@ class DocumentRAG(BaseRAGExample):
|
|||||||
raise ValueError(f"Data directory not found: {args.data_dir}")
|
raise ValueError(f"Data directory not found: {args.data_dir}")
|
||||||
|
|
||||||
# Load documents
|
# Load documents
|
||||||
documents = SimpleDirectoryReader(
|
reader_kwargs = {
|
||||||
args.data_dir,
|
"recursive": True,
|
||||||
recursive=True,
|
"encoding": "utf-8",
|
||||||
encoding="utf-8",
|
}
|
||||||
required_exts=args.file_types,
|
if args.file_types:
|
||||||
).load_data(show_progress=True)
|
reader_kwargs["required_exts"] = args.file_types
|
||||||
|
|
||||||
|
documents = SimpleDirectoryReader(args.data_dir, **reader_kwargs).load_data(
|
||||||
|
show_progress=True
|
||||||
|
)
|
||||||
|
|
||||||
if not documents:
|
if not documents:
|
||||||
print(f"No documents found in {args.data_dir} with extensions {args.file_types}")
|
print(f"No documents found in {args.data_dir} with extensions {args.file_types}")
|
||||||
|
|||||||
@@ -35,6 +35,12 @@ class EmailRAG(BaseRAGExample):
|
|||||||
email_group.add_argument(
|
email_group.add_argument(
|
||||||
"--include-html", action="store_true", help="Include HTML content in email processing"
|
"--include-html", action="store_true", help="Include HTML content in email processing"
|
||||||
)
|
)
|
||||||
|
email_group.add_argument(
|
||||||
|
"--chunk-size", type=int, default=256, help="Text chunk size (default: 256)"
|
||||||
|
)
|
||||||
|
email_group.add_argument(
|
||||||
|
"--chunk-overlap", type=int, default=25, help="Text chunk overlap (default: 25)"
|
||||||
|
)
|
||||||
|
|
||||||
def _find_mail_directories(self) -> list[Path]:
|
def _find_mail_directories(self) -> list[Path]:
|
||||||
"""Auto-detect all Apple Mail directories."""
|
"""Auto-detect all Apple Mail directories."""
|
||||||
@@ -113,7 +119,9 @@ class EmailRAG(BaseRAGExample):
|
|||||||
|
|
||||||
# Convert to text chunks
|
# Convert to text chunks
|
||||||
# Email reader uses chunk_overlap=25 as in original
|
# Email reader uses chunk_overlap=25 as in original
|
||||||
all_texts = create_text_chunks(all_documents, chunk_overlap=25)
|
all_texts = create_text_chunks(
|
||||||
|
all_documents, chunk_size=args.chunk_size, chunk_overlap=args.chunk_overlap
|
||||||
|
)
|
||||||
|
|
||||||
return all_texts
|
return all_texts
|
||||||
|
|
||||||
|
|||||||
@@ -42,6 +42,12 @@ class WeChatRAG(BaseRAGExample):
|
|||||||
action="store_true",
|
action="store_true",
|
||||||
help="Force re-export of WeChat data even if exports exist",
|
help="Force re-export of WeChat data even if exports exist",
|
||||||
)
|
)
|
||||||
|
wechat_group.add_argument(
|
||||||
|
"--chunk-size", type=int, default=192, help="Text chunk size (default: 192)"
|
||||||
|
)
|
||||||
|
wechat_group.add_argument(
|
||||||
|
"--chunk-overlap", type=int, default=64, help="Text chunk overlap (default: 64)"
|
||||||
|
)
|
||||||
|
|
||||||
def _export_wechat_data(self, export_dir: Path) -> bool:
|
def _export_wechat_data(self, export_dir: Path) -> bool:
|
||||||
"""Export WeChat data using wechattweak-cli."""
|
"""Export WeChat data using wechattweak-cli."""
|
||||||
@@ -120,7 +126,9 @@ class WeChatRAG(BaseRAGExample):
|
|||||||
print(f"Loaded {len(documents)} chat entries")
|
print(f"Loaded {len(documents)} chat entries")
|
||||||
|
|
||||||
# Convert to text chunks
|
# Convert to text chunks
|
||||||
all_texts = create_text_chunks(documents)
|
all_texts = create_text_chunks(
|
||||||
|
documents, chunk_size=args.chunk_size, chunk_overlap=args.chunk_overlap
|
||||||
|
)
|
||||||
|
|
||||||
return all_texts
|
return all_texts
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user