upd readme mail application

This commit is contained in:
yichuan520030910320
2025-07-13 21:30:08 -07:00
parent c51d5320fa
commit c17899662f
2 changed files with 107 additions and 27 deletions

View File

@@ -123,26 +123,81 @@ This ensures the generated files are compatible with your system's protobuf libr
### 🔥 Core Features ### 🔥 Core Features
- **📊 Multiple Distance Functions**: L2, Cosine, MIPS (Maximum Inner Product Search) - **🔄 Real-time Embeddings** - Eliminate heavy embedding storage through dynamic computation, using optimized ZMQ servers, a highly optimized search paradigm (overlapping and batching), and a highly optimized embedding engine
- **🏗️ Pluggable Backends**: DiskANN, HNSW/FAISS with unified API - **📈 Scalable Architecture** - Handles millions of documents on consumer hardware; the larger your dataset, the more LEANN can save
- **🔄 Real-time Embeddings**: Dynamic computation using optimized ZMQ servers - **🎯 Graph Pruning** - Advanced techniques to minimize the storage overhead of vector search to a limited footprint
- **📈 Scalable Architecture**: Handles millions of documents on consumer hardware - **🏗️ Pluggable Backends** - DiskANN, HNSW/FAISS with unified API
- **🎯 Graph Pruning**: Advanced techniques for memory-efficient search
### 🛠️ Technical Highlights ### 🛠️ Technical Highlights
- **🔄 Recompute Mode** - Highest accuracy scenarios while eliminating vector storage overhead
- **Zero-copy operations** for maximum performance - **Zero-copy Operations** - Minimize IPC overhead by transferring distances instead of embeddings
- **SIMD-optimized** distance computations (AVX2/AVX512) - **🚀 High-throughput Embedding Pipeline** - Optimized batched processing for maximum efficiency
- **Async embedding pipeline** with batched processing - **🎯 Two-level Search** - Novel coarse-to-fine search overlap for accelerated query processing (optional)
- **Memory-mapped indices** for fast startup - **💾 Memory-mapped Indices** - Fast startup with raw text mapping to reduce memory overhead
- **Recompute mode** for highest accuracy scenarios - **🚀 MLX Support** - Ultra-fast recompute with quantized embedding models, accelerating building and search by 10-100x
### 🎨 Developer Experience ### 🎨 Developer Experience
- **Simple Python API** - Get started in minutes - **Simple Python API** - Get started in minutes
- **Extensible backend system** - Easy to add new algorithms - **Extensible backend system** - Easy to add new algorithms
- **Comprehensive examples** - From basic usage to production deployment - **Comprehensive examples** - From basic usage to production deployment
- **Rich debugging tools** - Built-in performance profiling
## Applications on your MacBook
### Lightweight RAG on your Apple Mail
LEANN can create a searchable index of your Apple Mail emails, allowing you to query your email history using natural language.
#### Quick Start
<details>
<summary><strong>📋 Click to expand: Command Examples</strong></summary>
```bash
# Use default mail path (works for most macOS setups)
python examples/mail_reader_leann.py
# Specify your own mail path
python examples/mail_reader_leann.py --mail-path "/Users/yourname/Library/Mail/V10/..."
# Run with custom index directory
python examples/mail_reader_leann.py --index-dir "./my_mail_index"
# Limit number of emails processed (useful for testing)
python examples/mail_reader_leann.py --max-emails 1000
# Run a single query
python examples/mail_reader_leann.py --query "Find emails about project deadlines"
```
</details>
#### Finding Your Mail Path
<details>
<summary><strong>🔍 Click to expand: How to find your mail path</strong></summary>
The default mail path is configured for a typical macOS setup. If you need to find your specific mail path:
1. Open Terminal
2. Run: `find ~/Library/Mail -name "Messages" -type d | head -5`
3. Use the parent directory (the one ending with `Data`) of the Messages folder as your `--mail-path`
</details>
#### Example Queries
<details>
<summary><strong>💬 Click to expand: Example queries you can try</strong></summary>
Once the index is built, you can ask questions like:
- "Show me emails about meeting schedules"
- "Find emails from my boss about deadlines"
- "What did John say about the project timeline?"
- "Show me emails about travel expenses"
</details>
## 📊 Benchmarks ## 📊 Benchmarks

View File

@@ -1,6 +1,7 @@
import os import os
import asyncio import asyncio
import dotenv import dotenv
import argparse
from pathlib import Path from pathlib import Path
from typing import List, Any from typing import List, Any
from leann.api import LeannBuilder, LeannSearcher, LeannChat from leann.api import LeannBuilder, LeannSearcher, LeannChat
@@ -8,6 +9,9 @@ from llama_index.core.node_parser import SentenceSplitter
dotenv.load_dotenv() dotenv.load_dotenv()
# Default mail path for macOS
DEFAULT_MAIL_PATH = "/Users/yichuan/Library/Mail/V10/0FCA0879-FD8C-4B7E-83BF-FDDA930791C5/[Gmail].mbox/All Mail.mbox/78BA5BE1-8819-4F9A-9613-EB63772F1DD0/Data"
def create_leann_index_from_multiple_sources(messages_dirs: List[Path], index_path: str = "mail_index.leann", max_count: int = -1): def create_leann_index_from_multiple_sources(messages_dirs: List[Path], index_path: str = "mail_index.leann", max_count: int = -1):
""" """
Create LEANN index from multiple mail data sources. Create LEANN index from multiple mail data sources.
@@ -203,12 +207,30 @@ async def query_leann_index(index_path: str, query: str):
print(f"Leann: {chat_response}") print(f"Leann: {chat_response}")
async def main(): async def main():
# Base path to the mail data directory # Parse command line arguments
base_mail_path = "/Users/yichuan/Library/Mail/V10/0FCA0879-FD8C-4B7E-83BF-FDDA930791C5/[Gmail].mbox/All Mail.mbox/78BA5BE1-8819-4F9A-9613-EB63772F1DD0/Data" parser = argparse.ArgumentParser(description='LEANN Mail Reader - Create and query email index')
parser.add_argument('--mail-path', type=str, default=DEFAULT_MAIL_PATH,
help=f'Path to mail data directory (default: {DEFAULT_MAIL_PATH})')
parser.add_argument('--index-dir', type=str, default="./mail_index_leann_raw_text_all_dicts",
help='Directory to store the LEANN index (default: ./mail_index_leann_raw_text_all_dicts)')
parser.add_argument('--max-emails', type=int, default=1000,
help='Maximum number of emails to process (-1 means all)')
parser.add_argument('--query', type=str, default="Give me some funny advertisement about apple or other companies",
help='Single query to run (default: runs example queries)')
INDEX_DIR = Path("./mail_index_leann_raw_text_all_dicts") args = parser.parse_args()
print(f"args: {args}")
# Base path to the mail data directory
base_mail_path = args.mail_path
INDEX_DIR = Path(args.index_dir)
INDEX_PATH = str(INDEX_DIR / "mail_documents.leann") INDEX_PATH = str(INDEX_DIR / "mail_documents.leann")
print(f"Using mail path: {base_mail_path}")
print(f"Index directory: {INDEX_DIR}")
# Find all Messages directories # Find all Messages directories
from LEANN_email_reader import EmlxReader from LEANN_email_reader import EmlxReader
messages_dirs = EmlxReader.find_all_messages_directories(base_mail_path) messages_dirs = EmlxReader.find_all_messages_directories(base_mail_path)
@@ -218,20 +240,23 @@ async def main():
return return
# Create or load the LEANN index from all sources # Create or load the LEANN index from all sources
index_path = create_leann_index_from_multiple_sources(messages_dirs, INDEX_PATH) index_path = create_leann_index_from_multiple_sources(messages_dirs, INDEX_PATH, args.max_emails)
if index_path: if index_path:
# Example queries if args.query:
queries = [ # Run single query
"Hows Berkeley Graduate Student Instructor", await query_leann_index(index_path, args.query)
"how's the icloud related advertisement saying", else:
"Whats the number of class recommend to take per semester for incoming EECS students" # Example queries
queries = [
] "Hows Berkeley Graduate Student Instructor",
"how's the icloud related advertisement saying",
for query in queries: "Whats the number of class recommend to take per semester for incoming EECS students"
print("\n" + "="*60) ]
await query_leann_index(index_path, query)
for query in queries:
print("\n" + "="*60)
await query_leann_index(index_path, query)
if __name__ == "__main__": if __name__ == "__main__":
asyncio.run(main()) asyncio.run(main())