update readme and auto find email

This commit is contained in:
yichuan520030910320
2025-07-17 18:15:17 -07:00
parent 90d9f27383
commit 4f83086788
5 changed files with 81 additions and 58 deletions

4
.gitignore vendored
View File

@@ -36,6 +36,10 @@ nprobe_logs/
micro/results micro/results
micro/contriever-INT8 micro/contriever-INT8
examples/data/ examples/data/
!examples/data/2501.14312v1 (1).pdf
!examples/data/2506.08276v1.pdf
!examples/data/PrideandPrejudice.txt
!examples/data/README.md
*.qdstrm *.qdstrm
benchmark_results/ benchmark_results/
results/ results/

View File

@@ -193,8 +193,6 @@ LEANN can create a searchable index of your Apple Mail emails, allowing you to q
# Use default mail path (works for most macOS setups) # Use default mail path (works for most macOS setups)
python examples/mail_reader_leann.py python examples/mail_reader_leann.py
# Specify your own mail path
python examples/mail_reader_leann.py --mail-path "/Users/yourname/Library/Mail/V10/..."
# Run with custom index directory # Run with custom index directory
python examples/mail_reader_leann.py --index-dir "./my_mail_index" python examples/mail_reader_leann.py --index-dir "./my_mail_index"
@@ -206,23 +204,12 @@ python examples/mail_reader_leann.py --max-emails -1
python examples/mail_reader_leann.py --max-emails 1000 python examples/mail_reader_leann.py --max-emails 1000
# Run a single query # Run a single query
python examples/mail_reader_leann.py --query "Find emails about project deadlines" python examples/mail_reader_leann.py --query "Whats the number of class recommend to take per semester for incoming EECS students"
``` ```
</details> </details>
#### Finding Your Mail Path
<details>
<summary><strong>🔍 Click to expand: How to find your mail path</strong></summary>
The default mail path is configured for a typical macOS setup. If you need to find your specific mail path:
1. Open Terminal
2. Run: `find ~/Library/Mail -name "Messages" -type d | head -5`
3. Use the parent directory of the Messages folder (the one whose name ends with `Data`) as your `--mail-path`
</details>
#### Example Queries #### Example Queries
@@ -230,7 +217,6 @@ The default mail path is configured for a typical macOS setup. If you need to fi
<summary><strong>💬 Click to expand: Example queries you can try</strong></summary> <summary><strong>💬 Click to expand: Example queries you can try</strong></summary>
Once the index is built, you can ask questions like: Once the index is built, you can ask questions like:
- "Show me emails about meeting schedules"
- "Find emails from my boss about deadlines" - "Find emails from my boss about deadlines"
- "What did John say about the project timeline?" - "What did John say about the project timeline?"
- "Show me emails about travel expenses" - "Show me emails about travel expenses"

View File

@@ -9485,6 +9485,45 @@
"[leann_backend_hnsw.hnsw_embedding_server LOG]: Sending distance response with 2 distances\n", "[leann_backend_hnsw.hnsw_embedding_server LOG]: Sending distance response with 2 distances\n",
"[leann_backend_hnsw.hnsw_embedding_server LOG]: Distance calculation E2E time: 0.077484 seconds\n", "[leann_backend_hnsw.hnsw_embedding_server LOG]: Distance calculation E2E time: 0.077484 seconds\n",
"[leann_backend_hnsw.hnsw_embedding_server LOG]: ZMQ socket timeout, continuing to listen\n", "[leann_backend_hnsw.hnsw_embedding_server LOG]: ZMQ socket timeout, continuing to listen\n",
"[leann_backend_hnsw.hnsw_embedding_server LOG]: ZMQ socket timeout, continuing to listen\n",
"[leann_backend_hnsw.hnsw_embedding_server LOG]: ZMQ socket timeout, continuing to listen\n",
"[leann_backend_hnsw.hnsw_embedding_server LOG]: ZMQ socket timeout, continuing to listen\n",
"[leann_backend_hnsw.hnsw_embedding_server LOG]: ZMQ socket timeout, continuing to listen\n",
"[leann_backend_hnsw.hnsw_embedding_server LOG]: ZMQ socket timeout, continuing to listen\n",
"[leann_backend_hnsw.hnsw_embedding_server LOG]: ZMQ socket timeout, continuing to listen\n",
"[leann_backend_hnsw.hnsw_embedding_server LOG]: ZMQ socket timeout, continuing to listen\n",
"[leann_backend_hnsw.hnsw_embedding_server LOG]: ZMQ socket timeout, continuing to listen\n",
"[leann_backend_hnsw.hnsw_embedding_server LOG]: ZMQ socket timeout, continuing to listen\n",
"[leann_backend_hnsw.hnsw_embedding_server LOG]: ZMQ socket timeout, continuing to listen\n",
"[leann_backend_hnsw.hnsw_embedding_server LOG]: ZMQ socket timeout, continuing to listen\n",
"[leann_backend_hnsw.hnsw_embedding_server LOG]: ZMQ socket timeout, continuing to listen\n",
"[leann_backend_hnsw.hnsw_embedding_server LOG]: ZMQ socket timeout, continuing to listen\n",
"[leann_backend_hnsw.hnsw_embedding_server LOG]: ZMQ socket timeout, continuing to listen\n",
"[leann_backend_hnsw.hnsw_embedding_server LOG]: ZMQ socket timeout, continuing to listen\n",
"[leann_backend_hnsw.hnsw_embedding_server LOG]: ZMQ socket timeout, continuing to listen\n",
"[leann_backend_hnsw.hnsw_embedding_server LOG]: ZMQ socket timeout, continuing to listen\n",
"[leann_backend_hnsw.hnsw_embedding_server LOG]: ZMQ socket timeout, continuing to listen\n",
"[leann_backend_hnsw.hnsw_embedding_server LOG]: ZMQ socket timeout, continuing to listen\n",
"[leann_backend_hnsw.hnsw_embedding_server LOG]: ZMQ socket timeout, continuing to listen\n",
"[leann_backend_hnsw.hnsw_embedding_server LOG]: ZMQ socket timeout, continuing to listen\n",
"[leann_backend_hnsw.hnsw_embedding_server LOG]: ZMQ socket timeout, continuing to listen\n",
"[leann_backend_hnsw.hnsw_embedding_server LOG]: ZMQ socket timeout, continuing to listen\n",
"[leann_backend_hnsw.hnsw_embedding_server LOG]: ZMQ socket timeout, continuing to listen\n",
"[leann_backend_hnsw.hnsw_embedding_server LOG]: ZMQ socket timeout, continuing to listen\n",
"[leann_backend_hnsw.hnsw_embedding_server LOG]: ZMQ socket timeout, continuing to listen\n",
"[leann_backend_hnsw.hnsw_embedding_server LOG]: ZMQ socket timeout, continuing to listen\n",
"[leann_backend_hnsw.hnsw_embedding_server LOG]: ZMQ socket timeout, continuing to listen\n",
"[leann_backend_hnsw.hnsw_embedding_server LOG]: ZMQ socket timeout, continuing to listen\n",
"[leann_backend_hnsw.hnsw_embedding_server LOG]: ZMQ socket timeout, continuing to listen\n",
"[leann_backend_hnsw.hnsw_embedding_server LOG]: ZMQ socket timeout, continuing to listen\n",
"[leann_backend_hnsw.hnsw_embedding_server LOG]: ZMQ socket timeout, continuing to listen\n",
"[leann_backend_hnsw.hnsw_embedding_server LOG]: ZMQ socket timeout, continuing to listen\n",
"[leann_backend_hnsw.hnsw_embedding_server LOG]: ZMQ socket timeout, continuing to listen\n",
"[leann_backend_hnsw.hnsw_embedding_server LOG]: ZMQ socket timeout, continuing to listen\n",
"[leann_backend_hnsw.hnsw_embedding_server LOG]: ZMQ socket timeout, continuing to listen\n",
"[leann_backend_hnsw.hnsw_embedding_server LOG]: ZMQ socket timeout, continuing to listen\n",
"[leann_backend_hnsw.hnsw_embedding_server LOG]: ZMQ socket timeout, continuing to listen\n",
"[leann_backend_hnsw.hnsw_embedding_server LOG]: ZMQ socket timeout, continuing to listen\n",
"[leann_backend_hnsw.hnsw_embedding_server LOG]: ZMQ socket timeout, continuing to listen\n" "[leann_backend_hnsw.hnsw_embedding_server LOG]: ZMQ socket timeout, continuing to listen\n"
] ]
} }

View File

@@ -5,6 +5,22 @@ from typing import List, Any
from llama_index.core import Document from llama_index.core import Document
from llama_index.core.readers.base import BaseReader from llama_index.core.readers.base import BaseReader
def find_all_messages_directories(root: str = None) -> List[Path]:
    """
    Collect every directory named 'Messages' located at or below ``root``.

    When ``root`` is None, the search starts from ``Library/Mail`` inside the
    current user's home directory.
    Returns a list of Path objects in the order os.walk visits them.
    """
    search_root = root
    if search_root is None:
        # Fall back to the current user's mail folder
        search_root = os.path.join(os.path.expanduser("~"), "Library", "Mail")

    # os.walk only yields directories, so a basename match is sufficient
    return [
        Path(current_dir)
        for current_dir, _subdirs, _files in os.walk(search_root)
        if os.path.basename(current_dir) == "Messages"
    ]
class EmlxReader(BaseReader): class EmlxReader(BaseReader):
""" """
Apple Mail .emlx file reader with embedded metadata. Apple Mail .emlx file reader with embedded metadata.
@@ -105,31 +121,4 @@ Date: {date}
continue continue
print(f"Loaded {len(docs)} email documents") print(f"Loaded {len(docs)} email documents")
return docs return docs
@staticmethod
def find_all_messages_directories(base_path: str) -> List[Path]:
    """
    Locate every directory named 'Messages' beneath a base path.

    Args:
        base_path: Directory under which to search recursively

    Returns:
        List of Path objects, one per Messages directory found
        (empty when base_path does not exist)
    """
    search_root = Path(base_path)

    # Nothing to search: report it and hand back an empty result
    if not search_root.exists():
        print(f"Base path {base_path} does not exist")
        return []

    found = []
    for candidate in search_root.rglob("Messages"):
        # rglob matches by name only; skip plain files called "Messages"
        if not candidate.is_dir():
            continue
        found.append(candidate)
        print(f"Found Messages directory: {candidate}")

    print(f"Found {len(found)} Messages directories")
    return found

View File

@@ -15,8 +15,14 @@ from llama_index.core.node_parser import SentenceSplitter
dotenv.load_dotenv() dotenv.load_dotenv()
# Auto-detect user's mail path
def get_mail_path():
    """Return the ``Library/Mail`` path inside the current user's home directory."""
    return os.path.join(os.path.expanduser("~"), "Library", "Mail")
# Default mail path for macOS # Default mail path for macOS
DEFAULT_MAIL_PATH = "/Users/yichuan/Library/Mail/V10/0FCA0879-FD8C-4B7E-83BF-FDDA930791C5/[Gmail].mbox/All Mail.mbox/78BA5BE1-8819-4F9A-9613-EB63772F1DD0/Data" # DEFAULT_MAIL_PATH = "/Users/yichuan/Library/Mail/V10/0FCA0879-FD8C-4B7E-83BF-FDDA930791C5/[Gmail].mbox/All Mail.mbox/78BA5BE1-8819-4F9A-9613-EB63772F1DD0/Data"
def create_leann_index_from_multiple_sources(messages_dirs: List[Path], index_path: str = "mail_index.leann", max_count: int = -1, include_html: bool = False): def create_leann_index_from_multiple_sources(messages_dirs: List[Path], index_path: str = "mail_index.leann", max_count: int = -1, include_html: bool = False):
""" """
@@ -223,8 +229,8 @@ async def query_leann_index(index_path: str, query: str):
async def main(): async def main():
# Parse command line arguments # Parse command line arguments
parser = argparse.ArgumentParser(description='LEANN Mail Reader - Create and query email index') parser = argparse.ArgumentParser(description='LEANN Mail Reader - Create and query email index')
parser.add_argument('--mail-path', type=str, default=DEFAULT_MAIL_PATH, # Remove --mail-path argument and auto-detect all Messages directories
help=f'Path to mail data directory (default: {DEFAULT_MAIL_PATH})') # Remove DEFAULT_MAIL_PATH
parser.add_argument('--index-dir', type=str, default="./mail_index_leann_raw_text_all_dicts", parser.add_argument('--index-dir', type=str, default="./mail_index_leann_raw_text_all_dicts",
help='Directory to store the LEANN index (default: ./mail_index_leann_raw_text_all_dicts)') help='Directory to store the LEANN index (default: ./mail_index_leann_raw_text_all_dicts)')
parser.add_argument('--max-emails', type=int, default=1000, parser.add_argument('--max-emails', type=int, default=1000,
@@ -238,24 +244,24 @@ async def main():
print(f"args: {args}") print(f"args: {args}")
# Base path to the mail data directory # Automatically find all Messages directories under the current user's Mail directory
base_mail_path = args.mail_path from examples.email_data.LEANN_email_reader import find_all_messages_directories
mail_path = get_mail_path()
print(f"Searching for email data in: {mail_path}")
messages_dirs = find_all_messages_directories(mail_path)
INDEX_DIR = Path(args.index_dir) print('len(messages_dirs): ', len(messages_dirs))
INDEX_PATH = str(INDEX_DIR / "mail_documents.leann")
print(f"Using mail path: {base_mail_path}")
print(f"Index directory: {INDEX_DIR}")
# Find all Messages directories
from examples.email_data.LEANN_email_reader import EmlxReader
messages_dirs = EmlxReader.find_all_messages_directories(base_mail_path)
if not messages_dirs: if not messages_dirs:
print("No Messages directories found. Exiting.") print("No Messages directories found. Exiting.")
return return
INDEX_DIR = Path(args.index_dir)
INDEX_PATH = str(INDEX_DIR / "mail_documents.leann")
print(f"Index directory: {INDEX_DIR}")
print(f"Found {len(messages_dirs)} Messages directories.")
# Create or load the LEANN index from all sources # Create or load the LEANN index from all sources
index_path = create_leann_index_from_multiple_sources(messages_dirs, INDEX_PATH, args.max_emails, args.include_html) index_path = create_leann_index_from_multiple_sources(messages_dirs, INDEX_PATH, args.max_emails, args.include_html)
@@ -270,7 +276,6 @@ async def main():
"how's the icloud related advertisement saying", "how's the icloud related advertisement saying",
"Whats the number of class recommend to take per semester for incoming EECS students" "Whats the number of class recommend to take per semester for incoming EECS students"
] ]
for query in queries: for query in queries:
print("\n" + "="*60) print("\n" + "="*60)
await query_leann_index(index_path, query) await query_leann_index(index_path, query)