diff --git a/.gitignore b/.gitignore index eb58011..71b34cb 100755 --- a/.gitignore +++ b/.gitignore @@ -36,6 +36,10 @@ nprobe_logs/ micro/results micro/contriever-INT8 examples/data/ +!examples/data/2501.14312v1 (1).pdf +!examples/data/2506.08276v1.pdf +!examples/data/PrideandPrejudice.txt +!examples/data/README.md *.qdstrm benchmark_results/ results/ diff --git a/README.md b/README.md index 4ee532f..d78b290 100755 --- a/README.md +++ b/README.md @@ -193,8 +193,6 @@ LEANN can create a searchable index of your Apple Mail emails, allowing you to q # Use default mail path (works for most macOS setups) python examples/mail_reader_leann.py -# Specify your own mail path -python examples/mail_reader_leann.py --mail-path "/Users/yourname/Library/Mail/V10/..." # Run with custom index directory python examples/mail_reader_leann.py --index-dir "./my_mail_index" @@ -206,23 +204,12 @@ python examples/mail_reader_leann.py --max-emails -1 python examples/mail_reader_leann.py --max-emails 1000 # Run a single query -python examples/mail_reader_leann.py --query "Find emails about project deadlines" +python examples/mail_reader_leann.py --query "Whats the number of class recommend to take per semester for incoming EECS students" ``` -#### Finding Your Mail Path -
-🔍 Click to expand: How to find your mail path - -The default mail path is configured for a typical macOS setup. If you need to find your specific mail path: - -1. Open Terminal -2. Run: `find ~/Library/Mail -name "Messages" -type d | head -5` -3. Use the parent directory(ended with Data) of the Messages folder as your `--mail-path` - -
#### Example Queries @@ -230,7 +217,6 @@ The default mail path is configured for a typical macOS setup. If you need to fi 💬 Click to expand: Example queries you can try Once the index is built, you can ask questions like: -- "Show me emails about meeting schedules" - "Find emails from my boss about deadlines" - "What did John say about the project timeline?" - "Show me emails about travel expenses" diff --git a/demo.ipynb b/demo.ipynb index 3f512c5..87378a1 100644 --- a/demo.ipynb +++ b/demo.ipynb @@ -9485,6 +9485,45 @@ "[leann_backend_hnsw.hnsw_embedding_server LOG]: Sending distance response with 2 distances\n", "[leann_backend_hnsw.hnsw_embedding_server LOG]: Distance calculation E2E time: 0.077484 seconds\n", "[leann_backend_hnsw.hnsw_embedding_server LOG]: ZMQ socket timeout, continuing to listen\n", + "[leann_backend_hnsw.hnsw_embedding_server LOG]: ZMQ socket timeout, continuing to listen\n", + "[leann_backend_hnsw.hnsw_embedding_server LOG]: ZMQ socket timeout, continuing to listen\n", + "[leann_backend_hnsw.hnsw_embedding_server LOG]: ZMQ socket timeout, continuing to listen\n", + "[leann_backend_hnsw.hnsw_embedding_server LOG]: ZMQ socket timeout, continuing to listen\n", + "[leann_backend_hnsw.hnsw_embedding_server LOG]: ZMQ socket timeout, continuing to listen\n", + "[leann_backend_hnsw.hnsw_embedding_server LOG]: ZMQ socket timeout, continuing to listen\n", + "[leann_backend_hnsw.hnsw_embedding_server LOG]: ZMQ socket timeout, continuing to listen\n", + "[leann_backend_hnsw.hnsw_embedding_server LOG]: ZMQ socket timeout, continuing to listen\n", + "[leann_backend_hnsw.hnsw_embedding_server LOG]: ZMQ socket timeout, continuing to listen\n", + "[leann_backend_hnsw.hnsw_embedding_server LOG]: ZMQ socket timeout, continuing to listen\n", + "[leann_backend_hnsw.hnsw_embedding_server LOG]: ZMQ socket timeout, continuing to listen\n", + "[leann_backend_hnsw.hnsw_embedding_server LOG]: ZMQ socket timeout, continuing to listen\n", + "[leann_backend_hnsw.hnsw_embedding_server LOG]: ZMQ socket timeout, continuing to listen\n", + "[leann_backend_hnsw.hnsw_embedding_server LOG]: ZMQ socket timeout, continuing to listen\n", + "[leann_backend_hnsw.hnsw_embedding_server LOG]: ZMQ socket timeout, continuing to listen\n", + "[leann_backend_hnsw.hnsw_embedding_server LOG]: ZMQ socket timeout, continuing to listen\n", + "[leann_backend_hnsw.hnsw_embedding_server LOG]: ZMQ socket timeout, continuing to listen\n", + "[leann_backend_hnsw.hnsw_embedding_server LOG]: ZMQ socket timeout, continuing to listen\n", + "[leann_backend_hnsw.hnsw_embedding_server LOG]: ZMQ socket timeout, continuing to listen\n", + "[leann_backend_hnsw.hnsw_embedding_server LOG]: ZMQ socket timeout, continuing to listen\n", + "[leann_backend_hnsw.hnsw_embedding_server LOG]: ZMQ socket timeout, continuing to listen\n", + "[leann_backend_hnsw.hnsw_embedding_server LOG]: ZMQ socket timeout, continuing to listen\n", + "[leann_backend_hnsw.hnsw_embedding_server LOG]: ZMQ socket timeout, continuing to listen\n", + "[leann_backend_hnsw.hnsw_embedding_server LOG]: ZMQ socket timeout, continuing to listen\n", + "[leann_backend_hnsw.hnsw_embedding_server LOG]: ZMQ socket timeout, continuing to listen\n", + "[leann_backend_hnsw.hnsw_embedding_server LOG]: ZMQ socket timeout, continuing to listen\n", + "[leann_backend_hnsw.hnsw_embedding_server LOG]: ZMQ socket timeout, continuing to listen\n", + "[leann_backend_hnsw.hnsw_embedding_server LOG]: ZMQ socket timeout, continuing to listen\n", + "[leann_backend_hnsw.hnsw_embedding_server LOG]: ZMQ socket timeout, continuing to listen\n", + "[leann_backend_hnsw.hnsw_embedding_server LOG]: ZMQ socket timeout, continuing to listen\n", + "[leann_backend_hnsw.hnsw_embedding_server LOG]: ZMQ socket timeout, continuing to listen\n", + "[leann_backend_hnsw.hnsw_embedding_server LOG]: ZMQ socket timeout, continuing to listen\n", + "[leann_backend_hnsw.hnsw_embedding_server LOG]: ZMQ socket timeout, continuing to listen\n", + "[leann_backend_hnsw.hnsw_embedding_server LOG]: ZMQ socket timeout, continuing to listen\n", + "[leann_backend_hnsw.hnsw_embedding_server LOG]: ZMQ socket timeout, continuing to listen\n", + "[leann_backend_hnsw.hnsw_embedding_server LOG]: ZMQ socket timeout, continuing to listen\n", + "[leann_backend_hnsw.hnsw_embedding_server LOG]: ZMQ socket timeout, continuing to listen\n", + "[leann_backend_hnsw.hnsw_embedding_server LOG]: ZMQ socket timeout, continuing to listen\n", + "[leann_backend_hnsw.hnsw_embedding_server LOG]: ZMQ socket timeout, continuing to listen\n", "[leann_backend_hnsw.hnsw_embedding_server LOG]: ZMQ socket timeout, continuing to listen\n" ] } diff --git a/examples/email_data/LEANN_email_reader.py b/examples/email_data/LEANN_email_reader.py index d7db0dd..2c79108 100644 --- a/examples/email_data/LEANN_email_reader.py +++ b/examples/email_data/LEANN_email_reader.py @@ -5,6 +5,22 @@ from typing import List, Any from llama_index.core import Document from llama_index.core.readers.base import BaseReader +def find_all_messages_directories(root: str = None) -> List[Path]: + """ + Recursively find all 'Messages' directories under the given root. + Returns a list of Path objects. + """ + if root is None: + # Auto-detect user's mail path + home_dir = os.path.expanduser("~") + root = os.path.join(home_dir, "Library", "Mail") + + messages_dirs = [] + for dirpath, dirnames, filenames in os.walk(root): + if os.path.basename(dirpath) == "Messages": + messages_dirs.append(Path(dirpath)) + return messages_dirs + class EmlxReader(BaseReader): """ Apple Mail .emlx file reader with embedded metadata. @@ -105,31 +121,4 @@ Date: {date} continue print(f"Loaded {len(docs)} email documents") - return docs - - @staticmethod - def find_all_messages_directories(base_path: str) -> List[Path]: - """ - Find all Messages directories under the given base path. - - Args: - base_path: Base path to search for Messages directories - - Returns: - List of Path objects pointing to Messages directories - """ - base_path_obj = Path(base_path) - messages_dirs = [] - - if not base_path_obj.exists(): - print(f"Base path {base_path} does not exist") - return messages_dirs - - # Find all Messages directories recursively - for messages_dir in base_path_obj.rglob("Messages"): - if messages_dir.is_dir(): - messages_dirs.append(messages_dir) - print(f"Found Messages directory: {messages_dir}") - - print(f"Found {len(messages_dirs)} Messages directories") - return messages_dirs \ No newline at end of file + return docs \ No newline at end of file diff --git a/examples/mail_reader_leann.py b/examples/mail_reader_leann.py index 7842e87..4c1a990 100644 --- a/examples/mail_reader_leann.py +++ b/examples/mail_reader_leann.py @@ -15,8 +15,14 @@ from llama_index.core.node_parser import SentenceSplitter dotenv.load_dotenv() +# Auto-detect user's mail path +def get_mail_path(): + """Get the mail path for the current user""" + home_dir = os.path.expanduser("~") + return os.path.join(home_dir, "Library", "Mail") + # Default mail path for macOS -DEFAULT_MAIL_PATH = "/Users/yichuan/Library/Mail/V10/0FCA0879-FD8C-4B7E-83BF-FDDA930791C5/[Gmail].mbox/All Mail.mbox/78BA5BE1-8819-4F9A-9613-EB63772F1DD0/Data" +# DEFAULT_MAIL_PATH = "/Users/yichuan/Library/Mail/V10/0FCA0879-FD8C-4B7E-83BF-FDDA930791C5/[Gmail].mbox/All Mail.mbox/78BA5BE1-8819-4F9A-9613-EB63772F1DD0/Data" def create_leann_index_from_multiple_sources(messages_dirs: List[Path], index_path: str = "mail_index.leann", max_count: int = -1, include_html: bool = False): """ @@ -223,8 +229,8 @@ async def query_leann_index(index_path: str, query: str): async def main(): # Parse command line arguments parser = argparse.ArgumentParser(description='LEANN Mail Reader - Create and query email index') - parser.add_argument('--mail-path', type=str, default=DEFAULT_MAIL_PATH, - help=f'Path to mail data directory (default: {DEFAULT_MAIL_PATH})') + # Remove --mail-path argument and auto-detect all Messages directories + # Remove DEFAULT_MAIL_PATH parser.add_argument('--index-dir', type=str, default="./mail_index_leann_raw_text_all_dicts", help='Directory to store the LEANN index (default: ./mail_index_leann_raw_text_all_dicts)') parser.add_argument('--max-emails', type=int, default=1000, @@ -238,24 +244,24 @@ async def main(): print(f"args: {args}") - # Base path to the mail data directory - base_mail_path = args.mail_path + # Automatically find all Messages directories under the current user's Mail directory + from examples.email_data.LEANN_email_reader import find_all_messages_directories + mail_path = get_mail_path() + print(f"Searching for email data in: {mail_path}") + messages_dirs = find_all_messages_directories(mail_path) - INDEX_DIR = Path(args.index_dir) - INDEX_PATH = str(INDEX_DIR / "mail_documents.leann") + print('len(messages_dirs): ', len(messages_dirs)) - print(f"Using mail path: {base_mail_path}") - print(f"Index directory: {INDEX_DIR}") - - # Find all Messages directories - - from examples.email_data.LEANN_email_reader import EmlxReader - messages_dirs = EmlxReader.find_all_messages_directories(base_mail_path) if not messages_dirs: print("No Messages directories found. Exiting.") return + INDEX_DIR = Path(args.index_dir) + INDEX_PATH = str(INDEX_DIR / "mail_documents.leann") + print(f"Index directory: {INDEX_DIR}") + print(f"Found {len(messages_dirs)} Messages directories.") + # Create or load the LEANN index from all sources index_path = create_leann_index_from_multiple_sources(messages_dirs, INDEX_PATH, args.max_emails, args.include_html) @@ -270,7 +276,6 @@ async def main(): "how's the icloud related advertisement saying", "Whats the number of class recommend to take per semester for incoming EECS students" ] - for query in queries: print("\n" + "="*60) await query_leann_index(index_path, query)