From 04c9684488b0cb047d41dc2ec168571bb4b9bc6e Mon Sep 17 00:00:00 2001 From: yichuan520030910320 Date: Wed, 9 Jul 2025 15:01:16 -0700 Subject: [PATCH] add email test code --- .gitignore | 3 + examples/mail_reader_embedded_metadata.py | 207 +++++++++++++++++++++ test/check_embedding_dimension.py | 24 +++ test/check_embedding_model.py | 20 ++ test/mail_reader_llamaindex.py | 147 +++++++++++++++ test/mail_reader_save_load.py | 213 ++++++++++++++++++++++ test/mail_reader_small_chunks.py | 211 +++++++++++++++++++++ test/mail_reader_test.py | 147 +++++++++++++++ test/query_saved_index.py | 99 ++++++++++ 9 files changed, 1071 insertions(+) create mode 100644 examples/mail_reader_embedded_metadata.py create mode 100644 test/check_embedding_dimension.py create mode 100644 test/check_embedding_model.py create mode 100644 test/mail_reader_llamaindex.py create mode 100644 test/mail_reader_save_load.py create mode 100644 test/mail_reader_small_chunks.py create mode 100644 test/mail_reader_test.py create mode 100644 test/query_saved_index.py diff --git a/.gitignore b/.gitignore index 147b001..26740c5 100755 --- a/.gitignore +++ b/.gitignore @@ -14,6 +14,9 @@ scripts/ lm_eval.egg-info/ demo/experiment_results/**/*.json *.jsonl +*.eml +*.emlx +*.json *.sh *.txt !CMakeLists.txt diff --git a/examples/mail_reader_embedded_metadata.py b/examples/mail_reader_embedded_metadata.py new file mode 100644 index 0000000..741c620 --- /dev/null +++ b/examples/mail_reader_embedded_metadata.py @@ -0,0 +1,207 @@ +import os +import email +from pathlib import Path +from typing import List, Any +from llama_index.core import VectorStoreIndex, Document, StorageContext +from llama_index.core.readers.base import BaseReader +from llama_index.core.node_parser import SentenceSplitter + +class EmlxReader(BaseReader): + """ + Apple Mail .emlx file reader with embedded metadata. + + Reads individual .emlx files from Apple Mail's storage format. + """ + + def __init__(self) -> None: + """Initialize.""" + pass + + def load_data(self, input_dir: str, **load_kwargs: Any) -> List[Document]: + """ + Load data from the input directory containing .emlx files. + + Args: + input_dir: Directory containing .emlx files + **load_kwargs: + max_count (int): Maximum amount of messages to read. + """ + docs: List[Document] = [] + max_count = load_kwargs.get('max_count', 1000) + count = 0 + + # Walk through the directory recursively + for dirpath, dirnames, filenames in os.walk(input_dir): + # Skip hidden directories + dirnames[:] = [d for d in dirnames if not d.startswith(".")] + + for filename in filenames: + if count >= max_count: + break + + if filename.endswith(".emlx"): + filepath = os.path.join(dirpath, filename) + try: + # Read the .emlx file + with open(filepath, 'r', encoding='utf-8', errors='ignore') as f: + content = f.read() + + # .emlx files have a length prefix followed by the email content + # The first line contains the length, followed by the email + lines = content.split('\n', 1) + if len(lines) >= 2: + email_content = lines[1] + + # Parse the email using Python's email module + try: + msg = email.message_from_string(email_content) + + # Extract email metadata + subject = msg.get('Subject', 'No Subject') + from_addr = msg.get('From', 'Unknown') + to_addr = msg.get('To', 'Unknown') + date = msg.get('Date', 'Unknown') + + # Extract email body + body = "" + if msg.is_multipart(): + for part in msg.walk(): + if part.get_content_type() == "text/plain" or part.get_content_type() == "text/html": + body += part.get_payload(decode=True).decode('utf-8', errors='ignore') + # break + else: + body = msg.get_payload(decode=True).decode('utf-8', errors='ignore') + + # Create document content with metadata embedded in text + doc_content = f""" +[EMAIL METADATA] +File: {filename} +From: {from_addr} +To: {to_addr} +Subject: {subject} +Date: {date} +[END METADATA] + +{body} +""" + + # No separate metadata - everything is in the text + doc = Document(text=doc_content, metadata={}) + docs.append(doc) + count += 1 + + except Exception as e: + print(f"Error parsing email from {filepath}: {e}") + continue + + except Exception as e: + print(f"Error reading file {filepath}: {e}") + continue + + print(f"Loaded {len(docs)} email documents") + return docs + +def create_and_save_index(mail_path: str, save_dir: str = "mail_index_embedded", max_count: int = 1000): + """ + Create the index from mail data and save it to disk. + + Args: + mail_path: Path to the mail directory + save_dir: Directory to save the index + max_count: Maximum number of emails to process + """ + print("Creating index from mail data with embedded metadata...") + + # Load documents + documents = EmlxReader().load_data(mail_path, max_count=max_count) + + if not documents: + print("No documents loaded. Exiting.") + return None + + # Create text splitter with small chunk size (no metadata constraints) + text_splitter = SentenceSplitter(chunk_size=256, chunk_overlap=25) + + # Create index + index = VectorStoreIndex.from_documents( + documents, + transformations=[text_splitter] + ) + + # Save the index + os.makedirs(save_dir, exist_ok=True) + index.storage_context.persist(persist_dir=save_dir) + print(f"Index saved to {save_dir}") + + return index + +def load_index(save_dir: str = "mail_index_embedded"): + """ + Load the saved index from disk. + + Args: + save_dir: Directory where the index is saved + + Returns: + Loaded index or None if loading fails + """ + try: + # Load storage context + storage_context = StorageContext.from_defaults(persist_dir=save_dir) + + # Load index + index = VectorStoreIndex.from_vector_store( + storage_context.vector_store, + storage_context=storage_context + ) + + print(f"Index loaded from {save_dir}") + return index + + except Exception as e: + print(f"Error loading index: {e}") + return None + +def query_index(index, query: str): + """ + Query the loaded index. + + Args: + index: The loaded index + query: The query string + """ + if index is None: + print("No index available for querying.") + return + + query_engine = index.as_query_engine() + response = query_engine.query(query) + print(f"Query: {query}") + print(f"Response: {response}") + +def main(): + mail_path = "/Users/yichuan/Library/Mail/V10/0FCA0879-FD8C-4B7E-83BF-FDDA930791C5/[Gmail].mbox/All Mail.mbox/78BA5BE1-8819-4F9A-9613-EB63772F1DD0/Data/9/Messages" + save_dir = "mail_index_embedded" + + # Check if index already exists + if os.path.exists(save_dir) and os.path.exists(os.path.join(save_dir, "vector_store.json")): + print("Loading existing index...") + index = load_index(save_dir) + else: + print("Creating new index...") + index = create_and_save_index(mail_path, save_dir, max_count=10000) + + if index: + # Example queries + queries = [ + "Hows Berkeley Graduate Student Instructor", + "What emails mention GSR appointments?", + "Find emails about deadlines" + ] + + for query in queries: + print("\n" + "="*50) + query_index(index, query) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/test/check_embedding_dimension.py b/test/check_embedding_dimension.py new file mode 100644 index 0000000..1ad92b2 --- /dev/null +++ b/test/check_embedding_dimension.py @@ -0,0 +1,24 @@ +from llama_index.core import VectorStoreIndex, Document +from llama_index.core.embeddings import resolve_embed_model + +# Check the default embedding model +embed_model = resolve_embed_model("default") +print(f"Default embedding model: {embed_model}") + +# Create a simple test document +doc = Document(text="This is a test document") + +# Get embedding dimension +try: + # Test embedding + test_embedding = embed_model.get_text_embedding("test") + print(f"Embedding dimension: {len(test_embedding)}") + print(f"Embedding type: {type(test_embedding)}") +except Exception as e: + print(f"Error getting embedding: {e}") + +# Alternative way to check dimension +if hasattr(embed_model, 'embed_dim'): + print(f"Model embed_dim attribute: {embed_model.embed_dim}") +elif hasattr(embed_model, 'dimension'): + print(f"Model dimension attribute: {embed_model.dimension}") \ No newline at end of file diff --git a/test/check_embedding_model.py b/test/check_embedding_model.py new file mode 100644 index 0000000..ce44882 --- /dev/null +++ b/test/check_embedding_model.py @@ -0,0 +1,20 @@ +from llama_index.core import VectorStoreIndex, Document +from llama_index.core.embeddings import resolve_embed_model + +# Check the default embedding model +embed_model = resolve_embed_model("default") +print(f"Default embedding model: {embed_model}") + +# Create a simple test +doc = Document(text="This is a test document") +index = VectorStoreIndex.from_documents([doc]) + +# Get the embedding model from the index +index_embed_model = index.embed_model +print(f"Index embedding model: {index_embed_model}") + +# Check if it's OpenAI or local +if hasattr(index_embed_model, 'model_name'): + print(f"Model name: {index_embed_model.model_name}") +else: + print(f"Embedding model type: {type(index_embed_model)}") \ No newline at end of file diff --git a/test/mail_reader_llamaindex.py b/test/mail_reader_llamaindex.py new file mode 100644 index 0000000..d0a8bdc --- /dev/null +++ b/test/mail_reader_llamaindex.py @@ -0,0 +1,147 @@ +import os +import email +from pathlib import Path +from typing import List, Any +from llama_index.core import VectorStoreIndex, Document +from llama_index.core.readers.base import BaseReader + +class EmlxReader(BaseReader): + """ + Apple Mail .emlx file reader. + + Reads individual .emlx files from Apple Mail's storage format. + """ + + def __init__(self) -> None: + """Initialize.""" + pass + + def load_data(self, input_dir: str, **load_kwargs: Any) -> List[Document]: + """ + Load data from the input directory containing .emlx files. + + Args: + input_dir: Directory containing .emlx files + **load_kwargs: + max_count (int): Maximum amount of messages to read. + """ + docs: List[Document] = [] + max_count = load_kwargs.get('max_count', 1000) + count = 0 + + # Walk through the directory recursively + for dirpath, dirnames, filenames in os.walk(input_dir): + # Skip hidden directories + dirnames[:] = [d for d in dirnames if not d.startswith(".")] + + for filename in filenames: + if count >= max_count: + break + + if filename.endswith(".emlx"): + filepath = os.path.join(dirpath, filename) + try: + # Read the .emlx file + with open(filepath, 'r', encoding='utf-8', errors='ignore') as f: + content = f.read() + + # .emlx files have a length prefix followed by the email content + # The first line contains the length, followed by the email + lines = content.split('\n', 1) + if len(lines) >= 2: + email_content = lines[1] + + # Parse the email using Python's email module + try: + msg = email.message_from_string(email_content) + + # Extract email metadata + subject = msg.get('Subject', 'No Subject') + from_addr = msg.get('From', 'Unknown') + to_addr = msg.get('To', 'Unknown') + date = msg.get('Date', 'Unknown') + + # Extract email body + body = "" + if msg.is_multipart(): + for part in msg.walk(): + if part.get_content_type() == "text/plain" or part.get_content_type() == "text/html": + body += part.get_payload(decode=True).decode('utf-8', errors='ignore') + # break + else: + body = msg.get_payload(decode=True).decode('utf-8', errors='ignore') + + # Create document content + doc_content = f""" +From: {from_addr} +To: {to_addr} +Subject: {subject} +Date: {date} + +{body} +""" + + # Create metadata + metadata = { + 'file_path': filepath, + 'subject': subject, + 'from': from_addr, + 'to': to_addr, + 'date': date, + 'filename': filename + } + if count == 0: + print("--------------------------------") + print('dir path', dirpath) + print(metadata) + print(doc_content) + print("--------------------------------") + body=[] + if msg.is_multipart(): + for part in msg.walk(): + print("-------------------------------- get content type -------------------------------") + print(part.get_content_type()) + print(part) + # body.append(part.get_payload(decode=True).decode('utf-8', errors='ignore')) + print("-------------------------------- get content type -------------------------------") + else: + body = msg.get_payload(decode=True).decode('utf-8', errors='ignore') + print(body) + + print(body) + print("--------------------------------") + doc = Document(text=doc_content, metadata=metadata) + docs.append(doc) + count += 1 + + except Exception as e: + print(f"!!!!!!! Error parsing email from {filepath}: {e} !!!!!!!!") + continue + + except Exception as e: + print(f"!!!!!!! Error reading file !!!!!!!! {filepath}: {e}") + continue + + print(f"Loaded {len(docs)} email documents") + return docs + +# Use the custom EmlxReader instead of MboxReader +documents = EmlxReader().load_data( + "/Users/yichuan/Library/Mail/V10/0FCA0879-FD8C-4B7E-83BF-FDDA930791C5/[Gmail].mbox/All Mail.mbox/78BA5BE1-8819-4F9A-9613-EB63772F1DD0/Data/9/Messages", + max_count=1000 +) # Returns list of documents + +# Configure the index with larger chunk size to handle long metadata +from llama_index.core.node_parser import SentenceSplitter + +# Create a custom text splitter with larger chunk size +text_splitter = SentenceSplitter(chunk_size=2048, chunk_overlap=200) + +index = VectorStoreIndex.from_documents( + documents, + transformations=[text_splitter] +) # Initialize index with documents + +query_engine = index.as_query_engine() +res = query_engine.query("Hows Berkeley Graduate Student Instructor") +print(res) \ No newline at end of file diff --git a/test/mail_reader_save_load.py b/test/mail_reader_save_load.py new file mode 100644 index 0000000..60329b5 --- /dev/null +++ b/test/mail_reader_save_load.py @@ -0,0 +1,213 @@ +import os +import email +from pathlib import Path +from typing import List, Any +from llama_index.core import VectorStoreIndex, Document, StorageContext +from llama_index.core.readers.base import BaseReader +from llama_index.core.node_parser import SentenceSplitter + +class EmlxReader(BaseReader): + """ + Apple Mail .emlx file reader. + + Reads individual .emlx files from Apple Mail's storage format. + """ + + def __init__(self) -> None: + """Initialize.""" + pass + + def load_data(self, input_dir: str, **load_kwargs: Any) -> List[Document]: + """ + Load data from the input directory containing .emlx files. + + Args: + input_dir: Directory containing .emlx files + **load_kwargs: + max_count (int): Maximum amount of messages to read. + """ + docs: List[Document] = [] + max_count = load_kwargs.get('max_count', 1000) + count = 0 + + # Walk through the directory recursively + for dirpath, dirnames, filenames in os.walk(input_dir): + # Skip hidden directories + dirnames[:] = [d for d in dirnames if not d.startswith(".")] + + for filename in filenames: + if count >= max_count: + break + + if filename.endswith(".emlx"): + filepath = os.path.join(dirpath, filename) + try: + # Read the .emlx file + with open(filepath, 'r', encoding='utf-8', errors='ignore') as f: + content = f.read() + + # .emlx files have a length prefix followed by the email content + # The first line contains the length, followed by the email + lines = content.split('\n', 1) + if len(lines) >= 2: + email_content = lines[1] + + # Parse the email using Python's email module + try: + msg = email.message_from_string(email_content) + + # Extract email metadata + subject = msg.get('Subject', 'No Subject') + from_addr = msg.get('From', 'Unknown') + to_addr = msg.get('To', 'Unknown') + date = msg.get('Date', 'Unknown') + + # Extract email body + body = "" + if msg.is_multipart(): + for part in msg.walk(): + if part.get_content_type() == "text/plain": + body = part.get_payload(decode=True).decode('utf-8', errors='ignore') + break + else: + body = msg.get_payload(decode=True).decode('utf-8', errors='ignore') + + # Create document content + doc_content = f""" +From: {from_addr} +To: {to_addr} +Subject: {subject} +Date: {date} + +{body} +""" + + # Create metadata + metadata = { + 'file_path': filepath, + 'subject': subject, + 'from': from_addr, + 'to': to_addr, + 'date': date, + 'filename': filename + } + + doc = Document(text=doc_content, metadata=metadata) + docs.append(doc) + count += 1 + + except Exception as e: + print(f"Error parsing email from {filepath}: {e}") + continue + + except Exception as e: + print(f"Error reading file {filepath}: {e}") + continue + + print(f"Loaded {len(docs)} email documents") + return docs + +def create_and_save_index(mail_path: str, save_dir: str = "mail_index", max_count: int = 1000): + """ + Create the index from mail data and save it to disk. + + Args: + mail_path: Path to the mail directory + save_dir: Directory to save the index + max_count: Maximum number of emails to process + """ + print("Creating index from mail data...") + + # Load documents + documents = EmlxReader().load_data(mail_path, max_count=max_count) + + if not documents: + print("No documents loaded. Exiting.") + return None + + # Create text splitter + text_splitter = SentenceSplitter(chunk_size=256, chunk_overlap=0) + + # Create index + index = VectorStoreIndex.from_documents( + documents, + transformations=[text_splitter] + ) + + # Save the index + os.makedirs(save_dir, exist_ok=True) + index.storage_context.persist(persist_dir=save_dir) + print(f"Index saved to {save_dir}") + + return index + +def load_index(save_dir: str = "mail_index"): + """ + Load the saved index from disk. + + Args: + save_dir: Directory where the index is saved + + Returns: + Loaded index or None if loading fails + """ + try: + # Load storage context + storage_context = StorageContext.from_defaults(persist_dir=save_dir) + + # Load index + index = VectorStoreIndex.from_vector_store( + storage_context.vector_store, + storage_context=storage_context + ) + + print(f"Index loaded from {save_dir}") + return index + + except Exception as e: + print(f"Error loading index: {e}") + return None + +def query_index(index, query: str): + """ + Query the loaded index. + + Args: + index: The loaded index + query: The query string + """ + if index is None: + print("No index available for querying.") + return + + query_engine = index.as_query_engine() + response = query_engine.query(query) + print(f"Query: {query}") + print(f"Response: {response}") + +def main(): + mail_path = "/Users/yichuan/Library/Mail/V10/0FCA0879-FD8C-4B7E-83BF-FDDA930791C5/[Gmail].mbox/All Mail.mbox/78BA5BE1-8819-4F9A-9613-EB63772F1DD0/Data/9/Messages" + save_dir = "mail_index" + + # Check if index already exists + if os.path.exists(save_dir) and os.path.exists(os.path.join(save_dir, "vector_store.json")): + print("Loading existing index...") + index = load_index(save_dir) + else: + print("Creating new index...") + index = create_and_save_index(mail_path, save_dir, max_count=1000) + + if index: + # Example queries + queries = [ + "Hows Berkeley Graduate Student Instructor", + "What emails mention GSR appointments?", + "Find emails about deadlines" + ] + + for query in queries: + print("\n" + "="*50) + query_index(index, query) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/test/mail_reader_small_chunks.py b/test/mail_reader_small_chunks.py new file mode 100644 index 0000000..024a1d1 --- /dev/null +++ b/test/mail_reader_small_chunks.py @@ -0,0 +1,211 @@ +import os +import email +from pathlib import Path +from typing import List, Any +from llama_index.core import VectorStoreIndex, Document, StorageContext +from llama_index.core.readers.base import BaseReader +from llama_index.core.node_parser import SentenceSplitter + +class EmlxReader(BaseReader): + """ + Apple Mail .emlx file reader with reduced metadata. + + Reads individual .emlx files from Apple Mail's storage format. + """ + + def __init__(self) -> None: + """Initialize.""" + pass + + def load_data(self, input_dir: str, **load_kwargs: Any) -> List[Document]: + """ + Load data from the input directory containing .emlx files. + + Args: + input_dir: Directory containing .emlx files + **load_kwargs: + max_count (int): Maximum amount of messages to read. + """ + docs: List[Document] = [] + max_count = load_kwargs.get('max_count', 1000) + count = 0 + + # Walk through the directory recursively + for dirpath, dirnames, filenames in os.walk(input_dir): + # Skip hidden directories + dirnames[:] = [d for d in dirnames if not d.startswith(".")] + + for filename in filenames: + if count >= max_count: + break + + if filename.endswith(".emlx"): + filepath = os.path.join(dirpath, filename) + try: + # Read the .emlx file + with open(filepath, 'r', encoding='utf-8', errors='ignore') as f: + content = f.read() + + # .emlx files have a length prefix followed by the email content + # The first line contains the length, followed by the email + lines = content.split('\n', 1) + if len(lines) >= 2: + email_content = lines[1] + + # Parse the email using Python's email module + try: + msg = email.message_from_string(email_content) + + # Extract email metadata + subject = msg.get('Subject', 'No Subject') + from_addr = msg.get('From', 'Unknown') + to_addr = msg.get('To', 'Unknown') + date = msg.get('Date', 'Unknown') + + # Extract email body + body = "" + if msg.is_multipart(): + for part in msg.walk(): + if part.get_content_type() == "text/plain": + body = part.get_payload(decode=True).decode('utf-8', errors='ignore') + break + else: + body = msg.get_payload(decode=True).decode('utf-8', errors='ignore') + + # Create document content with metadata embedded in text + doc_content = f""" +From: {from_addr} +To: {to_addr} +Subject: {subject} +Date: {date} + +{body} +""" + + # Create minimal metadata (only essential info) + metadata = { + 'subject': subject[:50], # Truncate subject + 'from': from_addr[:30], # Truncate from + 'date': date[:20], # Truncate date + 'filename': filename # Keep filename + } + + doc = Document(text=doc_content, metadata=metadata) + docs.append(doc) + count += 1 + + except Exception as e: + print(f"Error parsing email from {filepath}: {e}") + continue + + except Exception as e: + print(f"Error reading file {filepath}: {e}") + continue + + print(f"Loaded {len(docs)} email documents") + return docs + +def create_and_save_index(mail_path: str, save_dir: str = "mail_index_small", max_count: int = 1000): + """ + Create the index from mail data and save it to disk. + + Args: + mail_path: Path to the mail directory + save_dir: Directory to save the index + max_count: Maximum number of emails to process + """ + print("Creating index from mail data with small chunks...") + + # Load documents + documents = EmlxReader().load_data(mail_path, max_count=max_count) + + if not documents: + print("No documents loaded. Exiting.") + return None + + # Create text splitter with small chunk size + text_splitter = SentenceSplitter(chunk_size=512, chunk_overlap=50) + + # Create index + index = VectorStoreIndex.from_documents( + documents, + transformations=[text_splitter] + ) + + # Save the index + os.makedirs(save_dir, exist_ok=True) + index.storage_context.persist(persist_dir=save_dir) + print(f"Index saved to {save_dir}") + + return index + +def load_index(save_dir: str = "mail_index_small"): + """ + Load the saved index from disk. + + Args: + save_dir: Directory where the index is saved + + Returns: + Loaded index or None if loading fails + """ + try: + # Load storage context + storage_context = StorageContext.from_defaults(persist_dir=save_dir) + + # Load index + index = VectorStoreIndex.from_vector_store( + storage_context.vector_store, + storage_context=storage_context + ) + + print(f"Index loaded from {save_dir}") + return index + + except Exception as e: + print(f"Error loading index: {e}") + return None + +def query_index(index, query: str): + """ + Query the loaded index. + + Args: + index: The loaded index + query: The query string + """ + if index is None: + print("No index available for querying.") + return + + query_engine = index.as_query_engine() + response = query_engine.query(query) + print(f"Query: {query}") + print(f"Response: {response}") + +def main(): + mail_path = "/Users/yichuan/Library/Mail/V10/0FCA0879-FD8C-4B7E-83BF-FDDA930791C5/[Gmail].mbox/All Mail.mbox/78BA5BE1-8819-4F9A-9613-EB63772F1DD0/Data/9/Messages" + save_dir = "mail_index_small" + + # Check if index already exists + if os.path.exists(save_dir) and os.path.exists(os.path.join(save_dir, "vector_store.json")): + print("Loading existing index...") + index = load_index(save_dir) + else: + print("Creating new index...") + index = create_and_save_index(mail_path, save_dir, max_count=1000) + + if index: + # Example queries + queries = [ + "Hows Berkeley Graduate Student Instructor", + "What emails mention GSR appointments?", + "Find emails about deadlines" + ] + + for query in queries: + print("\n" + "="*50) + query_index(index, query) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/test/mail_reader_test.py b/test/mail_reader_test.py new file mode 100644 index 0000000..9dfd6b6 --- /dev/null +++ b/test/mail_reader_test.py @@ -0,0 +1,147 @@ +import os +import email +from pathlib import Path +from typing import List, Any +from llama_index.core import VectorStoreIndex, Document +from llama_index.core.readers.base import BaseReader + +class EmlxReader(BaseReader): + """ + Apple Mail .emlx file reader. + + Reads individual .emlx files from Apple Mail's storage format. + """ + + def __init__(self) -> None: + """Initialize.""" + pass + + def load_data(self, input_dir: str, **load_kwargs: Any) -> List[Document]: + """ + Load data from the input directory containing .emlx files. + + Args: + input_dir: Directory containing .emlx files + **load_kwargs: + max_count (int): Maximum amount of messages to read. + """ + docs: List[Document] = [] + max_count = load_kwargs.get('max_count', 1000) + count = 0 + + # Check if directory exists and is accessible + if not os.path.exists(input_dir): + print(f"Error: Directory '{input_dir}' does not exist") + return docs + + if not os.access(input_dir, os.R_OK): + print(f"Error: Directory '{input_dir}' is not accessible (permission denied)") + print("This is likely due to macOS security restrictions on Mail app data") + return docs + + print(f"Scanning directory: {input_dir}") + + # Walk through the directory recursively + for dirpath, dirnames, filenames in os.walk(input_dir): + # Skip hidden directories + dirnames[:] = [d for d in dirnames if not d.startswith(".")] + + for filename in filenames: + if count >= max_count: + break + + if filename.endswith(".emlx"): + filepath = os.path.join(dirpath, filename) + print(f"Found .emlx file: {filepath}") + try: + # Read the .emlx file + with open(filepath, 'r', encoding='utf-8', errors='ignore') as f: + content = f.read() + + # .emlx files have a length prefix followed by the email content + # The first line contains the length, followed by the email + lines = content.split('\n', 1) + if len(lines) >= 2: + email_content = lines[1] + + # Parse the email using Python's email module + try: + msg = email.message_from_string(email_content) + + # Extract email metadata + subject = msg.get('Subject', 'No Subject') + from_addr = msg.get('From', 'Unknown') + to_addr = msg.get('To', 'Unknown') + date = msg.get('Date', 'Unknown') + + # Extract email body + body = "" + if msg.is_multipart(): + for part in msg.walk(): + if part.get_content_type() == "text/plain": + body = part.get_payload(decode=True).decode('utf-8', errors='ignore') + break + else: + body = msg.get_payload(decode=True).decode('utf-8', errors='ignore') + + # Create document content + doc_content = f""" +From: {from_addr} +To: {to_addr} +Subject: {subject} +Date: {date} + +{body} +""" + + # Create metadata + metadata = { + 'file_path': filepath, + 'subject': subject, + 'from': from_addr, + 'to': to_addr, + 'date': date, + 'filename': filename + } + + doc = Document(text=doc_content, metadata=metadata) + docs.append(doc) + count += 1 + + except Exception as e: + print(f"Error parsing email from {filepath}: {e}") + continue + + except Exception as e: + print(f"Error reading file {filepath}: {e}") + continue + + print(f"Loaded {len(docs)} email documents") + return docs + +def main(): + # Use the current directory where the sample.emlx file is located + current_dir = os.path.dirname(os.path.abspath(__file__)) + + print("Testing EmlxReader with sample .emlx file...") + print(f"Scanning directory: {current_dir}") + + # Use the custom EmlxReader + documents = EmlxReader().load_data(current_dir, max_count=1000) + + if not documents: + print("No documents loaded. Make sure sample.emlx exists in the examples directory.") + return + + print(f"\nSuccessfully loaded {len(documents)} document(s)") + + # Initialize index with documents + index = VectorStoreIndex.from_documents(documents) + query_engine = index.as_query_engine() + + print("\nTesting query: 'Hows Berkeley Graduate Student Instructor'") + res = query_engine.query("Hows Berkeley Graduate Student Instructor") + print(f"Response: {res}") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/test/query_saved_index.py b/test/query_saved_index.py new file mode 100644 index 0000000..ac3989d --- /dev/null +++ b/test/query_saved_index.py @@ -0,0 +1,99 @@ +import os +from llama_index.core import VectorStoreIndex, StorageContext + +def load_index(save_dir: str = "mail_index"): + """ + Load the saved index from disk. + + Args: + save_dir: Directory where the index is saved + + Returns: + Loaded index or None if loading fails + """ + try: + # Load storage context + storage_context = StorageContext.from_defaults(persist_dir=save_dir) + + # Load index + index = VectorStoreIndex.from_vector_store( + storage_context.vector_store, + storage_context=storage_context + ) + + print(f"Index loaded from {save_dir}") + return index + + except Exception as e: + print(f"Error loading index: {e}") + return None + +def query_index(index, query: str): + """ + Query the loaded index. + + Args: + index: The loaded index + query: The query string + """ + if index is None: + print("No index available for querying.") + return + + query_engine = index.as_query_engine() + response = query_engine.query(query) + print(f"\nQuery: {query}") + print(f"Response: {response}") + +def main(): + save_dir = "mail_index" + + # Check if index exists + if not os.path.exists(save_dir) or not os.path.exists(os.path.join(save_dir, "vector_store.json")): + print(f"Index not found in {save_dir}") + print("Please run mail_reader_save_load.py first to create the index.") + return + + # Load the index + index = load_index(save_dir) + + if not index: + print("Failed to load index.") + return + + print("\n" + "="*60) + print("Email Query Interface") + print("="*60) + print("Type 'quit' to exit") + print("Type 'help' for example queries") + print("="*60) + + # Interactive query loop + while True: + try: + query = input("\nEnter your query: ").strip() + + if query.lower() == 'quit': + print("Goodbye!") + break + elif query.lower() == 'help': + print("\nExample queries:") + print("- Hows Berkeley Graduate Student Instructor") + print("- What emails mention GSR appointments?") + print("- Find emails about deadlines") + print("- Search for emails from specific sender") + print("- Find emails about meetings") + continue + elif not query: + continue + + query_index(index, query) + + except KeyboardInterrupt: + print("\nGoodbye!") + break + except Exception as e: + print(f"Error processing query: {e}") + +if __name__ == "__main__": + main() \ No newline at end of file