align the llamaindex result w leann& test attachment

This commit is contained in:
yichuan520030910320
2025-07-09 21:42:15 -07:00
parent 558126c46e
commit 8a961f8ab3
2 changed files with 24 additions and 52 deletions

View File

@@ -100,7 +100,7 @@ async def query_leann_index(index_path: str, query: str):
async def main(): async def main():
mail_path = "/Users/yichuan/Library/Mail/V10/0FCA0879-FD8C-4B7E-83BF-FDDA930791C5/[Gmail].mbox/All Mail.mbox/78BA5BE1-8819-4F9A-9613-EB63772F1DD0/Data/9/Messages" mail_path = "/Users/yichuan/Library/Mail/V10/0FCA0879-FD8C-4B7E-83BF-FDDA930791C5/[Gmail].mbox/All Mail.mbox/78BA5BE1-8819-4F9A-9613-EB63772F1DD0/Data/9/Messages"
INDEX_DIR = Path("./mail_index_leann") INDEX_DIR = Path("./mail_index_leann_raw_text")
INDEX_PATH = str(INDEX_DIR / "mail_documents.leann") INDEX_PATH = str(INDEX_DIR / "mail_documents.leann")
# Create or load the LEANN index # Create or load the LEANN index
@@ -110,7 +110,9 @@ async def main():
# Example queries # Example queries
queries = [ queries = [
"Hows Berkeley Graduate Student Instructor", "Hows Berkeley Graduate Student Instructor",
"how's the icloud related advertisement saying" "how's the icloud related advertisement saying",
"Whats the number of class recommend to take per semester for incoming EECS students"
] ]
for query in queries: for query in queries:

View File

@@ -6,6 +6,12 @@ from llama_index.core import VectorStoreIndex, Document, StorageContext
from llama_index.core.readers.base import BaseReader from llama_index.core.readers.base import BaseReader
from llama_index.core.node_parser import SentenceSplitter from llama_index.core.node_parser import SentenceSplitter
# --- EMBEDDING MODEL ---
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
import torch
# --- END EMBEDDING MODEL ---
class EmlxReader(BaseReader): class EmlxReader(BaseReader):
""" """
Apple Mail .emlx file reader with embedded metadata. Apple Mail .emlx file reader with embedded metadata.
@@ -67,6 +73,8 @@ class EmlxReader(BaseReader):
if msg.is_multipart(): if msg.is_multipart():
for part in msg.walk(): for part in msg.walk():
if part.get_content_type() == "text/plain" or part.get_content_type() == "text/html": if part.get_content_type() == "text/plain" or part.get_content_type() == "text/html":
if part.get_content_type() == "text/html":
continue
body += part.get_payload(decode=True).decode('utf-8', errors='ignore') body += part.get_payload(decode=True).decode('utf-8', errors='ignore')
# break # break
else: else:
@@ -102,83 +110,50 @@ Date: {date}
return docs return docs
def create_and_save_index(mail_path: str, save_dir: str = "mail_index_embedded", max_count: int = 1000): def create_and_save_index(mail_path: str, save_dir: str = "mail_index_embedded", max_count: int = 1000):
"""
Create the index from mail data and save it to disk.
Args:
mail_path: Path to the mail directory
save_dir: Directory to save the index
max_count: Maximum number of emails to process
"""
print("Creating index from mail data with embedded metadata...") print("Creating index from mail data with embedded metadata...")
# Load documents
documents = EmlxReader().load_data(mail_path, max_count=max_count) documents = EmlxReader().load_data(mail_path, max_count=max_count)
if not documents: if not documents:
print("No documents loaded. Exiting.") print("No documents loaded. Exiting.")
return None return None
# Create text splitter with small chunk size (no metadata constraints)
text_splitter = SentenceSplitter(chunk_size=256, chunk_overlap=25) text_splitter = SentenceSplitter(chunk_size=256, chunk_overlap=25)
# Use facebook/contriever as the embedder
# Create index with Facebook Contriever embedding model
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
embed_model = HuggingFaceEmbedding(model_name="facebook/contriever") embed_model = HuggingFaceEmbedding(model_name="facebook/contriever")
# set on device
import torch
if torch.cuda.is_available():
embed_model._model.to("cuda")
# set mps
elif torch.backends.mps.is_available():
embed_model._model.to("mps")
else:
embed_model._model.to("cpu")
index = VectorStoreIndex.from_documents( index = VectorStoreIndex.from_documents(
documents, documents,
transformations=[text_splitter], transformations=[text_splitter],
embed_model=embed_model embed_model=embed_model
) )
# Save the index
os.makedirs(save_dir, exist_ok=True) os.makedirs(save_dir, exist_ok=True)
index.storage_context.persist(persist_dir=save_dir) index.storage_context.persist(persist_dir=save_dir)
print(f"Index saved to {save_dir}") print(f"Index saved to {save_dir}")
return index return index
def load_index(save_dir: str = "mail_index_embedded"): def load_index(save_dir: str = "mail_index_embedded"):
"""
Load the saved index from disk.
Args:
save_dir: Directory where the index is saved
Returns:
Loaded index or None if loading fails
"""
try: try:
# Load storage context
storage_context = StorageContext.from_defaults(persist_dir=save_dir) storage_context = StorageContext.from_defaults(persist_dir=save_dir)
# Load index
index = VectorStoreIndex.from_vector_store( index = VectorStoreIndex.from_vector_store(
storage_context.vector_store, storage_context.vector_store,
storage_context=storage_context storage_context=storage_context
) )
print(f"Index loaded from {save_dir}") print(f"Index loaded from {save_dir}")
return index return index
except Exception as e: except Exception as e:
print(f"Error loading index: {e}") print(f"Error loading index: {e}")
return None return None
def query_index(index, query: str): def query_index(index, query: str):
"""
Query the loaded index.
Args:
index: The loaded index
query: The query string
"""
if index is None: if index is None:
print("No index available for querying.") print("No index available for querying.")
return return
query_engine = index.as_query_engine() query_engine = index.as_query_engine()
response = query_engine.query(query) response = query_engine.query(query)
print(f"Query: {query}") print(f"Query: {query}")
@@ -187,23 +162,18 @@ def query_index(index, query: str):
def main(): def main():
mail_path = "/Users/yichuan/Library/Mail/V10/0FCA0879-FD8C-4B7E-83BF-FDDA930791C5/[Gmail].mbox/All Mail.mbox/78BA5BE1-8819-4F9A-9613-EB63772F1DD0/Data/9/Messages" mail_path = "/Users/yichuan/Library/Mail/V10/0FCA0879-FD8C-4B7E-83BF-FDDA930791C5/[Gmail].mbox/All Mail.mbox/78BA5BE1-8819-4F9A-9613-EB63772F1DD0/Data/9/Messages"
save_dir = "mail_index_embedded" save_dir = "mail_index_embedded"
# Check if index already exists
if os.path.exists(save_dir) and os.path.exists(os.path.join(save_dir, "vector_store.json")): if os.path.exists(save_dir) and os.path.exists(os.path.join(save_dir, "vector_store.json")):
print("Loading existing index...") print("Loading existing index...")
index = load_index(save_dir) index = load_index(save_dir)
else: else:
print("Creating new index...") print("Creating new index...")
index = create_and_save_index(mail_path, save_dir, max_count=10000) index = create_and_save_index(mail_path, save_dir, max_count=10000)
if index: if index:
# Example queries
queries = [ queries = [
"Hows Berkeley Graduate Student Instructor", "Hows Berkeley Graduate Student Instructor",
"What emails mention GSR appointments?", "how's the icloud related advertisement saying"
"Find emails about deadlines" "Whats the number of class recommend to take per semester for incoming EECS students"
] ]
for query in queries: for query in queries:
print("\n" + "="*50) print("\n" + "="*50)
query_index(index, query) query_index(index, query)