align the llamaindex result w leann& test attachment
This commit is contained in:
@@ -100,7 +100,7 @@ async def query_leann_index(index_path: str, query: str):
|
|||||||
async def main():
|
async def main():
|
||||||
mail_path = "/Users/yichuan/Library/Mail/V10/0FCA0879-FD8C-4B7E-83BF-FDDA930791C5/[Gmail].mbox/All Mail.mbox/78BA5BE1-8819-4F9A-9613-EB63772F1DD0/Data/9/Messages"
|
mail_path = "/Users/yichuan/Library/Mail/V10/0FCA0879-FD8C-4B7E-83BF-FDDA930791C5/[Gmail].mbox/All Mail.mbox/78BA5BE1-8819-4F9A-9613-EB63772F1DD0/Data/9/Messages"
|
||||||
|
|
||||||
INDEX_DIR = Path("./mail_index_leann")
|
INDEX_DIR = Path("./mail_index_leann_raw_text")
|
||||||
INDEX_PATH = str(INDEX_DIR / "mail_documents.leann")
|
INDEX_PATH = str(INDEX_DIR / "mail_documents.leann")
|
||||||
|
|
||||||
# Create or load the LEANN index
|
# Create or load the LEANN index
|
||||||
@@ -110,7 +110,9 @@ async def main():
|
|||||||
# Example queries
|
# Example queries
|
||||||
queries = [
|
queries = [
|
||||||
"Hows Berkeley Graduate Student Instructor",
|
"Hows Berkeley Graduate Student Instructor",
|
||||||
"how's the icloud related advertisement saying"
|
"how's the icloud related advertisement saying",
|
||||||
|
"Whats the number of class recommend to take per semester for incoming EECS students"
|
||||||
|
|
||||||
]
|
]
|
||||||
|
|
||||||
for query in queries:
|
for query in queries:
|
||||||
|
|||||||
@@ -6,6 +6,12 @@ from llama_index.core import VectorStoreIndex, Document, StorageContext
|
|||||||
from llama_index.core.readers.base import BaseReader
|
from llama_index.core.readers.base import BaseReader
|
||||||
from llama_index.core.node_parser import SentenceSplitter
|
from llama_index.core.node_parser import SentenceSplitter
|
||||||
|
|
||||||
|
# --- EMBEDDING MODEL ---
|
||||||
|
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
|
||||||
|
import torch
|
||||||
|
|
||||||
|
# --- END EMBEDDING MODEL ---
|
||||||
|
|
||||||
class EmlxReader(BaseReader):
|
class EmlxReader(BaseReader):
|
||||||
"""
|
"""
|
||||||
Apple Mail .emlx file reader with embedded metadata.
|
Apple Mail .emlx file reader with embedded metadata.
|
||||||
@@ -67,6 +73,8 @@ class EmlxReader(BaseReader):
|
|||||||
if msg.is_multipart():
|
if msg.is_multipart():
|
||||||
for part in msg.walk():
|
for part in msg.walk():
|
||||||
if part.get_content_type() == "text/plain" or part.get_content_type() == "text/html":
|
if part.get_content_type() == "text/plain" or part.get_content_type() == "text/html":
|
||||||
|
if part.get_content_type() == "text/html":
|
||||||
|
continue
|
||||||
body += part.get_payload(decode=True).decode('utf-8', errors='ignore')
|
body += part.get_payload(decode=True).decode('utf-8', errors='ignore')
|
||||||
# break
|
# break
|
||||||
else:
|
else:
|
||||||
@@ -102,83 +110,50 @@ Date: {date}
|
|||||||
return docs
|
return docs
|
||||||
|
|
||||||
def create_and_save_index(mail_path: str, save_dir: str = "mail_index_embedded", max_count: int = 1000):
|
def create_and_save_index(mail_path: str, save_dir: str = "mail_index_embedded", max_count: int = 1000):
|
||||||
"""
|
|
||||||
Create the index from mail data and save it to disk.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
mail_path: Path to the mail directory
|
|
||||||
save_dir: Directory to save the index
|
|
||||||
max_count: Maximum number of emails to process
|
|
||||||
"""
|
|
||||||
print("Creating index from mail data with embedded metadata...")
|
print("Creating index from mail data with embedded metadata...")
|
||||||
|
|
||||||
# Load documents
|
|
||||||
documents = EmlxReader().load_data(mail_path, max_count=max_count)
|
documents = EmlxReader().load_data(mail_path, max_count=max_count)
|
||||||
|
|
||||||
if not documents:
|
if not documents:
|
||||||
print("No documents loaded. Exiting.")
|
print("No documents loaded. Exiting.")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
# Create text splitter with small chunk size (no metadata constraints)
|
|
||||||
text_splitter = SentenceSplitter(chunk_size=256, chunk_overlap=25)
|
text_splitter = SentenceSplitter(chunk_size=256, chunk_overlap=25)
|
||||||
|
# Use facebook/contriever as the embedder
|
||||||
# Create index with Facebook Contriever embedding model
|
|
||||||
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
|
|
||||||
|
|
||||||
embed_model = HuggingFaceEmbedding(model_name="facebook/contriever")
|
embed_model = HuggingFaceEmbedding(model_name="facebook/contriever")
|
||||||
|
# set on device
|
||||||
|
import torch
|
||||||
|
if torch.cuda.is_available():
|
||||||
|
embed_model._model.to("cuda")
|
||||||
|
# set mps
|
||||||
|
elif torch.backends.mps.is_available():
|
||||||
|
embed_model._model.to("mps")
|
||||||
|
else:
|
||||||
|
embed_model._model.to("cpu")
|
||||||
index = VectorStoreIndex.from_documents(
|
index = VectorStoreIndex.from_documents(
|
||||||
documents,
|
documents,
|
||||||
transformations=[text_splitter],
|
transformations=[text_splitter],
|
||||||
embed_model=embed_model
|
embed_model=embed_model
|
||||||
)
|
)
|
||||||
|
|
||||||
# Save the index
|
|
||||||
os.makedirs(save_dir, exist_ok=True)
|
os.makedirs(save_dir, exist_ok=True)
|
||||||
index.storage_context.persist(persist_dir=save_dir)
|
index.storage_context.persist(persist_dir=save_dir)
|
||||||
print(f"Index saved to {save_dir}")
|
print(f"Index saved to {save_dir}")
|
||||||
|
|
||||||
return index
|
return index
|
||||||
|
|
||||||
def load_index(save_dir: str = "mail_index_embedded"):
|
def load_index(save_dir: str = "mail_index_embedded"):
|
||||||
"""
|
|
||||||
Load the saved index from disk.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
save_dir: Directory where the index is saved
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Loaded index or None if loading fails
|
|
||||||
"""
|
|
||||||
try:
|
try:
|
||||||
# Load storage context
|
|
||||||
storage_context = StorageContext.from_defaults(persist_dir=save_dir)
|
storage_context = StorageContext.from_defaults(persist_dir=save_dir)
|
||||||
|
|
||||||
# Load index
|
|
||||||
index = VectorStoreIndex.from_vector_store(
|
index = VectorStoreIndex.from_vector_store(
|
||||||
storage_context.vector_store,
|
storage_context.vector_store,
|
||||||
storage_context=storage_context
|
storage_context=storage_context
|
||||||
)
|
)
|
||||||
|
|
||||||
print(f"Index loaded from {save_dir}")
|
print(f"Index loaded from {save_dir}")
|
||||||
return index
|
return index
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Error loading index: {e}")
|
print(f"Error loading index: {e}")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def query_index(index, query: str):
|
def query_index(index, query: str):
|
||||||
"""
|
|
||||||
Query the loaded index.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
index: The loaded index
|
|
||||||
query: The query string
|
|
||||||
"""
|
|
||||||
if index is None:
|
if index is None:
|
||||||
print("No index available for querying.")
|
print("No index available for querying.")
|
||||||
return
|
return
|
||||||
|
|
||||||
query_engine = index.as_query_engine()
|
query_engine = index.as_query_engine()
|
||||||
response = query_engine.query(query)
|
response = query_engine.query(query)
|
||||||
print(f"Query: {query}")
|
print(f"Query: {query}")
|
||||||
@@ -187,23 +162,18 @@ def query_index(index, query: str):
|
|||||||
def main():
|
def main():
|
||||||
mail_path = "/Users/yichuan/Library/Mail/V10/0FCA0879-FD8C-4B7E-83BF-FDDA930791C5/[Gmail].mbox/All Mail.mbox/78BA5BE1-8819-4F9A-9613-EB63772F1DD0/Data/9/Messages"
|
mail_path = "/Users/yichuan/Library/Mail/V10/0FCA0879-FD8C-4B7E-83BF-FDDA930791C5/[Gmail].mbox/All Mail.mbox/78BA5BE1-8819-4F9A-9613-EB63772F1DD0/Data/9/Messages"
|
||||||
save_dir = "mail_index_embedded"
|
save_dir = "mail_index_embedded"
|
||||||
|
|
||||||
# Check if index already exists
|
|
||||||
if os.path.exists(save_dir) and os.path.exists(os.path.join(save_dir, "vector_store.json")):
|
if os.path.exists(save_dir) and os.path.exists(os.path.join(save_dir, "vector_store.json")):
|
||||||
print("Loading existing index...")
|
print("Loading existing index...")
|
||||||
index = load_index(save_dir)
|
index = load_index(save_dir)
|
||||||
else:
|
else:
|
||||||
print("Creating new index...")
|
print("Creating new index...")
|
||||||
index = create_and_save_index(mail_path, save_dir, max_count=10000)
|
index = create_and_save_index(mail_path, save_dir, max_count=10000)
|
||||||
|
|
||||||
if index:
|
if index:
|
||||||
# Example queries
|
|
||||||
queries = [
|
queries = [
|
||||||
"Hows Berkeley Graduate Student Instructor",
|
"Hows Berkeley Graduate Student Instructor",
|
||||||
"What emails mention GSR appointments?",
|
"how's the icloud related advertisement saying"
|
||||||
"Find emails about deadlines"
|
"Whats the number of class recommend to take per semester for incoming EECS students"
|
||||||
]
|
]
|
||||||
|
|
||||||
for query in queries:
|
for query in queries:
|
||||||
print("\n" + "="*50)
|
print("\n" + "="*50)
|
||||||
query_index(index, query)
|
query_index(index, query)
|
||||||
|
|||||||
Reference in New Issue
Block a user