make the email wonderful format

This commit is contained in:
yichuan520030910320
2025-07-22 21:41:58 -07:00
parent 9efcc6d95c
commit aa9a14a917
5 changed files with 28 additions and 17 deletions

View File

@@ -292,7 +292,7 @@ Once the index is built, you can ask questions like:
</details> </details>
Slack support coming soon! Stay tuned!
## 🖥️ Command Line Interface ## 🖥️ Command Line Interface
@@ -503,6 +503,17 @@ export NCCL_IB_DISABLE=1
export NCCL_NET_PLUGIN=none export NCCL_NET_PLUGIN=none
export NCCL_SOCKET_IFNAME=ens5 export NCCL_SOCKET_IFNAME=ens5
``` --> ``` -->
## FAQ
### 1. My index build time seems long
You can speed up the process by using a lightweight embedding model. Add this to your arguments:
```bash
--embedding-model sentence-transformers/all-MiniLM-L6-v2
```
**Model sizes:** `all-MiniLM-L6-v2` (30M parameters), `facebook/contriever` (~100M parameters), `Qwen3-0.6B` (600M parameters)
## 📈 Roadmap ## 📈 Roadmap

View File

@@ -96,14 +96,12 @@ class EmlxReader(BaseReader):
# Create document content with metadata embedded in text # Create document content with metadata embedded in text
doc_content = f""" doc_content = f"""
[EMAIL METADATA] [File]: {filename}
File: {filename} [From]: {from_addr}
From: {from_addr} [To]: {to_addr}
To: {to_addr} [Subject]: {subject}
Subject: {subject} [Date]: {date}
Date: {date} [EMAIL BODY Start]:
[END METADATA]
{body} {body}
""" """

View File

@@ -74,10 +74,10 @@ def create_leann_index_from_multiple_sources(messages_dirs: List[Path], index_pa
print("No documents loaded from any source. Exiting.") print("No documents loaded from any source. Exiting.")
return None return None
print(f"\nTotal loaded {len(all_documents)} email documents from {len(messages_dirs)} directories") print(f"\nTotal loaded {len(all_documents)} email documents from {len(messages_dirs)} directories and starting to split them into chunks")
# Create text splitter with 256 chunk size # Create text splitter with 256 chunk size
text_splitter = SentenceSplitter(chunk_size=256, chunk_overlap=25) text_splitter = SentenceSplitter(chunk_size=256, chunk_overlap=128)
# Convert Documents to text strings and chunk them # Convert Documents to text strings and chunk them
all_texts = [] all_texts = []
@@ -85,9 +85,11 @@ def create_leann_index_from_multiple_sources(messages_dirs: List[Path], index_pa
# Split the document into chunks # Split the document into chunks
nodes = text_splitter.get_nodes_from_documents([doc]) nodes = text_splitter.get_nodes_from_documents([doc])
for node in nodes: for node in nodes:
all_texts.append(node.get_content()) text = node.get_content()
# text = '[subject] ' + doc.metadata["subject"] + '\n' + text
all_texts.append(text)
print(f"Created {len(all_texts)} text chunks from {len(all_documents)} documents") print(f"Finished splitting {len(all_documents)} documents into {len(all_texts)} text chunks")
# Create LEANN index directory # Create LEANN index directory
@@ -231,7 +233,7 @@ async def main():
parser = argparse.ArgumentParser(description='LEANN Mail Reader - Create and query email index') parser = argparse.ArgumentParser(description='LEANN Mail Reader - Create and query email index')
# Remove --mail-path argument and auto-detect all Messages directories # Remove --mail-path argument and auto-detect all Messages directories
# Remove DEFAULT_MAIL_PATH # Remove DEFAULT_MAIL_PATH
parser.add_argument('--index-dir', type=str, default="./mail_index_leann_raw_text_all_dicts", parser.add_argument('--index-dir', type=str, default="./mail_index_leann_debug",
help='Directory to store the LEANN index (default: ./mail_index_leann_raw_text_all_dicts)') help='Directory to store the LEANN index (default: ./mail_index_leann_raw_text_all_dicts)')
parser.add_argument('--max-emails', type=int, default=1000, parser.add_argument('--max-emails', type=int, default=1000,
help='Maximum number of emails to process (-1 means all)') help='Maximum number of emails to process (-1 means all)')

View File

@@ -74,7 +74,7 @@ def create_leann_index_from_multiple_wechat_exports(
return None return None
print( print(
f"\nTotal loaded {len(all_documents)} chat documents from {len(export_dirs)} exports" f"\nTotal loaded {len(all_documents)} chat documents from {len(export_dirs)} exports and starting to split them into chunks"
) )
# Create text splitter with 256 chunk size # Create text splitter with 256 chunk size
@@ -90,7 +90,7 @@ def create_leann_index_from_multiple_wechat_exports(
all_texts.append(text) all_texts.append(text)
print( print(
f"Created {len(all_texts)} text chunks from {len(all_documents)} documents" f"Finished splitting {len(all_documents)} documents into {len(all_texts)} text chunks"
) )
# Create LEANN index directory # Create LEANN index directory