make the email wonderful format
This commit is contained in:
13
README.md
13
README.md
@@ -292,7 +292,7 @@ Once the index is built, you can ask questions like:
|
|||||||
|
|
||||||
</details>
|
</details>
|
||||||
|
|
||||||
Slack support coming soon! Stay tuned!
|
|
||||||
|
|
||||||
## 🖥️ Command Line Interface
|
## 🖥️ Command Line Interface
|
||||||
|
|
||||||
@@ -503,6 +503,17 @@ export NCCL_IB_DISABLE=1
|
|||||||
export NCCL_NET_PLUGIN=none
|
export NCCL_NET_PLUGIN=none
|
||||||
export NCCL_SOCKET_IFNAME=ens5
|
export NCCL_SOCKET_IFNAME=ens5
|
||||||
``` -->
|
``` -->
|
||||||
|
## FAQ
|
||||||
|
|
||||||
|
### 1. My index build time seems long
|
||||||
|
|
||||||
|
You can speed up the process by using a lightweight embedding model. Add this to your arguments:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
--embedding-model sentence-transformers/all-MiniLM-L6-v2
|
||||||
|
```
|
||||||
|
**Model sizes:** `all-MiniLM-L6-v2` (30M parameters), `facebook/contriever` (~100M parameters), `Qwen3-0.6B` (600M parameters)
|
||||||
|
|
||||||
|
|
||||||
## 📈 Roadmap
|
## 📈 Roadmap
|
||||||
|
|
||||||
|
|||||||
@@ -96,14 +96,12 @@ class EmlxReader(BaseReader):
|
|||||||
|
|
||||||
# Create document content with metadata embedded in text
|
# Create document content with metadata embedded in text
|
||||||
doc_content = f"""
|
doc_content = f"""
|
||||||
[EMAIL METADATA]
|
[File]: {filename}
|
||||||
File: {filename}
|
[From]: {from_addr}
|
||||||
From: {from_addr}
|
[To]: {to_addr}
|
||||||
To: {to_addr}
|
[Subject]: {subject}
|
||||||
Subject: {subject}
|
[Date]: {date}
|
||||||
Date: {date}
|
[EMAIL BODY Start]:
|
||||||
[END METADATA]
|
|
||||||
|
|
||||||
{body}
|
{body}
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|||||||
@@ -74,10 +74,10 @@ def create_leann_index_from_multiple_sources(messages_dirs: List[Path], index_pa
|
|||||||
print("No documents loaded from any source. Exiting.")
|
print("No documents loaded from any source. Exiting.")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
print(f"\nTotal loaded {len(all_documents)} email documents from {len(messages_dirs)} directories")
|
print(f"\nTotal loaded {len(all_documents)} email documents from {len(messages_dirs)} directories and starting to split them into chunks")
|
||||||
|
|
||||||
# Create text splitter with 256 chunk size
|
# Create text splitter with 256 chunk size
|
||||||
text_splitter = SentenceSplitter(chunk_size=256, chunk_overlap=25)
|
text_splitter = SentenceSplitter(chunk_size=256, chunk_overlap=128)
|
||||||
|
|
||||||
# Convert Documents to text strings and chunk them
|
# Convert Documents to text strings and chunk them
|
||||||
all_texts = []
|
all_texts = []
|
||||||
@@ -85,9 +85,11 @@ def create_leann_index_from_multiple_sources(messages_dirs: List[Path], index_pa
|
|||||||
# Split the document into chunks
|
# Split the document into chunks
|
||||||
nodes = text_splitter.get_nodes_from_documents([doc])
|
nodes = text_splitter.get_nodes_from_documents([doc])
|
||||||
for node in nodes:
|
for node in nodes:
|
||||||
all_texts.append(node.get_content())
|
text = node.get_content()
|
||||||
|
# text = '[subject] ' + doc.metadata["subject"] + '\n' + text
|
||||||
|
all_texts.append(text)
|
||||||
|
|
||||||
print(f"Created {len(all_texts)} text chunks from {len(all_documents)} documents")
|
print(f"Finished splitting {len(all_documents)} documents into {len(all_texts)} text chunks")
|
||||||
|
|
||||||
# Create LEANN index directory
|
# Create LEANN index directory
|
||||||
|
|
||||||
@@ -231,7 +233,7 @@ async def main():
|
|||||||
parser = argparse.ArgumentParser(description='LEANN Mail Reader - Create and query email index')
|
parser = argparse.ArgumentParser(description='LEANN Mail Reader - Create and query email index')
|
||||||
# Remove --mail-path argument and auto-detect all Messages directories
|
# Remove --mail-path argument and auto-detect all Messages directories
|
||||||
# Remove DEFAULT_MAIL_PATH
|
# Remove DEFAULT_MAIL_PATH
|
||||||
parser.add_argument('--index-dir', type=str, default="./mail_index_leann_raw_text_all_dicts",
|
parser.add_argument('--index-dir', type=str, default="./mail_index_leann_debug",
|
||||||
help='Directory to store the LEANN index (default: ./mail_index_leann_raw_text_all_dicts)')
|
help='Directory to store the LEANN index (default: ./mail_index_leann_raw_text_all_dicts)')
|
||||||
parser.add_argument('--max-emails', type=int, default=1000,
|
parser.add_argument('--max-emails', type=int, default=1000,
|
||||||
help='Maximum number of emails to process (-1 means all)')
|
help='Maximum number of emails to process (-1 means all)')
|
||||||
|
|||||||
@@ -74,7 +74,7 @@ def create_leann_index_from_multiple_wechat_exports(
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
print(
|
print(
|
||||||
f"\nTotal loaded {len(all_documents)} chat documents from {len(export_dirs)} exports"
|
f"\nTotal loaded {len(all_documents)} chat documents from {len(export_dirs)} exports and starting to split them into chunks"
|
||||||
)
|
)
|
||||||
|
|
||||||
# Create text splitter with 256 chunk size
|
# Create text splitter with 256 chunk size
|
||||||
@@ -90,7 +90,7 @@ def create_leann_index_from_multiple_wechat_exports(
|
|||||||
all_texts.append(text)
|
all_texts.append(text)
|
||||||
|
|
||||||
print(
|
print(
|
||||||
f"Created {len(all_texts)} text chunks from {len(all_documents)} documents"
|
f"Finished splitting {len(all_documents)} documents into {len(all_texts)} text chunks"
|
||||||
)
|
)
|
||||||
|
|
||||||
# Create LEANN index directory
|
# Create LEANN index directory
|
||||||
|
|||||||
Submodule packages/leann-backend-diskann/third_party/DiskANN updated: 25339b0341...af2a26481e
Reference in New Issue
Block a user