diff --git a/README.md b/README.md
index edfd77f..e4c597b 100755
--- a/README.md
+++ b/README.md
@@ -292,7 +292,7 @@
 Once the index is built, you can ask questions like:
 
-Slack supporting soon! Stay tuned!
+
 
 ## 🖥️ Command Line Interface
@@ -503,6 +503,17 @@
 export NCCL_IB_DISABLE=1
 export NCCL_NET_PLUGIN=none
 export NCCL_SOCKET_IFNAME=ens5
 ``` -->
 
+## FAQ
+
+### 1. Index building takes a long time
+
+You can speed up index building by using a lightweight embedding model. Add this flag to your arguments:
+
+```bash
+--embedding-model sentence-transformers/all-MiniLM-L6-v2
+```
+**Model sizes:** `all-MiniLM-L6-v2` (30M parameters), `facebook/contriever` (~100M parameters), `Qwen3-0.6B` (600M parameters)
+
 ## 📈 Roadmap
diff --git a/examples/email_data/LEANN_email_reader.py b/examples/email_data/LEANN_email_reader.py
index 2c79108..ba441f9 100644
--- a/examples/email_data/LEANN_email_reader.py
+++ b/examples/email_data/LEANN_email_reader.py
@@ -96,14 +96,12 @@ class EmlxReader(BaseReader):
 
            # Create document content with metadata embedded in text
            doc_content = f"""
-[EMAIL METADATA]
-File: {filename}
-From: {from_addr}
-To: {to_addr}
-Subject: {subject}
-Date: {date}
-[END METADATA]
-
+[File]: {filename}
+[From]: {from_addr}
+[To]: {to_addr}
+[Subject]: {subject}
+[Date]: {date}
+[EMAIL BODY Start]:
 {body}
 """
 
diff --git a/examples/mail_reader_leann.py b/examples/mail_reader_leann.py
index fbdde1b..5a22bf3 100644
--- a/examples/mail_reader_leann.py
+++ b/examples/mail_reader_leann.py
@@ -74,10 +74,10 @@ def create_leann_index_from_multiple_sources(messages_dirs: List[Path], index_pa
        print("No documents loaded from any source. Exiting.")
        return None
 
-    print(f"\nTotal loaded {len(all_documents)} email documents from {len(messages_dirs)} directories")
+    print(f"\nTotal loaded {len(all_documents)} email documents from {len(messages_dirs)} directories and starting to split them into chunks")
 
    # Create text splitter with 256 chunk size
-    text_splitter = SentenceSplitter(chunk_size=256, chunk_overlap=25)
+    text_splitter = SentenceSplitter(chunk_size=256, chunk_overlap=128)
 
    # Convert Documents to text strings and chunk them
    all_texts = []
@@ -85,9 +85,11 @@ def create_leann_index_from_multiple_sources(messages_dirs: List[Path], index_pa
        # Split the document into chunks
        nodes = text_splitter.get_nodes_from_documents([doc])
        for node in nodes:
-            all_texts.append(node.get_content())
+            text = node.get_content()
+            # text = '[subject] ' + doc.metadata["subject"] + '\n' + text
+            all_texts.append(text)
 
-    print(f"Created {len(all_texts)} text chunks from {len(all_documents)} documents")
+    print(f"Finished splitting {len(all_documents)} documents into {len(all_texts)} text chunks")
 
    # Create LEANN index directory
@@ -231,7 +233,7 @@ async def main():
    parser = argparse.ArgumentParser(description='LEANN Mail Reader - Create and query email index')
    # Remove --mail-path argument and auto-detect all Messages directories
    # Remove DEFAULT_MAIL_PATH
-    parser.add_argument('--index-dir', type=str, default="./mail_index_leann_raw_text_all_dicts",
+    parser.add_argument('--index-dir', type=str, default="./mail_index_leann_debug",
                        help='Directory to store the LEANN index (default: ./mail_index_leann_raw_text_all_dicts)')
    parser.add_argument('--max-emails', type=int, default=1000,
                        help='Maximum number of emails to process (-1 means all)')
diff --git a/examples/wechat_history_reader_leann.py b/examples/wechat_history_reader_leann.py
index 624b596..971ea74 100644
--- a/examples/wechat_history_reader_leann.py
+++ b/examples/wechat_history_reader_leann.py
@@ -74,7 +74,7 @@ def create_leann_index_from_multiple_wechat_exports(
        return None
 
    print(
-        f"\nTotal loaded {len(all_documents)} chat documents from {len(export_dirs)} exports"
+        f"\nTotal loaded {len(all_documents)} chat documents from {len(export_dirs)} exports and starting to split them into chunks"
    )
 
    # Create text splitter with 256 chunk size
@@ -90,7 +90,7 @@ def create_leann_index_from_multiple_wechat_exports(
            all_texts.append(text)
 
    print(
-        f"Created {len(all_texts)} text chunks from {len(all_documents)} documents"
+        f"Finished splitting {len(all_documents)} documents into {len(all_texts)} text chunks"
    )
 
    # Create LEANN index directory
diff --git a/packages/leann-backend-diskann/third_party/DiskANN b/packages/leann-backend-diskann/third_party/DiskANN
index 25339b0..af2a264 160000
--- a/packages/leann-backend-diskann/third_party/DiskANN
+++ b/packages/leann-backend-diskann/third_party/DiskANN
@@ -1 +1 @@
-Subproject commit 25339b03413b5067c25b6092ea3e0f77ef8515c8
+Subproject commit af2a26481e65232b57b82d96e68833cdee9f7635
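
For reference, a minimal standalone sketch of the chunking step touched in the `examples/mail_reader_leann.py` hunk above, using the same `SentenceSplitter` settings (`chunk_size=256`, `chunk_overlap=128`). The sample email text is invented for illustration, and this is not LEANN's full indexing pipeline:

```python
# Standalone sketch of the chunking step (assumes llama-index-core is installed;
# the sample email text below is invented for illustration).
from llama_index.core import Document
from llama_index.core.node_parser import SentenceSplitter

# Same splitter settings as examples/mail_reader_leann.py after this change:
# 256-token chunks with a 128-token overlap, so adjacent chunks share context.
text_splitter = SentenceSplitter(chunk_size=256, chunk_overlap=128)

doc = Document(
    text=(
        "[File]: example.emlx\n"
        "[From]: alice@example.com\n"
        "[To]: bob@example.com\n"
        "[Subject]: Quarterly report\n"
        "[Date]: Mon, 01 Jan 2024 09:00:00 +0000\n"
        "[EMAIL BODY Start]:\n" + "This is a placeholder sentence. " * 200
    )
)

# Split the document into overlapping chunks and collect their text,
# mirroring the loop in create_leann_index_from_multiple_sources.
all_texts = []
for node in text_splitter.get_nodes_from_documents([doc]):
    all_texts.append(node.get_content())

print(f"Finished splitting 1 document into {len(all_texts)} text chunks")
```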