From 32364320f8d2e7b3abf814753e06afd7f6df6c9a Mon Sep 17 00:00:00 2001 From: yichuan520030910320 Date: Mon, 21 Jul 2025 16:22:16 -0700 Subject: [PATCH] update wechat and we should fix the bug introduced in 1c5fec5 --- .gitmodules | 4 ++-- README.md | 2 +- examples/history_data/wechat_history.py | 18 +++++++++--------- examples/wechat_history_reader_leann.py | 6 +++--- 4 files changed, 15 insertions(+), 15 deletions(-) diff --git a/.gitmodules b/.gitmodules index 1899ae5..c1cd540 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,9 +1,9 @@ [submodule "packages/leann-backend-diskann/third_party/DiskANN"] path = packages/leann-backend-diskann/third_party/DiskANN - url = https://github.com/yichuan520030910320/DiskANN.git + url = https://github.com/yichuan-w/DiskANN.git [submodule "packages/leann-backend-hnsw/third_party/faiss"] path = packages/leann-backend-hnsw/third_party/faiss - url = https://github.com/yichuan520030910320/faiss.git + url = https://github.com/yichuan-w/faiss.git [submodule "packages/leann-backend-hnsw/third_party/msgpack-c"] path = packages/leann-backend-hnsw/third_party/msgpack-c url = https://github.com/msgpack/msgpack-c.git diff --git a/README.md b/README.md index 9da3f85..47eb52b 100755 --- a/README.md +++ b/README.md @@ -63,7 +63,7 @@ Traditional RAG systems often require trade-offs between storage, privacy, and u ### Installation ```bash -git clone git@github.com:yichuan520030910320/LEANN-RAG.git leann +git clone git@github.com:yichuan-w/LEANN.git leann cd leann git submodule update --init --recursive ``` diff --git a/examples/history_data/wechat_history.py b/examples/history_data/wechat_history.py index c3aee02..19320a5 100644 --- a/examples/history_data/wechat_history.py +++ b/examples/history_data/wechat_history.py @@ -197,8 +197,8 @@ class WeChatHistoryReader(BaseReader): Args: messages: List of message dictionaries - max_length: Maximum length for concatenated message groups - time_window_minutes: Time window in minutes to group messages 
together + max_length: Maximum length for concatenated message groups. Use -1 to disable length constraint. + time_window_minutes: Time window in minutes to group messages together. Use -1 to disable time constraint. overlap_messages: Number of messages to overlap between consecutive groups Returns: @@ -230,8 +230,8 @@ class WeChatHistoryReader(BaseReader): if not readable_text.strip(): continue - # Check time window constraint - if last_timestamp is not None and create_time > 0: + # Check time window constraint (only if time_window_minutes != -1) + if time_window_minutes != -1 and last_timestamp is not None and create_time > 0: time_diff_minutes = (create_time - last_timestamp) / 60 if time_diff_minutes > time_window_minutes: # Time gap too large, start new group @@ -250,9 +250,9 @@ class WeChatHistoryReader(BaseReader): current_group = [] current_length = 0 - # Check length constraint + # Check length constraint (only if max_length != -1) message_length = len(readable_text) - if current_length + message_length > max_length and current_group: + if max_length != -1 and current_length + message_length > max_length and current_group: # Current group would exceed max length, save it and start new concatenated_groups.append({ 'messages': current_group, @@ -431,9 +431,9 @@ Contact: {contact_name} # Concatenate messages based on rules message_groups = self._concatenate_messages( readable_messages, - max_length=max_length, - time_window_minutes=time_window_minutes, - overlap_messages=2 # Keep 2 messages overlap between groups + max_length=-1, + time_window_minutes=-1, + overlap_messages=0 # No overlap between groups ) # Create documents from concatenated groups diff --git a/examples/wechat_history_reader_leann.py b/examples/wechat_history_reader_leann.py index 3d1147e..e15e876 100644 --- a/examples/wechat_history_reader_leann.py +++ b/examples/wechat_history_reader_leann.py @@ -52,7 +52,7 @@ def create_leann_index_from_multiple_wechat_exports( documents = 
reader.load_data( wechat_export_dir=str(export_dir), max_count=max_count, - concatenate_messages=True, # Disable concatenation - one message per document + concatenate_messages=False, # Disable concatenation - one message per document ) if documents: print(f"Loaded {len(documents)} chat documents from {export_dir}") @@ -78,7 +78,7 @@ def create_leann_index_from_multiple_wechat_exports( ) # Create text splitter with 256 chunk size - text_splitter = SentenceSplitter(chunk_size=256, chunk_overlap=25) + text_splitter = SentenceSplitter(chunk_size=128, chunk_overlap=64) # Convert Documents to text strings and chunk them all_texts = [] @@ -224,7 +224,7 @@ async def query_leann_index(index_path: str, query: str): query, top_k=20, recompute_beighbor_embeddings=True, - complexity=64, + complexity=128, beam_width=1, llm_config={ "type": "openai",