From 90120d4dff6d6000cf17507e49a3ff117eea43f3 Mon Sep 17 00:00:00 2001 From: yichuan520030910320 Date: Tue, 22 Jul 2025 17:00:56 -0700 Subject: [PATCH] Update the chat document structure for better performance --- .gitignore | 4 +++- README.md | 6 ++---- examples/history_data/wechat_history.py | 17 ++++++++--------- examples/wechat_history_reader_leann.py | 13 +++++++------ 4 files changed, 20 insertions(+), 20 deletions(-) diff --git a/.gitignore b/.gitignore index ea88898..9d37f9a 100755 --- a/.gitignore +++ b/.gitignore @@ -84,4 +84,6 @@ test_*.py packages/leann-backend-diskann/third_party/DiskANN/_deps/ *.meta.json -*.passages.json \ No newline at end of file +*.passages.json + +batchtest.py \ No newline at end of file diff --git a/README.md b/README.md index fccd5aa..040a109 100755 --- a/README.md +++ b/README.md @@ -152,8 +152,7 @@ python ./examples/main_cli_example.py ### Search Your Entire Life ```bash python examples/mail_reader_leann.py -# "What did my boss say about the Christmas party last year?" -# "Find all emails from my mom about birthday plans" +# "What's the recommended number of classes to take per semester for incoming EECS students?" ``` **90K emails → 14MB.** Finally, search your email like you search Google. @@ -191,8 +190,7 @@ Once the index is built, you can ask questions like: ### Time Machine for the Web ```bash python examples/google_history_reader_leann.py -# "What was that AI paper I read last month?" -# "Show me all the cooking videos I watched" +# "What's in my browser history about machine learning systems?" ``` **38K browser entries → 6MB.** Your browser history becomes your personal search engine. 
diff --git a/examples/history_data/wechat_history.py b/examples/history_data/wechat_history.py index 19320a5..7524dcb 100644 --- a/examples/history_data/wechat_history.py +++ b/examples/history_data/wechat_history.py @@ -335,14 +335,15 @@ class WeChatHistoryReader(BaseReader): if create_time: try: timestamp = datetime.fromtimestamp(create_time) - time_str = timestamp.strftime('%H:%M:%S') + # change to YYYY-MM-DD HH:MM:SS + time_str = timestamp.strftime('%Y-%m-%d %H:%M:%S') except: time_str = str(create_time) else: time_str = "Unknown" - sender = "Me" if is_sent_from_self else "Contact" - message_parts.append(f"[{time_str}] {sender}: {readable_text}") + sender = "[Me]" if is_sent_from_self else "[Contact]" + message_parts.append(f"({time_str}) {sender}: {readable_text}") concatenated_text = "\n".join(message_parts) @@ -354,13 +355,11 @@ Messages ({len(messages)} messages, {message_group['total_length']} chars): {concatenated_text} """ - + # TODO @yichuan give better format and rich info here! 
doc_content = f""" -Contact: {contact_name} - {concatenated_text} """ - return doc_content + return doc_content, contact_name def load_data(self, input_dir: str = None, **load_kwargs: Any) -> List[Document]: """ @@ -441,8 +440,8 @@ Contact: {contact_name} if count >= max_count and max_count > 0: break - doc_content = self._create_concatenated_content(message_group, contact_name) - doc = Document(text=doc_content, metadata={}) + doc_content, contact_name = self._create_concatenated_content(message_group, contact_name) + doc = Document(text=doc_content, metadata={"contact_name": contact_name}) docs.append(doc) count += 1 diff --git a/examples/wechat_history_reader_leann.py b/examples/wechat_history_reader_leann.py index 49e04a1..624b596 100644 --- a/examples/wechat_history_reader_leann.py +++ b/examples/wechat_history_reader_leann.py @@ -52,7 +52,7 @@ def create_leann_index_from_multiple_wechat_exports( documents = reader.load_data( wechat_export_dir=str(export_dir), max_count=max_count, - concatenate_messages=False, # Disable concatenation - one message per document + concatenate_messages=True, # Enable concatenation - group messages per contact into one document ) if documents: print(f"Loaded {len(documents)} chat documents from {export_dir}") @@ -78,7 +78,7 @@ def create_leann_index_from_multiple_wechat_exports( ) # Create text splitter with 256 chunk size - text_splitter = SentenceSplitter(chunk_size=128, chunk_overlap=64) + text_splitter = SentenceSplitter(chunk_size=256, chunk_overlap=128) # Convert Documents to text strings and chunk them all_texts = [] @@ -86,7 +86,8 @@ def create_leann_index_from_multiple_wechat_exports( # Split the document into chunks nodes = text_splitter.get_nodes_from_documents([doc]) for node in nodes: - all_texts.append(node.get_content()) + text = '[Contact] means the message is from: ' + doc.metadata["contact_name"] + '\n' + node.get_content() + all_texts.append(text) print( f"Created {len(all_texts)} text chunks from {len(all_documents)} documents" 
@@ -224,7 +225,7 @@ async def query_leann_index(index_path: str, query: str): query, top_k=20, recompute_beighbor_embeddings=True, - complexity=32, + complexity=16, beam_width=1, llm_config={ "type": "openai", @@ -252,13 +253,13 @@ async def main(): parser.add_argument( "--index-dir", type=str, - default="./wechat_history_magic_test", + default="./wechat_history_magic_test_11Debug_new", help="Directory to store the LEANN index (default: ./wechat_history_index_leann_test)", ) parser.add_argument( "--max-entries", type=int, - default=5000, + default=50, help="Maximum number of chat entries to process (default: 5000)", ) parser.add_argument(