From 90120d4dff6d6000cf17507e49a3ff117eea43f3 Mon Sep 17 00:00:00 2001 From: yichuan520030910320 Date: Tue, 22 Jul 2025 17:00:56 -0700 Subject: [PATCH] Update the chat document structure for better performance --- .gitignore | 4 +++- README.md | 6 ++---- examples/history_data/wechat_history.py | 17 ++++++++--------- examples/wechat_history_reader_leann.py | 13 +++++++------ 4 files changed, 20 insertions(+), 20 deletions(-) diff --git a/.gitignore b/.gitignore index ea88898..9d37f9a 100755 --- a/.gitignore +++ b/.gitignore @@ -84,4 +84,6 @@ test_*.py packages/leann-backend-diskann/third_party/DiskANN/_deps/ *.meta.json -*.passages.json \ No newline at end of file +*.passages.json + +batchtest.py \ No newline at end of file diff --git a/README.md b/README.md index fccd5aa..040a109 100755 --- a/README.md +++ b/README.md @@ -152,8 +152,7 @@ python ./examples/main_cli_example.py ### Search Your Entire Life ```bash python examples/mail_reader_leann.py -# "What did my boss say about the Christmas party last year?" -# "Find all emails from my mom about birthday plans" +# "What's the recommended number of classes to take per semester for incoming EECS students?" ``` **90K emails → 14MB.** Finally, search your email like you search Google. @@ -191,8 +190,7 @@ Once the index is built, you can ask questions like: ### Time Machine for the Web ```bash python examples/google_history_reader_leann.py -# "What was that AI paper I read last month?" -# "Show me all the cooking videos I watched" +# "What's in my browser history about machine learning systems?" ``` **38K browser entries → 6MB.** Your browser history becomes your personal search engine. 
diff --git a/examples/history_data/wechat_history.py b/examples/history_data/wechat_history.py index 19320a5..7524dcb 100644 --- a/examples/history_data/wechat_history.py +++ b/examples/history_data/wechat_history.py @@ -335,14 +335,15 @@ class WeChatHistoryReader(BaseReader): if create_time: try: timestamp = datetime.fromtimestamp(create_time) - time_str = timestamp.strftime('%H:%M:%S') + # change to YYYY-MM-DD HH:MM:SS + time_str = timestamp.strftime('%Y-%m-%d %H:%M:%S') except: time_str = str(create_time) else: time_str = "Unknown" - sender = "Me" if is_sent_from_self else "Contact" - message_parts.append(f"[{time_str}] {sender}: {readable_text}") + sender = "[Me]" if is_sent_from_self else "[Contact]" + message_parts.append(f"({time_str}) {sender}: {readable_text}") concatenated_text = "\n".join(message_parts) @@ -354,13 +355,11 @@ Messages ({len(messages)} messages, {message_group['total_length']} chars): {concatenated_text} """ - + # TODO @yichuan give better format and rich info here! 
doc_content = f""" -Contact: {contact_name} - {concatenated_text} """ - return doc_content + return doc_content, contact_name def load_data(self, input_dir: str = None, **load_kwargs: Any) -> List[Document]: """ @@ -441,8 +440,8 @@ Contact: {contact_name} if count >= max_count and max_count > 0: break - doc_content = self._create_concatenated_content(message_group, contact_name) - doc = Document(text=doc_content, metadata={}) + doc_content, contact_name = self._create_concatenated_content(message_group, contact_name) + doc = Document(text=doc_content, metadata={"contact_name": contact_name}) docs.append(doc) count += 1 diff --git a/examples/wechat_history_reader_leann.py b/examples/wechat_history_reader_leann.py index 49e04a1..624b596 100644 --- a/examples/wechat_history_reader_leann.py +++ b/examples/wechat_history_reader_leann.py @@ -52,7 +52,7 @@ def create_leann_index_from_multiple_wechat_exports( documents = reader.load_data( wechat_export_dir=str(export_dir), max_count=max_count, - concatenate_messages=False, # Disable concatenation - one message per document + concatenate_messages=True, # Enable concatenation - group messages per contact into one document ) if documents: print(f"Loaded {len(documents)} chat documents from {export_dir}") @@ -78,7 +78,7 @@ def create_leann_index_from_multiple_wechat_exports( ) # Create text splitter with 256 chunk size - text_splitter = SentenceSplitter(chunk_size=128, chunk_overlap=64) + text_splitter = SentenceSplitter(chunk_size=256, chunk_overlap=128) # Convert Documents to text strings and chunk them all_texts = [] @@ -86,7 +86,8 @@ def create_leann_index_from_multiple_wechat_exports( # Split the document into chunks nodes = text_splitter.get_nodes_from_documents([doc]) for node in nodes: - all_texts.append(node.get_content()) + text = '[Contact] means the message is from: ' + doc.metadata["contact_name"] + '\n' + node.get_content() + all_texts.append(text) print( f"Created {len(all_texts)} text chunks from {len(all_documents)} documents" 
@@ -224,7 +225,7 @@ async def query_leann_index(index_path: str, query: str): query, top_k=20, recompute_beighbor_embeddings=True, - complexity=32, + complexity=16, beam_width=1, llm_config={ "type": "openai", @@ -252,13 +253,13 @@ async def main(): parser.add_argument( "--index-dir", type=str, - default="./wechat_history_magic_test", + default="./wechat_history_magic_test_11Debug_new", help="Directory to store the LEANN index (default: ./wechat_history_index_leann_test)", ) parser.add_argument( "--max-entries", type=int, - default=5000, + default=50, help="Maximum number of chat entries to process (default: 5000)", ) parser.add_argument(