Update the chat structure for better performance

2025-07-22 17:00:56 -07:00
parent 870a443446
commit 90120d4dff
4 changed files with 20 additions and 20 deletions
@@ -84,4 +84,6 @@ test_*.py
 packages/leann-backend-diskann/third_party/DiskANN/_deps/

 *.meta.json
-*.passages.json
+*.passages.json
+
+batchtest.py
@@ -152,8 +152,7 @@ python ./examples/main_cli_example.py
 ### Search Your Entire Life
 ```bash
 python examples/mail_reader_leann.py
-# "What did my boss say about the Christmas party last year?"
-# "Find all emails from my mom about birthday plans"
+# "How many classes are incoming EECS students recommended to take per semester?"
 ```
 **90K emails → 14MB.** Finally, search your email like you search Google.

@@ -191,8 +190,7 @@ Once the index is built, you can ask questions like:
 ### Time Machine for the Web  
 ```bash
 python examples/google_history_reader_leann.py
-# "What was that AI paper I read last month?"
-# "Show me all the cooking videos I watched"
+# "Show me my browser history about machine learning systems."
 ```
 **38K browser entries → 6MB.** Your browser history becomes your personal search engine.

@@ -335,14 +335,15 @@ class WeChatHistoryReader(BaseReader):
            if create_time:
                try:
                    timestamp = datetime.fromtimestamp(create_time)
-                    time_str = timestamp.strftime('%H:%M:%S')
+                    # change to YYYY-MM-DD HH:MM:SS
+                    time_str = timestamp.strftime('%Y-%m-%d %H:%M:%S')
                except:
                    time_str = str(create_time)
            else:
                time_str = "Unknown"
            
-            sender = "Me" if is_sent_from_self else "Contact"
-            message_parts.append(f"[{time_str}] {sender}: {readable_text}")
+            sender = "[Me]" if is_sent_from_self else "[Contact]"
+            message_parts.append(f"({time_str}) {sender}: {readable_text}")
        
        concatenated_text = "\n".join(message_parts)
        
@@ -354,13 +355,11 @@ Messages ({len(messages)} messages, {message_group['total_length']} chars):

 {concatenated_text}
 """
-        
+        # TODO @yichuan give better format and rich info here!    
        doc_content = f"""
-Contact: {contact_name}
-
 {concatenated_text}
 """
-        return doc_content
+        return doc_content, contact_name
    
    def load_data(self, input_dir: str = None, **load_kwargs: Any) -> List[Document]:
        """
@@ -441,8 +440,8 @@ Contact: {contact_name}
                            if count >= max_count and max_count > 0:
                                break
                            
-                            doc_content = self._create_concatenated_content(message_group, contact_name)
-                            doc = Document(text=doc_content, metadata={})
+                            doc_content, contact_name  = self._create_concatenated_content(message_group, contact_name)
+                            doc = Document(text=doc_content, metadata={"contact_name": contact_name})
                            docs.append(doc)
                            count += 1
                        
@@ -52,7 +52,7 @@ def create_leann_index_from_multiple_wechat_exports(
                documents = reader.load_data(
                    wechat_export_dir=str(export_dir),
                    max_count=max_count,
-                    concatenate_messages=False,  # Disable concatenation - one message per document
+                    concatenate_messages=True,  # Enable concatenation - multiple messages per document
                )
                if documents:
                    print(f"Loaded {len(documents)} chat documents from {export_dir}")
@@ -78,7 +78,7 @@ def create_leann_index_from_multiple_wechat_exports(
        )

        # Create text splitter with 256 chunk size
-        text_splitter = SentenceSplitter(chunk_size=128, chunk_overlap=64)
+        text_splitter = SentenceSplitter(chunk_size=256, chunk_overlap=128)

        # Convert Documents to text strings and chunk them
        all_texts = []
@@ -86,7 +86,8 @@ def create_leann_index_from_multiple_wechat_exports(
            # Split the document into chunks
            nodes = text_splitter.get_nodes_from_documents([doc])
            for node in nodes:
-                all_texts.append(node.get_content())
+                text = '[Contact] means the message is from: ' + doc.metadata["contact_name"] + '\n' + node.get_content()
+                all_texts.append(text)

        print(
            f"Created {len(all_texts)} text chunks from {len(all_documents)} documents"
@@ -224,7 +225,7 @@ async def query_leann_index(index_path: str, query: str):
        query,
        top_k=20,
        recompute_beighbor_embeddings=True,
-        complexity=32,
+        complexity=16,
        beam_width=1,
        llm_config={
            "type": "openai",
@@ -252,13 +253,13 @@ async def main():
    parser.add_argument(
        "--index-dir",
        type=str,
-        default="./wechat_history_magic_test",
+        default="./wechat_history_magic_test_11Debug_new",
        help="Directory to store the LEANN index (default: ./wechat_history_index_leann_test)",
    )
    parser.add_argument(
        "--max-entries",
        type=int,
-        default=5000,
+        default=50,
        help="Maximum number of chat entries to process (default: 5000)",
    )
    parser.add_argument(