From f8d34663b4044b118a0896f0a2244f5e9b0ee1d6 Mon Sep 17 00:00:00 2001
From: Andy Lee
Date: Sun, 3 Aug 2025 21:41:53 -0700
Subject: [PATCH 1/2] feat: check if k is larger than #docs

---
 packages/leann-core/src/leann/api.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/packages/leann-core/src/leann/api.py b/packages/leann-core/src/leann/api.py
index 9efefde..7c3aaee 100644
--- a/packages/leann-core/src/leann/api.py
+++ b/packages/leann-core/src/leann/api.py
@@ -493,6 +493,16 @@ class LeannSearcher:
         logger.info(f"  Top_k: {top_k}")
         logger.info(f"  Additional kwargs: {kwargs}")
 
+        # Smart top_k detection and adjustment
+        total_docs = len(self.passage_manager.global_offset_map)
+        original_top_k = top_k
+        if top_k > total_docs:
+            top_k = total_docs
+            logger.warning(
+                f"  ⚠️ Requested top_k ({original_top_k}) exceeds total documents ({total_docs})"
+            )
+            logger.warning(f"  ✅ Auto-adjusted top_k to {top_k} to match available documents")
+
         zmq_port = None
         start_time = time.time()
 

From 86f919a6da9a2e808b462557cd773d3e7ee48206 Mon Sep 17 00:00:00 2001
From: Andy Lee
Date: Sun, 3 Aug 2025 21:54:25 -0700
Subject: [PATCH 2/2] fix: WeChat history reader bugs and refactor wechat_rag
 to use unified architecture

---
 examples/history_data/wechat_history.py |  16 ++--
 examples/wechat_rag.py                  | 102 +++++++++++++++---------
 2 files changed, 73 insertions(+), 45 deletions(-)

diff --git a/examples/history_data/wechat_history.py b/examples/history_data/wechat_history.py
index 4106321..e985bd4 100644
--- a/examples/history_data/wechat_history.py
+++ b/examples/history_data/wechat_history.py
@@ -411,8 +411,8 @@ Messages ({len(messages)} messages, {message_group["total_length"]} chars):
         wechat_export_dir = load_kwargs.get("wechat_export_dir", None)
         include_non_text = load_kwargs.get("include_non_text", False)
         concatenate_messages = load_kwargs.get("concatenate_messages", False)
-        load_kwargs.get("max_length", 1000)
-        load_kwargs.get("time_window_minutes", 30)
+        max_length = load_kwargs.get("max_length", 1000)
+        time_window_minutes = load_kwargs.get("time_window_minutes", 30)
 
         # Default WeChat export path
         if wechat_export_dir is None:
@@ -460,9 +460,9 @@ Messages ({len(messages)} messages, {message_group["total_length"]} chars):
             # Concatenate messages based on rules
             message_groups = self._concatenate_messages(
                 readable_messages,
-                max_length=-1,
-                time_window_minutes=-1,
-                overlap_messages=0,  # Keep 2 messages overlap between groups
+                max_length=max_length,
+                time_window_minutes=time_window_minutes,
+                overlap_messages=0,  # No overlap between groups
             )
 
             # Create documents from concatenated groups
@@ -532,7 +532,9 @@ Message: {readable_text if readable_text else message_text}
 """
                 # Create document with embedded metadata
-                doc = Document(text=doc_content, metadata={})
+                doc = Document(
+                    text=doc_content, metadata={"contact_name": contact_name}
+                )
                 docs.append(doc)
                 count += 1
 
@@ -560,8 +562,8 @@ Message: {readable_text if readable_text else message_text}
 
         # Look for common export directory names
         possible_dirs = [
-            Path("./wechat_export_test"),
             Path("./wechat_export"),
+            Path("./wechat_export_direct"),
             Path("./wechat_chat_history"),
             Path("./chat_export"),
         ]
diff --git a/examples/wechat_rag.py b/examples/wechat_rag.py
index a071f89..f127f3f 100644
--- a/examples/wechat_rag.py
+++ b/examples/wechat_rag.py
@@ -10,7 +10,7 @@ from pathlib import Path
 # Add parent directory to path for imports
 sys.path.insert(0, str(Path(__file__).parent))
 
-from base_rag_example import BaseRAGExample, create_text_chunks
+from base_rag_example import BaseRAGExample
 from history_data.wechat_history import WeChatHistoryReader
 
@@ -92,52 +92,78 @@ class WeChatRAG(BaseRAGExample):
 
     async def load_data(self, args) -> list[str]:
         """Load WeChat history and convert to text chunks."""
-        export_path = Path(args.export_dir)
-
-        # Check if we need to export
-        need_export = (
-            args.force_export or not export_path.exists() or not any(export_path.iterdir())
-        )
-
-        if need_export:
-            if sys.platform != "darwin":
-                print("\n⚠️ Error: WeChat export is only supported on macOS")
-                return []
-
-            success = self._export_wechat_data(export_path)
-            if not success:
-                print("Failed to export WeChat data")
-                return []
-        else:
-            print(f"Using existing WeChat export: {export_path}")
-
-        # Load WeChat data
+        # Initialize WeChat reader with export capabilities
         reader = WeChatHistoryReader()
 
-        try:
-            print("\nLoading WeChat history...")
-            documents = reader.load_data(
-                wechat_export_dir=str(export_path),
-                max_count=args.max_items if args.max_items > 0 else -1,
-            )
-
-            if not documents:
-                print("No WeChat data found!")
+        # Find existing exports or create new ones using the centralized method
+        export_dirs = reader.find_or_export_wechat_data(args.export_dir)
+        if not export_dirs:
+            print("Failed to find or export WeChat data. Trying to find any existing exports...")
+            # Try to find any existing exports in common locations
+            export_dirs = reader.find_wechat_export_dirs()
+            if not export_dirs:
+                print("No WeChat data found. Please ensure WeChat exports exist.")
                 return []
 
-            print(f"Loaded {len(documents)} chat entries")
+        # Load documents from all found export directories
+        all_documents = []
+        total_processed = 0
 
-            # Convert to text chunks
-            all_texts = create_text_chunks(
-                documents, chunk_size=args.chunk_size, chunk_overlap=args.chunk_overlap
-            )
+        for i, export_dir in enumerate(export_dirs):
+            print(f"\nProcessing WeChat export {i + 1}/{len(export_dirs)}: {export_dir}")
 
-            return all_texts
+            try:
+                # Apply max_items limit per export
+                max_per_export = -1
+                if args.max_items > 0:
+                    remaining = args.max_items - total_processed
+                    if remaining <= 0:
+                        break
+                    max_per_export = remaining
 
-        except Exception as e:
-            print(f"Error loading WeChat data: {e}")
+                documents = reader.load_data(
+                    wechat_export_dir=str(export_dir),
+                    max_count=max_per_export,
+                    concatenate_messages=True,  # Enable message concatenation for better context
+                )
+
+                if documents:
+                    print(f"Loaded {len(documents)} chat documents from {export_dir}")
+                    all_documents.extend(documents)
+                    total_processed += len(documents)
+                else:
+                    print(f"No documents loaded from {export_dir}")
+
+            except Exception as e:
+                print(f"Error processing {export_dir}: {e}")
+                continue
+
+        if not all_documents:
+            print("No documents loaded from any source. Exiting.")
             return []
 
+        print(f"\nTotal loaded {len(all_documents)} chat documents from {len(export_dirs)} exports")
+
+        # Convert to text chunks with contact information
+        all_texts = []
+        for doc in all_documents:
+            # Split the document into chunks
+            from llama_index.core.node_parser import SentenceSplitter
+
+            text_splitter = SentenceSplitter(
+                chunk_size=args.chunk_size, chunk_overlap=args.chunk_overlap
+            )
+            nodes = text_splitter.get_nodes_from_documents([doc])
+
+            for node in nodes:
+                # Add contact information to each chunk
+                contact_name = doc.metadata.get("contact_name", "Unknown")
+                text = f"[Contact] means the message is from: {contact_name}\n" + node.get_content()
+                all_texts.append(text)
+
+        print(f"Created {len(all_texts)} text chunks from {len(all_documents)} documents")
+        return all_texts
+
 
 if __name__ == "__main__":
     import asyncio