From f8d34663b4044b118a0896f0a2244f5e9b0ee1d6 Mon Sep 17 00:00:00 2001
From: Andy Lee
Date: Sun, 3 Aug 2025 21:41:53 -0700
Subject: [PATCH 1/2] feat: check if k is larger than #docs

---
 packages/leann-core/src/leann/api.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/packages/leann-core/src/leann/api.py b/packages/leann-core/src/leann/api.py
index 9efefde..7c3aaee 100644
--- a/packages/leann-core/src/leann/api.py
+++ b/packages/leann-core/src/leann/api.py
@@ -493,6 +493,16 @@ class LeannSearcher:
         logger.info(f"  Top_k: {top_k}")
         logger.info(f"  Additional kwargs: {kwargs}")
 
+        # Smart top_k detection and adjustment
+        total_docs = len(self.passage_manager.global_offset_map)
+        original_top_k = top_k
+        if top_k > total_docs:
+            top_k = total_docs
+            logger.warning(
+                f"  ⚠️ Requested top_k ({original_top_k}) exceeds total documents ({total_docs})"
+            )
+            logger.warning(f"  ✅ Auto-adjusted top_k to {top_k} to match available documents")
+
         zmq_port = None
         start_time = time.time()
 

From 86f919a6da9a2e808b462557cd773d3e7ee48206 Mon Sep 17 00:00:00 2001
From: Andy Lee
Date: Sun, 3 Aug 2025 21:54:25 -0700
Subject: [PATCH 2/2] fix: WeChat history reader bugs and refactor wechat_rag
 to use unified architecture

---
 examples/history_data/wechat_history.py |  16 ++--
 examples/wechat_rag.py                  | 102 +++++++++++++++---------
 2 files changed, 73 insertions(+), 45 deletions(-)

diff --git a/examples/history_data/wechat_history.py b/examples/history_data/wechat_history.py
index 4106321..e985bd4 100644
--- a/examples/history_data/wechat_history.py
+++ b/examples/history_data/wechat_history.py
@@ -411,8 +411,8 @@ Messages ({len(messages)} messages, {message_group["total_length"]} chars):
         wechat_export_dir = load_kwargs.get("wechat_export_dir", None)
         include_non_text = load_kwargs.get("include_non_text", False)
         concatenate_messages = load_kwargs.get("concatenate_messages", False)
-        load_kwargs.get("max_length", 1000)
-        load_kwargs.get("time_window_minutes", 30)
+        max_length = load_kwargs.get("max_length", 1000)
+        time_window_minutes = load_kwargs.get("time_window_minutes", 30)
 
         # Default WeChat export path
         if wechat_export_dir is None:
@@ -460,9 +460,9 @@ Messages ({len(messages)} messages, {message_group["total_length"]} chars):
             # Concatenate messages based on rules
             message_groups = self._concatenate_messages(
                 readable_messages,
-                max_length=-1,
-                time_window_minutes=-1,
-                overlap_messages=0,  # Keep 2 messages overlap between groups
+                max_length=max_length,
+                time_window_minutes=time_window_minutes,
+                overlap_messages=0,  # No overlap between groups
             )
 
             # Create documents from concatenated groups
@@ -532,7 +532,9 @@ Message: {readable_text if readable_text else message_text}
 """
                 # Create document with embedded metadata
-                doc = Document(text=doc_content, metadata={})
+                doc = Document(
+                    text=doc_content, metadata={"contact_name": contact_name}
+                )
                 docs.append(doc)
                 count += 1
 
@@ -560,8 +562,8 @@ Message: {readable_text if readable_text else message_text}
 
         # Look for common export directory names
         possible_dirs = [
-            Path("./wechat_export_test"),
             Path("./wechat_export"),
+            Path("./wechat_export_direct"),
             Path("./wechat_chat_history"),
             Path("./chat_export"),
         ]
diff --git a/examples/wechat_rag.py b/examples/wechat_rag.py
index a071f89..f127f3f 100644
--- a/examples/wechat_rag.py
+++ b/examples/wechat_rag.py
@@ -10,7 +10,7 @@ from pathlib import Path
 # Add parent directory to path for imports
 sys.path.insert(0, str(Path(__file__).parent))
 
-from base_rag_example import BaseRAGExample, create_text_chunks
+from base_rag_example import BaseRAGExample
 from history_data.wechat_history import WeChatHistoryReader
 
@@ -92,52 +92,78 @@ class WeChatRAG(BaseRAGExample):
 
     async def load_data(self, args) -> list[str]:
         """Load WeChat history and convert to text chunks."""
-        export_path = Path(args.export_dir)
-
-        # Check if we need to export
-        need_export = (
-            args.force_export or not export_path.exists() or not any(export_path.iterdir())
-        )
-
-        if need_export:
-            if sys.platform != "darwin":
-                print("\n⚠️ Error: WeChat export is only supported on macOS")
-                return []
-
-            success = self._export_wechat_data(export_path)
-            if not success:
-                print("Failed to export WeChat data")
-                return []
-        else:
-            print(f"Using existing WeChat export: {export_path}")
-
-        # Load WeChat data
+        # Initialize WeChat reader with export capabilities
         reader = WeChatHistoryReader()
 
-        try:
-            print("\nLoading WeChat history...")
-            documents = reader.load_data(
-                wechat_export_dir=str(export_path),
-                max_count=args.max_items if args.max_items > 0 else -1,
-            )
-
-            if not documents:
-                print("No WeChat data found!")
+        # Find existing exports or create new ones using the centralized method
+        export_dirs = reader.find_or_export_wechat_data(args.export_dir)
+        if not export_dirs:
+            print("Failed to find or export WeChat data. Trying to find any existing exports...")
+            # Try to find any existing exports in common locations
+            export_dirs = reader.find_wechat_export_dirs()
+            if not export_dirs:
+                print("No WeChat data found. Please ensure WeChat exports exist.")
                 return []
 
-            print(f"Loaded {len(documents)} chat entries")
+        # Load documents from all found export directories
+        all_documents = []
+        total_processed = 0
 
-            # Convert to text chunks
-            all_texts = create_text_chunks(
-                documents, chunk_size=args.chunk_size, chunk_overlap=args.chunk_overlap
-            )
+        for i, export_dir in enumerate(export_dirs):
+            print(f"\nProcessing WeChat export {i + 1}/{len(export_dirs)}: {export_dir}")
 
-            return all_texts
+            try:
+                # Apply max_items limit per export
+                max_per_export = -1
+                if args.max_items > 0:
+                    remaining = args.max_items - total_processed
+                    if remaining <= 0:
+                        break
+                    max_per_export = remaining
 
-        except Exception as e:
-            print(f"Error loading WeChat data: {e}")
+                documents = reader.load_data(
+                    wechat_export_dir=str(export_dir),
+                    max_count=max_per_export,
+                    concatenate_messages=True,  # Enable message concatenation for better context
+                )
+
+                if documents:
+                    print(f"Loaded {len(documents)} chat documents from {export_dir}")
+                    all_documents.extend(documents)
+                    total_processed += len(documents)
+                else:
+                    print(f"No documents loaded from {export_dir}")
+
+            except Exception as e:
+                print(f"Error processing {export_dir}: {e}")
+                continue
+
+        if not all_documents:
+            print("No documents loaded from any source. Exiting.")
             return []
 
+        print(f"\nTotal loaded {len(all_documents)} chat documents from {len(export_dirs)} exports")
+
+        # Convert to text chunks with contact information
+        all_texts = []
+        for doc in all_documents:
+            # Split the document into chunks
+            from llama_index.core.node_parser import SentenceSplitter
+
+            text_splitter = SentenceSplitter(
+                chunk_size=args.chunk_size, chunk_overlap=args.chunk_overlap
+            )
+            nodes = text_splitter.get_nodes_from_documents([doc])
+
+            for node in nodes:
+                # Add contact information to each chunk
+                contact_name = doc.metadata.get("contact_name", "Unknown")
+                text = f"[Contact] means the message is from: {contact_name}\n" + node.get_content()
+                all_texts.append(text)
+
+        print(f"Created {len(all_texts)} text chunks from {len(all_documents)} documents")
+        return all_texts
+
 
 if __name__ == "__main__":
     import asyncio