merge
@@ -411,8 +411,8 @@ Messages ({len(messages)} messages, {message_group["total_length"]} chars):
         wechat_export_dir = load_kwargs.get("wechat_export_dir", None)
         include_non_text = load_kwargs.get("include_non_text", False)
         concatenate_messages = load_kwargs.get("concatenate_messages", False)
-        load_kwargs.get("max_length", 1000)
-        load_kwargs.get("time_window_minutes", 30)
+        max_length = load_kwargs.get("max_length", 1000)
+        time_window_minutes = load_kwargs.get("time_window_minutes", 30)
 
         # Default WeChat export path
         if wechat_export_dir is None:
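This hunk fixes a dropped return value: the old code called load_kwargs.get(...) for max_length and time_window_minutes without binding the result, so the values were computed and immediately discarded. A minimal sketch (standalone, not the repo's code) of why the assignment matters:

# Minimal sketch: dict.get() has no side effect, so calling it without
# binding the result silently throws the value away.
load_kwargs = {"max_length": 500}

load_kwargs.get("max_length", 1000)                                # value 500 is computed, then lost
max_length = load_kwargs.get("max_length", 1000)                   # value is kept for later use
time_window_minutes = load_kwargs.get("time_window_minutes", 30)   # falls back to the default 30

print(max_length, time_window_minutes)  # -> 500 30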
@@ -460,9 +460,9 @@ Messages ({len(messages)} messages, {message_group["total_length"]} chars):
             # Concatenate messages based on rules
             message_groups = self._concatenate_messages(
                 readable_messages,
-                max_length=-1,
-                time_window_minutes=-1,
-                overlap_messages=0,  # Keep 2 messages overlap between groups
+                max_length=max_length,
+                time_window_minutes=time_window_minutes,
+                overlap_messages=0,  # No overlap between groups
             )
 
             # Create documents from concatenated groups
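With the previous hunk, the user-supplied max_length and time_window_minutes now reach _concatenate_messages instead of hard-coded -1, and the stale comment is corrected to match overlap_messages=0. The reader's actual _concatenate_messages is not shown in this diff; the sketch below is a hypothetical illustration of what the two parameters typically control when grouping chat messages, assuming each message is a dict with "text" and a datetime "timestamp".

# Hypothetical sketch of time/length-based grouping; the real implementation may differ.
from datetime import datetime, timedelta


def concatenate_messages(messages, max_length=1000, time_window_minutes=30):
    groups, current, current_len = [], [], 0
    window = timedelta(minutes=time_window_minutes)
    for msg in messages:
        too_old = bool(current) and msg["timestamp"] - current[-1]["timestamp"] > window
        too_long = max_length > 0 and current_len + len(msg["text"]) > max_length
        if too_old or too_long:
            groups.append(current)
            current, current_len = [], 0
        current.append(msg)
        current_len += len(msg["text"])
    if current:
        groups.append(current)
    return groups


msgs = [
    {"text": "hi", "timestamp": datetime(2024, 1, 1, 9, 0)},
    {"text": "lunch?", "timestamp": datetime(2024, 1, 1, 9, 5)},
    {"text": "sure", "timestamp": datetime(2024, 1, 1, 13, 0)},  # outside the 30-minute window
]
print(len(concatenate_messages(msgs)))  # -> 2 groups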
@@ -532,7 +532,9 @@ Message: {readable_text if readable_text else message_text}
 """
 
             # Create document with embedded metadata
-            doc = Document(text=doc_content, metadata={})
+            doc = Document(
+                text=doc_content, metadata={"contact_name": contact_name}
+            )
             docs.append(doc)
             count += 1
 
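The Document is now created with the contact name in its metadata instead of an empty dict, which is what the chunking loop later reads back with doc.metadata.get("contact_name", ...). A small sketch, assuming the llama-index package is installed, showing the metadata travelling with the document:

# Sketch only: construct a llama_index Document and read the metadata back.
from llama_index.core import Document

doc = Document(
    text="[Contact]\nAlice\nMessage: see you at 7",
    metadata={"contact_name": "Alice"},
)
print(doc.metadata.get("contact_name", "Unknown"))  # -> Alice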
@@ -560,8 +562,8 @@ Message: {readable_text if readable_text else message_text}
 
         # Look for common export directory names
         possible_dirs = [
-            Path("./wechat_export_test"),
             Path("./wechat_export"),
+            Path("./wechat_export_direct"),
             Path("./wechat_chat_history"),
             Path("./chat_export"),
         ]
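This hunk swaps the test directory ./wechat_export_test for ./wechat_export_direct in the list of candidate export locations. A hypothetical helper (not the reader's real method) showing how such a candidate list is usually narrowed down to directories that exist and are non-empty:

# Hypothetical sketch: keep only candidate export directories that exist and contain files.
from pathlib import Path

possible_dirs = [
    Path("./wechat_export"),
    Path("./wechat_export_direct"),
    Path("./wechat_chat_history"),
    Path("./chat_export"),
]

existing = [d for d in possible_dirs if d.is_dir() and any(d.iterdir())]
print(existing)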
@@ -10,7 +10,7 @@ from pathlib import Path
 # Add parent directory to path for imports
 sys.path.insert(0, str(Path(__file__).parent))
 
-from base_rag_example import BaseRAGExample, create_text_chunks
+from base_rag_example import BaseRAGExample
 from history_data.wechat_history import WeChatHistoryReader
 
 
@@ -92,56 +92,78 @@ class WeChatRAG(BaseRAGExample):
 
     async def load_data(self, args) -> list[str]:
         """Load WeChat history and convert to text chunks."""
-        export_path = Path(args.export_dir)
-
-        # Check if we need to export
-        need_export = (
-            args.force_export or not export_path.exists() or not any(export_path.iterdir())
-        )
-
-        if need_export:
-            if sys.platform != "darwin":
-                print("\n⚠️ Error: WeChat export is only supported on macOS")
-                return []
-
-            success = self._export_wechat_data(export_path)
-            if not success:
-                print("Failed to export WeChat data")
-                # add red: you may nned to restart your wechat to make the database file available
-                print(
-                    "\033[91mYou may need to restart/quit your WeChat to make the database file available\033[0m"
-                )
-                return []
-        else:
-            print(f"Using existing WeChat export: {export_path}")
-
-        # Load WeChat data
+        # Initialize WeChat reader with export capabilities
         reader = WeChatHistoryReader()
 
-        try:
-            print("\nLoading WeChat history...")
-            documents = reader.load_data(
-                wechat_export_dir=str(export_path),
-                max_count=args.max_items if args.max_items > 0 else -1,
-            )
-
-            if not documents:
-                print("No WeChat data found!")
-                return []
-
-            print(f"Loaded {len(documents)} chat entries")
-
-            # Convert to text chunks
-            all_texts = create_text_chunks(
-                documents, chunk_size=args.chunk_size, chunk_overlap=args.chunk_overlap
-            )
-
-            return all_texts
-
-        except Exception as e:
-            print(f"Error loading WeChat data: {e}")
-            return []
+        # Find existing exports or create new ones using the centralized method
+        export_dirs = reader.find_or_export_wechat_data(args.export_dir)
+        if not export_dirs:
+            print("Failed to find or export WeChat data. Trying to find any existing exports...")
+            # Try to find any existing exports in common locations
+            export_dirs = reader.find_wechat_export_dirs()
+            if not export_dirs:
+                print("No WeChat data found. Please ensure WeChat exports exist.")
+                return []
+
+        # Load documents from all found export directories
+        all_documents = []
+        total_processed = 0
+
+        for i, export_dir in enumerate(export_dirs):
+            print(f"\nProcessing WeChat export {i + 1}/{len(export_dirs)}: {export_dir}")
+
+            try:
+                # Apply max_items limit per export
+                max_per_export = -1
+                if args.max_items > 0:
+                    remaining = args.max_items - total_processed
+                    if remaining <= 0:
+                        break
+                    max_per_export = remaining
+
+                documents = reader.load_data(
+                    wechat_export_dir=str(export_dir),
+                    max_count=max_per_export,
+                    concatenate_messages=True,  # Enable message concatenation for better context
+                )
+
+                if documents:
+                    print(f"Loaded {len(documents)} chat documents from {export_dir}")
+                    all_documents.extend(documents)
+                    total_processed += len(documents)
+                else:
+                    print(f"No documents loaded from {export_dir}")
+
+            except Exception as e:
+                print(f"Error processing {export_dir}: {e}")
+                continue
+
+        if not all_documents:
+            print("No documents loaded from any source. Exiting.")
+            return []
+
+        print(f"\nTotal loaded {len(all_documents)} chat documents from {len(export_dirs)} exports")
+
+        # Convert to text chunks with contact information
+        all_texts = []
+        for doc in all_documents:
+            # Split the document into chunks
+            from llama_index.core.node_parser import SentenceSplitter
+
+            text_splitter = SentenceSplitter(
+                chunk_size=args.chunk_size, chunk_overlap=args.chunk_overlap
+            )
+            nodes = text_splitter.get_nodes_from_documents([doc])
+
+            for node in nodes:
+                # Add contact information to each chunk
+                contact_name = doc.metadata.get("contact_name", "Unknown")
+                text = f"[Contact] means the message is from: {contact_name}\n" + node.get_content()
+                all_texts.append(text)
+
+        print(f"Created {len(all_texts)} text chunks from {len(all_documents)} documents")
+        return all_texts
 
 
 if __name__ == "__main__":
     import asyncio
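The load_data refactor above replaces the single-export, macOS-only export flow with a reader-driven one: find_or_export_wechat_data locates (or creates) exports, every export directory is processed in turn, the args.max_items budget is spread across exports, and each document is chunked with SentenceSplitter while a contact prefix is added to every chunk. That inline chunking is why the earlier hunk drops the create_text_chunks import. The only subtle part is the per-export budgeting; a standalone sketch of that accounting (load_from_export is a hypothetical stand-in for reader.load_data, where max_count=-1 means "no limit"):

# Standalone sketch of the per-export max_items budgeting used in the new loop.
def load_from_export(export_dir, max_count=-1):
    docs = [f"{export_dir}-doc{i}" for i in range(5)]  # pretend each export holds 5 documents
    return docs if max_count < 0 else docs[:max_count]


def load_all(export_dirs, max_items=-1):
    all_documents, total_processed = [], 0
    for export_dir in export_dirs:
        max_per_export = -1
        if max_items > 0:
            remaining = max_items - total_processed
            if remaining <= 0:
                break                       # budget exhausted, stop early
            max_per_export = remaining      # never load more than what is left
        documents = load_from_export(export_dir, max_count=max_per_export)
        all_documents.extend(documents)
        total_processed += len(documents)
    return all_documents


print(len(load_all(["a", "b", "c"], max_items=7)))  # -> 7 (5 from "a", 2 from "b", "c" skipped)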
@@ -499,6 +499,16 @@ class LeannSearcher:
         logger.info(f" Top_k: {top_k}")
         logger.info(f" Additional kwargs: {kwargs}")
 
+        # Smart top_k detection and adjustment
+        total_docs = len(self.passage_manager.global_offset_map)
+        original_top_k = top_k
+        if top_k > total_docs:
+            top_k = total_docs
+            logger.warning(
+                f" ⚠️ Requested top_k ({original_top_k}) exceeds total documents ({total_docs})"
+            )
+            logger.warning(f" ✅ Auto-adjusted top_k to {top_k} to match available documents")
+
         zmq_port = None
 
         start_time = time.time()
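This hunk clamps the requested top_k to the number of indexed passages before the search runs, instead of passing an impossible value to the backend. The core logic, reduced to a standalone sketch with the logging swapped for a returned flag:

# Minimal sketch of the top_k clamp.
def adjust_top_k(top_k, total_docs):
    """Clamp top_k to the number of available documents."""
    if top_k > total_docs:
        return total_docs, True    # adjusted
    return top_k, False            # unchanged


print(adjust_top_k(100, 42))  # -> (42, True)
print(adjust_top_k(10, 42))   # -> (10, False)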