Update chat document structure for better retrieval performance

yichuan520030910320
2025-07-22 17:00:56 -07:00
parent 870a443446
commit 90120d4dff
4 changed files with 20 additions and 20 deletions


@@ -335,14 +335,15 @@ class WeChatHistoryReader(BaseReader):
if create_time:
try:
timestamp = datetime.fromtimestamp(create_time)
- time_str = timestamp.strftime('%H:%M:%S')
+ # change to YYYY-MM-DD HH:MM:SS
+ time_str = timestamp.strftime('%Y-%m-%d %H:%M:%S')
except:
time_str = str(create_time)
else:
time_str = "Unknown"
sender = "Me" if is_sent_from_self else "Contact"
message_parts.append(f"[{time_str}] {sender}: {readable_text}")
sender = "[Me]" if is_sent_from_self else "[Contact]"
message_parts.append(f"({time_str}) {sender}: {readable_text}")
concatenated_text = "\n".join(message_parts)
@@ -354,13 +355,11 @@ Messages ({len(messages)} messages, {message_group['total_length']} chars):
{concatenated_text}
"""
+ # TODO @yichuan give better format and rich info here!
doc_content = f"""
Contact: {contact_name}
{concatenated_text}
"""
- return doc_content
+ return doc_content, contact_name
def load_data(self, input_dir: str = None, **load_kwargs: Any) -> List[Document]:
"""
@@ -441,8 +440,8 @@ Contact: {contact_name}
if count >= max_count and max_count > 0:
break
- doc_content = self._create_concatenated_content(message_group, contact_name)
- doc = Document(text=doc_content, metadata={})
+ doc_content, contact_name = self._create_concatenated_content(message_group, contact_name)
+ doc = Document(text=doc_content, metadata={"contact_name": contact_name})
docs.append(doc)
count += 1
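For reference, a minimal sketch of the per-contact document text the reader produces after this change (the contact name and messages are invented for illustration, not taken from the commit):

Contact: Alice
(2025-07-22 09:15:03) [Me]: Are we still on for lunch?
(2025-07-22 09:15:41) [Contact]: Yes, see you at noon.

Each document now also carries {"contact_name": ...} in its metadata, which the indexing script below uses to prefix every chunk.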


@@ -52,7 +52,7 @@ def create_leann_index_from_multiple_wechat_exports(
documents = reader.load_data(
wechat_export_dir=str(export_dir),
max_count=max_count,
- concatenate_messages=False,  # Disable concatenation - one message per document
+ concatenate_messages=True,  # Enable concatenation - group messages into per-contact documents
)
if documents:
print(f"Loaded {len(documents)} chat documents from {export_dir}")
@@ -78,7 +78,7 @@ def create_leann_index_from_multiple_wechat_exports(
)
# Create text splitter with 256 chunk size
- text_splitter = SentenceSplitter(chunk_size=128, chunk_overlap=64)
+ text_splitter = SentenceSplitter(chunk_size=256, chunk_overlap=128)
# Convert Documents to text strings and chunk them
all_texts = []
@@ -86,7 +86,8 @@ def create_leann_index_from_multiple_wechat_exports(
# Split the document into chunks
nodes = text_splitter.get_nodes_from_documents([doc])
for node in nodes:
- all_texts.append(node.get_content())
+ text = '[Contact] means the message is from: ' + doc.metadata["contact_name"] + '\n' + node.get_content()
+ all_texts.append(text)
print(
f"Created {len(all_texts)} text chunks from {len(all_documents)} documents"
)
@@ -224,7 +225,7 @@ async def query_leann_index(index_path: str, query: str):
query,
top_k=20,
recompute_beighbor_embeddings=True,
- complexity=32,
+ complexity=16,
beam_width=1,
llm_config={
"type": "openai",
@@ -252,13 +253,13 @@ async def main():
parser.add_argument(
"--index-dir",
type=str,
default="./wechat_history_magic_test",
default="./wechat_history_magic_test_11Debug_new",
help="Directory to store the LEANN index (default: ./wechat_history_index_leann_test)",
)
parser.add_argument(
"--max-entries",
type=int,
- default=5000,
+ default=50,
help="Maximum number of chat entries to process (default: 50)",
)
parser.add_argument(