Update the chat structure for better performance

This commit is contained in:
yichuan520030910320
2025-07-22 17:00:56 -07:00
parent 870a443446
commit 90120d4dff
4 changed files with 20 additions and 20 deletions

4
.gitignore vendored
View File

@@ -84,4 +84,6 @@ test_*.py
packages/leann-backend-diskann/third_party/DiskANN/_deps/ packages/leann-backend-diskann/third_party/DiskANN/_deps/
*.meta.json *.meta.json
*.passages.json *.passages.json
batchtest.py

View File

@@ -152,8 +152,7 @@ python ./examples/main_cli_example.py
### Search Your Entire Life ### Search Your Entire Life
```bash ```bash
python examples/mail_reader_leann.py python examples/mail_reader_leann.py
# "What did my boss say about the Christmas party last year?" # "How many classes are recommended per semester for incoming EECS students?"
# "Find all emails from my mom about birthday plans"
``` ```
**90K emails → 14MB.** Finally, search your email like you search Google. **90K emails → 14MB.** Finally, search your email like you search Google.
@@ -191,8 +190,7 @@ Once the index is built, you can ask questions like:
### Time Machine for the Web ### Time Machine for the Web
```bash ```bash
python examples/google_history_reader_leann.py python examples/google_history_reader_leann.py
# "What was that AI paper I read last month?" # "Show me my browser history about machine learning systems."
# "Show me all the cooking videos I watched"
``` ```
**38K browser entries → 6MB.** Your browser history becomes your personal search engine. **38K browser entries → 6MB.** Your browser history becomes your personal search engine.

View File

@@ -335,14 +335,15 @@ class WeChatHistoryReader(BaseReader):
if create_time: if create_time:
try: try:
timestamp = datetime.fromtimestamp(create_time) timestamp = datetime.fromtimestamp(create_time)
time_str = timestamp.strftime('%H:%M:%S') # change to YYYY-MM-DD HH:MM:SS
time_str = timestamp.strftime('%Y-%m-%d %H:%M:%S')
except: except:
time_str = str(create_time) time_str = str(create_time)
else: else:
time_str = "Unknown" time_str = "Unknown"
sender = "Me" if is_sent_from_self else "Contact" sender = "[Me]" if is_sent_from_self else "[Contact]"
message_parts.append(f"[{time_str}] {sender}: {readable_text}") message_parts.append(f"({time_str}) {sender}: {readable_text}")
concatenated_text = "\n".join(message_parts) concatenated_text = "\n".join(message_parts)
@@ -354,13 +355,11 @@ Messages ({len(messages)} messages, {message_group['total_length']} chars):
{concatenated_text} {concatenated_text}
""" """
# TODO @yichuan give better format and rich info here!
doc_content = f""" doc_content = f"""
Contact: {contact_name}
{concatenated_text} {concatenated_text}
""" """
return doc_content return doc_content, contact_name
def load_data(self, input_dir: str = None, **load_kwargs: Any) -> List[Document]: def load_data(self, input_dir: str = None, **load_kwargs: Any) -> List[Document]:
""" """
@@ -441,8 +440,8 @@ Contact: {contact_name}
if count >= max_count and max_count > 0: if count >= max_count and max_count > 0:
break break
doc_content = self._create_concatenated_content(message_group, contact_name) doc_content, contact_name = self._create_concatenated_content(message_group, contact_name)
doc = Document(text=doc_content, metadata={}) doc = Document(text=doc_content, metadata={"contact_name": contact_name})
docs.append(doc) docs.append(doc)
count += 1 count += 1

View File

@@ -52,7 +52,7 @@ def create_leann_index_from_multiple_wechat_exports(
documents = reader.load_data( documents = reader.load_data(
wechat_export_dir=str(export_dir), wechat_export_dir=str(export_dir),
max_count=max_count, max_count=max_count,
concatenate_messages=False, # Disable concatenation - one message per document concatenate_messages=True, # Enable concatenation - group messages into combined documents
) )
if documents: if documents:
print(f"Loaded {len(documents)} chat documents from {export_dir}") print(f"Loaded {len(documents)} chat documents from {export_dir}")
@@ -78,7 +78,7 @@ def create_leann_index_from_multiple_wechat_exports(
) )
# Create text splitter with 256 chunk size # Create text splitter with 256 chunk size
text_splitter = SentenceSplitter(chunk_size=128, chunk_overlap=64) text_splitter = SentenceSplitter(chunk_size=256, chunk_overlap=128)
# Convert Documents to text strings and chunk them # Convert Documents to text strings and chunk them
all_texts = [] all_texts = []
@@ -86,7 +86,8 @@ def create_leann_index_from_multiple_wechat_exports(
# Split the document into chunks # Split the document into chunks
nodes = text_splitter.get_nodes_from_documents([doc]) nodes = text_splitter.get_nodes_from_documents([doc])
for node in nodes: for node in nodes:
all_texts.append(node.get_content()) text = '[Contact] means the message is from: ' + doc.metadata["contact_name"] + '\n' + node.get_content()
all_texts.append(text)
print( print(
f"Created {len(all_texts)} text chunks from {len(all_documents)} documents" f"Created {len(all_texts)} text chunks from {len(all_documents)} documents"
@@ -224,7 +225,7 @@ async def query_leann_index(index_path: str, query: str):
query, query,
top_k=20, top_k=20,
recompute_beighbor_embeddings=True, recompute_beighbor_embeddings=True,
complexity=32, complexity=16,
beam_width=1, beam_width=1,
llm_config={ llm_config={
"type": "openai", "type": "openai",
@@ -252,13 +253,13 @@ async def main():
parser.add_argument( parser.add_argument(
"--index-dir", "--index-dir",
type=str, type=str,
default="./wechat_history_magic_test", default="./wechat_history_magic_test_11Debug_new",
help="Directory to store the LEANN index (default: ./wechat_history_index_leann_test)", help="Directory to store the LEANN index (default: ./wechat_history_index_leann_test)",
) )
parser.add_argument( parser.add_argument(
"--max-entries", "--max-entries",
type=int, type=int,
default=5000, default=50,
help="Maximum number of chat entries to process (default: 5000)", help="Maximum number of chat entries to process (default: 5000)",
) )
parser.add_argument( parser.add_argument(