Update the chat document structure for better performance

.gitignore (vendored): 4 changes
@@ -84,4 +84,6 @@ test_*.py
 packages/leann-backend-diskann/third_party/DiskANN/_deps/
 
 *.meta.json
 *.passages.json
+
+batchtest.py
@@ -152,8 +152,7 @@ python ./examples/main_cli_example.py
 ### Search Your Entire Life
 ```bash
 python examples/mail_reader_leann.py
-# "What did my boss say about the Christmas party last year?"
-# "Find all emails from my mom about birthday plans"
+# "What's the recommended number of classes per semester for incoming EECS students?"
 ```
 **90K emails → 14MB.** Finally, search your email like you search Google.
 
@@ -191,8 +190,7 @@ Once the index is built, you can ask questions like:
 ### Time Machine for the Web
 ```bash
 python examples/google_history_reader_leann.py
-# "What was that AI paper I read last month?"
-# "Show me all the cooking videos I watched"
+# "What does my browser history say about machine learning systems?"
 ```
 **38K browser entries → 6MB.** Your browser history becomes your personal search engine.
 
@@ -335,14 +335,15 @@ class WeChatHistoryReader(BaseReader):
             if create_time:
                 try:
                     timestamp = datetime.fromtimestamp(create_time)
-                    time_str = timestamp.strftime('%H:%M:%S')
+                    # Use full YYYY-MM-DD HH:MM:SS timestamps
+                    time_str = timestamp.strftime('%Y-%m-%d %H:%M:%S')
                 except:
                     time_str = str(create_time)
             else:
                 time_str = "Unknown"
 
-            sender = "Me" if is_sent_from_self else "Contact"
-            message_parts.append(f"[{time_str}] {sender}: {readable_text}")
+            sender = "[Me]" if is_sent_from_self else "[Contact]"
+            message_parts.append(f"({time_str}) {sender}: {readable_text}")
 
         concatenated_text = "\n".join(message_parts)
 
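
For reference, a minimal, self-contained sketch of the formatting change above, using a hypothetical epoch timestamp and message text in place of real WeChat records (the printed time depends on the local timezone):

```python
from datetime import datetime

# Hypothetical stand-ins for one WeChat message record
create_time = 1735130096          # epoch seconds
is_sent_from_self = True
readable_text = "see you at the lab tomorrow"

timestamp = datetime.fromtimestamp(create_time)

# Old format: time of day only, plain sender label, square brackets around the time
old_line = f"[{timestamp.strftime('%H:%M:%S')}] {'Me' if is_sent_from_self else 'Contact'}: {readable_text}"

# New format: full date and time, bracketed sender, parentheses around the timestamp
new_line = f"({timestamp.strftime('%Y-%m-%d %H:%M:%S')}) {'[Me]' if is_sent_from_self else '[Contact]'}: {readable_text}"

print(old_line)   # e.g. [12:34:56] Me: see you at the lab tomorrow
print(new_line)   # e.g. (2024-12-25 12:34:56) [Me]: see you at the lab tomorrow
```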
@@ -354,13 +355,11 @@ Messages ({len(messages)} messages, {message_group['total_length']} chars):
 
 {concatenated_text}
 """
-        # TODO @yichuan give better format and rich info here!
         doc_content = f"""
-Contact: {contact_name}
 
 {concatenated_text}
 """
-        return doc_content
+        return doc_content, contact_name
 
     def load_data(self, input_dir: str = None, **load_kwargs: Any) -> List[Document]:
         """
@@ -441,8 +440,8 @@ Contact: {contact_name}
                 if count >= max_count and max_count > 0:
                     break
 
-                doc_content = self._create_concatenated_content(message_group, contact_name)
-                doc = Document(text=doc_content, metadata={})
+                doc_content, contact_name = self._create_concatenated_content(message_group, contact_name)
+                doc = Document(text=doc_content, metadata={"contact_name": contact_name})
                 docs.append(doc)
                 count += 1
 
@@ -52,7 +52,7 @@ def create_leann_index_from_multiple_wechat_exports(
         documents = reader.load_data(
             wechat_export_dir=str(export_dir),
             max_count=max_count,
-            concatenate_messages=False, # Disable concatenation - one message per document
+            concatenate_messages=True, # Enable concatenation - group messages into one document
         )
         if documents:
             print(f"Loaded {len(documents)} chat documents from {export_dir}")
@@ -78,7 +78,7 @@ def create_leann_index_from_multiple_wechat_exports(
     )
 
     # Create text splitter with 256 chunk size
-    text_splitter = SentenceSplitter(chunk_size=128, chunk_overlap=64)
+    text_splitter = SentenceSplitter(chunk_size=256, chunk_overlap=128)
 
     # Convert Documents to text strings and chunk them
     all_texts = []
@@ -86,7 +86,8 @@ def create_leann_index_from_multiple_wechat_exports(
         # Split the document into chunks
         nodes = text_splitter.get_nodes_from_documents([doc])
        for node in nodes:
-            all_texts.append(node.get_content())
+            text = '[Contact] means the message is from: ' + doc.metadata["contact_name"] + '\n' + node.get_content()
+            all_texts.append(text)
 
     print(
         f"Created {len(all_texts)} text chunks from {len(all_documents)} documents"
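
Taken together with the reader changes above (contact_name returned and stored as Document metadata), here is a minimal sketch of how a contact now ends up prefixed onto every indexed chunk. It assumes llama_index's Document/SentenceSplitter API, uses a hypothetical contact and two toy messages, and applies the updated chunk_size=256 / chunk_overlap=128 settings from this commit:

```python
from llama_index.core import Document
from llama_index.core.node_parser import SentenceSplitter

# Hypothetical document shaped the way the updated reader now produces it:
# the chat text in the body, the contact carried as metadata.
doc = Document(
    text=(
        "(2025-01-02 20:15:09) [Contact]: dinner at 8?\n"
        "(2025-01-02 20:16:40) [Me]: works for me"
    ),
    metadata={"contact_name": "Alice"},
)

# Updated splitter settings (sizes are measured in tokens)
text_splitter = SentenceSplitter(chunk_size=256, chunk_overlap=128)

all_texts = []
for node in text_splitter.get_nodes_from_documents([doc]):
    # Prefix each chunk with the contact so the embedding (and the LLM at
    # query time) can tell whose conversation the chunk came from.
    text = (
        "[Contact] means the message is from: "
        + doc.metadata["contact_name"]
        + "\n"
        + node.get_content()
    )
    all_texts.append(text)

print(all_texts[0])
```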
@@ -224,7 +225,7 @@ async def query_leann_index(index_path: str, query: str):
         query,
         top_k=20,
         recompute_beighbor_embeddings=True,
-        complexity=32,
+        complexity=16,
         beam_width=1,
         llm_config={
             "type": "openai",
@@ -252,13 +253,13 @@ async def main():
     parser.add_argument(
         "--index-dir",
         type=str,
-        default="./wechat_history_magic_test",
+        default="./wechat_history_magic_test_11Debug_new",
         help="Directory to store the LEANN index (default: ./wechat_history_index_leann_test)",
     )
     parser.add_argument(
         "--max-entries",
         type=int,
-        default=5000,
+        default=50,
         help="Maximum number of chat entries to process (default: 5000)",
     )
     parser.add_argument(