Update the WeChat reader; the bug introduced in 1c5fec5 still needs to be fixed
This commit is contained in:
4
.gitmodules
vendored
4
.gitmodules
vendored
@@ -1,9 +1,9 @@
|
|||||||
[submodule "packages/leann-backend-diskann/third_party/DiskANN"]
|
[submodule "packages/leann-backend-diskann/third_party/DiskANN"]
|
||||||
path = packages/leann-backend-diskann/third_party/DiskANN
|
path = packages/leann-backend-diskann/third_party/DiskANN
|
||||||
url = https://github.com/yichuan520030910320/DiskANN.git
|
url = https://github.com/yichuan-w/DiskANN.git
|
||||||
[submodule "packages/leann-backend-hnsw/third_party/faiss"]
|
[submodule "packages/leann-backend-hnsw/third_party/faiss"]
|
||||||
path = packages/leann-backend-hnsw/third_party/faiss
|
path = packages/leann-backend-hnsw/third_party/faiss
|
||||||
url = https://github.com/yichuan520030910320/faiss.git
|
url = https://github.com/yichuan-w/faiss.git
|
||||||
[submodule "packages/leann-backend-hnsw/third_party/msgpack-c"]
|
[submodule "packages/leann-backend-hnsw/third_party/msgpack-c"]
|
||||||
path = packages/leann-backend-hnsw/third_party/msgpack-c
|
path = packages/leann-backend-hnsw/third_party/msgpack-c
|
||||||
url = https://github.com/msgpack/msgpack-c.git
|
url = https://github.com/msgpack/msgpack-c.git
|
||||||
|
|||||||
@@ -63,7 +63,7 @@ Traditional RAG systems often require trade-offs between storage, privacy, and u
|
|||||||
### Installation
|
### Installation
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
git clone git@github.com:yichuan520030910320/LEANN-RAG.git leann
|
git clone git@github.com:yichuan-w/LEANN.git leann
|
||||||
cd leann
|
cd leann
|
||||||
git submodule update --init --recursive
|
git submodule update --init --recursive
|
||||||
```
|
```
|
||||||
|
|||||||
@@ -197,8 +197,8 @@ class WeChatHistoryReader(BaseReader):
|
|||||||
|
|
||||||
Args:
|
Args:
|
||||||
messages: List of message dictionaries
|
messages: List of message dictionaries
|
||||||
max_length: Maximum length for concatenated message groups
|
max_length: Maximum length for concatenated message groups. Use -1 to disable length constraint.
|
||||||
time_window_minutes: Time window in minutes to group messages together
|
time_window_minutes: Time window in minutes to group messages together. Use -1 to disable time constraint.
|
||||||
overlap_messages: Number of messages to overlap between consecutive groups
|
overlap_messages: Number of messages to overlap between consecutive groups
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
@@ -230,8 +230,8 @@ class WeChatHistoryReader(BaseReader):
|
|||||||
if not readable_text.strip():
|
if not readable_text.strip():
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Check time window constraint
|
# Check time window constraint (only if time_window_minutes != -1)
|
||||||
if last_timestamp is not None and create_time > 0:
|
if time_window_minutes != -1 and last_timestamp is not None and create_time > 0:
|
||||||
time_diff_minutes = (create_time - last_timestamp) / 60
|
time_diff_minutes = (create_time - last_timestamp) / 60
|
||||||
if time_diff_minutes > time_window_minutes:
|
if time_diff_minutes > time_window_minutes:
|
||||||
# Time gap too large, start new group
|
# Time gap too large, start new group
|
||||||
@@ -250,9 +250,9 @@ class WeChatHistoryReader(BaseReader):
|
|||||||
current_group = []
|
current_group = []
|
||||||
current_length = 0
|
current_length = 0
|
||||||
|
|
||||||
# Check length constraint
|
# Check length constraint (only if max_length != -1)
|
||||||
message_length = len(readable_text)
|
message_length = len(readable_text)
|
||||||
if current_length + message_length > max_length and current_group:
|
if max_length != -1 and current_length + message_length > max_length and current_group:
|
||||||
# Current group would exceed max length, save it and start new
|
# Current group would exceed max length, save it and start new
|
||||||
concatenated_groups.append({
|
concatenated_groups.append({
|
||||||
'messages': current_group,
|
'messages': current_group,
|
||||||
@@ -431,9 +431,9 @@ Contact: {contact_name}
|
|||||||
# Concatenate messages based on rules
|
# Concatenate messages based on rules
|
||||||
message_groups = self._concatenate_messages(
|
message_groups = self._concatenate_messages(
|
||||||
readable_messages,
|
readable_messages,
|
||||||
max_length=max_length,
|
max_length=-1,
|
||||||
time_window_minutes=time_window_minutes,
|
time_window_minutes=-1,
|
||||||
overlap_messages=2 # Keep 2 messages overlap between groups
|
overlap_messages=0 # No message overlap between groups
|
||||||
)
|
)
|
||||||
|
|
||||||
# Create documents from concatenated groups
|
# Create documents from concatenated groups
|
||||||
|
|||||||
@@ -52,7 +52,7 @@ def create_leann_index_from_multiple_wechat_exports(
|
|||||||
documents = reader.load_data(
|
documents = reader.load_data(
|
||||||
wechat_export_dir=str(export_dir),
|
wechat_export_dir=str(export_dir),
|
||||||
max_count=max_count,
|
max_count=max_count,
|
||||||
concatenate_messages=True, # Disable concatenation - one message per document
|
concatenate_messages=False, # Disable concatenation - one message per document
|
||||||
)
|
)
|
||||||
if documents:
|
if documents:
|
||||||
print(f"Loaded {len(documents)} chat documents from {export_dir}")
|
print(f"Loaded {len(documents)} chat documents from {export_dir}")
|
||||||
@@ -78,7 +78,7 @@ def create_leann_index_from_multiple_wechat_exports(
|
|||||||
)
|
)
|
||||||
|
|
||||||
# Create text splitter with 256 chunk size
|
# Create text splitter with 128 chunk size and 64 overlap
|
||||||
text_splitter = SentenceSplitter(chunk_size=256, chunk_overlap=25)
|
text_splitter = SentenceSplitter(chunk_size=128, chunk_overlap=64)
|
||||||
|
|
||||||
# Convert Documents to text strings and chunk them
|
# Convert Documents to text strings and chunk them
|
||||||
all_texts = []
|
all_texts = []
|
||||||
@@ -224,7 +224,7 @@ async def query_leann_index(index_path: str, query: str):
|
|||||||
query,
|
query,
|
||||||
top_k=20,
|
top_k=20,
|
||||||
recompute_beighbor_embeddings=True,
|
recompute_beighbor_embeddings=True,
|
||||||
complexity=64,
|
complexity=128,
|
||||||
beam_width=1,
|
beam_width=1,
|
||||||
llm_config={
|
llm_config={
|
||||||
"type": "openai",
|
"type": "openai",
|
||||||
|
|||||||
Reference in New Issue
Block a user