Change WeChat app split logic

This commit is contained in:
yichuan520030910320
2025-07-19 19:43:30 -07:00
parent e117743d24
commit 0796a52df1
9 changed files with 112 additions and 54 deletions

View File

@@ -170,7 +170,7 @@ This demo showcases how to build a RAG system for PDF/md documents using Leann.
- **🚀 High-throughput Embedding Pipeline** - Optimized batched processing for maximum efficiency - **🚀 High-throughput Embedding Pipeline** - Optimized batched processing for maximum efficiency
- **🎯 Two-level Search** - Novel coarse-to-fine search overlap for accelerated query processing (optional) - **🎯 Two-level Search** - Novel coarse-to-fine search overlap for accelerated query processing (optional)
- **💾 Memory-mapped Indices** - Fast startup with raw text mapping to reduce memory overhead - **💾 Memory-mapped Indices** - Fast startup with raw text mapping to reduce memory overhead
- **🚀 MLX Support** - Ultra-fast recompute with quantized embedding models, accelerating building and search by 10-100x ([minimal example](test/build_mlx_index.py)) - **🚀 MLX Support** - Ultra-fast recompute/build with quantized embedding models, accelerating building and search ([minimal example](test/build_mlx_index.py))
### 🎨 Developer Experience ### 🎨 Developer Experience

View File

@@ -190,16 +190,16 @@ class WeChatHistoryReader(BaseReader):
return False return False
def _concatenate_messages(self, messages: List[Dict], min_length: int = 128, max_length: int = 1000, def _concatenate_messages(self, messages: List[Dict], max_length: int = 128,
time_window_minutes: int = 30) -> List[Dict]: time_window_minutes: int = 30, overlap_messages: int = 0) -> List[Dict]:
""" """
Concatenate messages based on length and time rules. Concatenate messages based on length and time rules.
Args: Args:
messages: List of message dictionaries messages: List of message dictionaries
min_length: Minimum length for concatenated message groups
max_length: Maximum length for concatenated message groups max_length: Maximum length for concatenated message groups
time_window_minutes: Time window in minutes to group messages together time_window_minutes: Time window in minutes to group messages together
overlap_messages: Number of messages to overlap between consecutive groups
Returns: Returns:
List of concatenated message groups List of concatenated message groups
@@ -235,37 +235,46 @@ class WeChatHistoryReader(BaseReader):
time_diff_minutes = (create_time - last_timestamp) / 60 time_diff_minutes = (create_time - last_timestamp) / 60
if time_diff_minutes > time_window_minutes: if time_diff_minutes > time_window_minutes:
# Time gap too large, start new group # Time gap too large, start new group
if current_group and current_length >= min_length: if current_group:
concatenated_groups.append({ concatenated_groups.append({
'messages': current_group, 'messages': current_group,
'total_length': current_length, 'total_length': current_length,
'start_time': current_group[0].get('createTime', 0), 'start_time': current_group[0].get('createTime', 0),
'end_time': current_group[-1].get('createTime', 0) 'end_time': current_group[-1].get('createTime', 0)
}) })
current_group = [] # Keep last few messages for overlap
current_length = 0 if overlap_messages > 0 and len(current_group) > overlap_messages:
current_group = current_group[-overlap_messages:]
current_length = sum(len(self._extract_readable_text(msg.get('content', '')) or msg.get('message', '')) for msg in current_group)
else:
current_group = []
current_length = 0
# Check length constraint # Check length constraint
message_length = len(readable_text) message_length = len(readable_text)
if current_length + message_length > max_length and current_group: if current_length + message_length > max_length and current_group:
# Current group would exceed max length, save it and start new # Current group would exceed max length, save it and start new
if current_length >= min_length: concatenated_groups.append({
concatenated_groups.append({ 'messages': current_group,
'messages': current_group, 'total_length': current_length,
'total_length': current_length, 'start_time': current_group[0].get('createTime', 0),
'start_time': current_group[0].get('createTime', 0), 'end_time': current_group[-1].get('createTime', 0)
'end_time': current_group[-1].get('createTime', 0) })
}) # Keep last few messages for overlap
current_group = [] if overlap_messages > 0 and len(current_group) > overlap_messages:
current_length = 0 current_group = current_group[-overlap_messages:]
current_length = sum(len(self._extract_readable_text(msg.get('content', '')) or msg.get('message', '')) for msg in current_group)
else:
current_group = []
current_length = 0
# Add message to current group # Add message to current group
current_group.append(message) current_group.append(message)
current_length += message_length current_length += message_length
last_timestamp = create_time last_timestamp = create_time
# Add the last group if it meets minimum length # Add the last group if it exists
if current_group and current_length >= min_length: if current_group:
concatenated_groups.append({ concatenated_groups.append({
'messages': current_group, 'messages': current_group,
'total_length': current_length, 'total_length': current_length,
@@ -343,6 +352,12 @@ Contact: {contact_name}
Time Range: {start_time_str} - {end_time_str} Time Range: {start_time_str} - {end_time_str}
Messages ({len(messages)} messages, {message_group['total_length']} chars): Messages ({len(messages)} messages, {message_group['total_length']} chars):
{concatenated_text}
"""
doc_content = f"""
Contact: {contact_name}
{concatenated_text} {concatenated_text}
""" """
return doc_content return doc_content
@@ -358,16 +373,15 @@ Messages ({len(messages)} messages, {message_group['total_length']} chars):
wechat_export_dir (str): Custom path to WeChat export directory. wechat_export_dir (str): Custom path to WeChat export directory.
include_non_text (bool): Whether to include non-text messages (images, emojis, etc.) include_non_text (bool): Whether to include non-text messages (images, emojis, etc.)
concatenate_messages (bool): Whether to concatenate messages based on length rules. concatenate_messages (bool): Whether to concatenate messages based on length rules.
min_length (int): Minimum length for concatenated message groups (default: 128).
max_length (int): Maximum length for concatenated message groups (default: 1000). max_length (int): Maximum length for concatenated message groups (default: 1000).
time_window_minutes (int): Time window in minutes to group messages together (default: 30). time_window_minutes (int): Time window in minutes to group messages together (default: 30).
overlap_messages (int): Number of messages to overlap between consecutive groups (default: 2).
""" """
docs: List[Document] = [] docs: List[Document] = []
max_count = load_kwargs.get('max_count', 1000) max_count = load_kwargs.get('max_count', 1000)
wechat_export_dir = load_kwargs.get('wechat_export_dir', None) wechat_export_dir = load_kwargs.get('wechat_export_dir', None)
include_non_text = load_kwargs.get('include_non_text', False) include_non_text = load_kwargs.get('include_non_text', False)
concatenate_messages = load_kwargs.get('concatenate_messages', False) concatenate_messages = load_kwargs.get('concatenate_messages', False)
min_length = load_kwargs.get('min_length', 128)
max_length = load_kwargs.get('max_length', 1000) max_length = load_kwargs.get('max_length', 1000)
time_window_minutes = load_kwargs.get('time_window_minutes', 30) time_window_minutes = load_kwargs.get('time_window_minutes', 30)
@@ -417,9 +431,9 @@ Messages ({len(messages)} messages, {message_group['total_length']} chars):
# Concatenate messages based on rules # Concatenate messages based on rules
message_groups = self._concatenate_messages( message_groups = self._concatenate_messages(
readable_messages, readable_messages,
min_length=min_length,
max_length=max_length, max_length=max_length,
time_window_minutes=time_window_minutes time_window_minutes=time_window_minutes,
overlap_messages=2 # Keep 2 messages overlap between groups
) )
# Create documents from concatenated groups # Create documents from concatenated groups

View File

@@ -52,7 +52,7 @@ def create_leann_index_from_multiple_wechat_exports(
documents = reader.load_data( documents = reader.load_data(
wechat_export_dir=str(export_dir), wechat_export_dir=str(export_dir),
max_count=max_count, max_count=max_count,
concatenate_messages=False, # Disable concatenation - one message per document concatenate_messages=True, # Disable concatenation - one message per document
) )
if documents: if documents:
print(f"Loaded {len(documents)} chat documents from {export_dir}") print(f"Loaded {len(documents)} chat documents from {export_dir}")
@@ -222,9 +222,9 @@ async def query_leann_index(index_path: str, query: str):
print(f"You: {query}") print(f"You: {query}")
chat_response = chat.ask( chat_response = chat.ask(
query, query,
top_k=5, top_k=20,
recompute_beighbor_embeddings=True, recompute_beighbor_embeddings=True,
complexity=32, complexity=64,
beam_width=1, beam_width=1,
llm_config={ llm_config={
"type": "openai", "type": "openai",
@@ -252,7 +252,7 @@ async def main():
parser.add_argument( parser.add_argument(
"--index-dir", "--index-dir",
type=str, type=str,
default="./wechat_history_index_leann_test", default="./wechat_history_june19_test",
help="Directory to store the LEANN index (default: ./wechat_history_index_leann_test)", help="Directory to store the LEANN index (default: ./wechat_history_index_leann_test)",
) )
parser.add_argument( parser.add_argument(

View File

@@ -600,7 +600,7 @@ def create_embedding_server_thread(
chunk_ids = node_ids[i:end_idx] chunk_ids = node_ids[i:end_idx]
if embedding_mode == "mlx": if embedding_mode == "mlx":
embeddings_chunk = compute_embeddings_mlx(chunk_texts, model_name) embeddings_chunk = compute_embeddings_mlx(chunk_texts, model_name, batch_size=16)
elif embedding_mode == "openai": elif embedding_mode == "openai":
embeddings_chunk = compute_embeddings_openai(chunk_texts, model_name) embeddings_chunk = compute_embeddings_openai(chunk_texts, model_name)
else: # sentence-transformers else: # sentence-transformers
@@ -617,7 +617,7 @@ def create_embedding_server_thread(
print(f"INFO: Combined embeddings shape: {hidden.shape}") print(f"INFO: Combined embeddings shape: {hidden.shape}")
else: else:
if embedding_mode == "mlx": if embedding_mode == "mlx":
hidden = compute_embeddings_mlx(texts, model_name) hidden = compute_embeddings_mlx(texts, model_name, batch_size=16)
elif embedding_mode == "openai": elif embedding_mode == "openai":
hidden = compute_embeddings_openai(texts, model_name) hidden = compute_embeddings_openai(texts, model_name)
else: # sentence-transformers else: # sentence-transformers

View File

@@ -423,7 +423,7 @@ def create_hnsw_embedding_server(
from leann.api import compute_embeddings from leann.api import compute_embeddings
# Compute embeddings using MLX # Compute embeddings using MLX
embeddings = compute_embeddings(texts_batch, model_name, use_mlx=True) embeddings = compute_embeddings(texts_batch, model_name, mode="mlx", use_server=False)
print( print(
f"[leann_backend_hnsw.hnsw_embedding_server LOG]: MLX embeddings computed for {len(texts_batch)} texts" f"[leann_backend_hnsw.hnsw_embedding_server LOG]: MLX embeddings computed for {len(texts_batch)} texts"

View File

@@ -11,7 +11,8 @@ requires-python = ">=3.9"
license = { text = "MIT" } license = { text = "MIT" }
dependencies = [ dependencies = [
"numpy>=1.20.0" "numpy>=1.20.0",
"tqdm>=4.60.0"
] ]
[tool.setuptools.packages.find] [tool.setuptools.packages.find]

View File

@@ -21,7 +21,8 @@ def compute_embeddings(
chunks: List[str], chunks: List[str],
model_name: str, model_name: str,
mode: str = "sentence-transformers", mode: str = "sentence-transformers",
use_server: bool = True use_server: bool = True,
use_mlx: bool = False # Backward compatibility: if True, override mode to 'mlx'
) -> np.ndarray: ) -> np.ndarray:
""" """
Computes embeddings using different backends. Computes embeddings using different backends.
@@ -38,12 +39,16 @@ def compute_embeddings(
Returns: Returns:
numpy array of embeddings numpy array of embeddings
""" """
# Override mode for backward compatibility
if use_mlx:
mode = "mlx"
# Auto-detect mode based on model name if not explicitly set # Auto-detect mode based on model name if not explicitly set
if mode == "sentence-transformers" and model_name.startswith("text-embedding-"): if mode == "sentence-transformers" and model_name.startswith("text-embedding-"):
mode = "openai" mode = "openai"
if mode == "mlx": if mode == "mlx":
return compute_embeddings_mlx(chunks, model_name) return compute_embeddings_mlx(chunks, model_name, batch_size=16)
elif mode == "openai": elif mode == "openai":
return compute_embeddings_openai(chunks, model_name) return compute_embeddings_openai(chunks, model_name)
elif mode == "sentence-transformers": elif mode == "sentence-transformers":
@@ -144,7 +149,7 @@ def _compute_embeddings_sentence_transformers_direct(chunks: List[str], model_na
# Generate embeddings # Generate embeddings
# give use an warning if OOM here means we need to turn down the batch size # give use an warning if OOM here means we need to turn down the batch size
embeddings = model.encode( embeddings = model.encode(
chunks, convert_to_numpy=True, show_progress_bar=True, batch_size=8 chunks, convert_to_numpy=True, show_progress_bar=True, batch_size=16
) )
return embeddings return embeddings
@@ -173,9 +178,17 @@ def compute_embeddings_openai(chunks: List[str], model_name: str) -> np.ndarray:
max_batch_size = 100 # Conservative batch size max_batch_size = 100 # Conservative batch size
all_embeddings = [] all_embeddings = []
for i in range(0, len(chunks), max_batch_size): try:
from tqdm import tqdm
total_batches = (len(chunks) + max_batch_size - 1) // max_batch_size
batch_range = range(0, len(chunks), max_batch_size)
batch_iterator = tqdm(batch_range, desc="Computing embeddings", unit="batch", total=total_batches)
except ImportError:
# Fallback without progress bar
batch_iterator = range(0, len(chunks), max_batch_size)
for i in batch_iterator:
batch_chunks = chunks[i:i + max_batch_size] batch_chunks = chunks[i:i + max_batch_size]
print(f"INFO: Processing batch {i//max_batch_size + 1}/{(len(chunks) + max_batch_size - 1)//max_batch_size}")
try: try:
response = client.embeddings.create( response = client.embeddings.create(
@@ -193,42 +206,64 @@ def compute_embeddings_openai(chunks: List[str], model_name: str) -> np.ndarray:
return embeddings return embeddings
def compute_embeddings_mlx(chunks: List[str], model_name: str) -> np.ndarray: def compute_embeddings_mlx(chunks: List[str], model_name: str, batch_size: int = 16) -> np.ndarray:
"""Computes embeddings using an MLX model.""" """Computes embeddings using an MLX model."""
try: try:
import mlx.core as mx import mlx.core as mx
from mlx_lm.utils import load from mlx_lm.utils import load
from tqdm import tqdm
except ImportError as e: except ImportError as e:
raise RuntimeError( raise RuntimeError(
"MLX or related libraries not available. Install with: uv pip install mlx mlx-lm" "MLX or related libraries not available. Install with: uv pip install mlx mlx-lm"
) from e ) from e
print( print(
f"INFO: Computing embeddings for {len(chunks)} chunks using MLX model '{model_name}'..." f"INFO: Computing embeddings for {len(chunks)} chunks using MLX model '{model_name}' with batch_size={batch_size}..."
) )
# Load model and tokenizer # Load model and tokenizer
model, tokenizer = load(model_name) model, tokenizer = load(model_name)
# Process each chunk # Process chunks in batches with progress bar
all_embeddings = [] all_embeddings = []
for chunk in chunks:
# Tokenize try:
token_ids = tokenizer.encode(chunk) # type: ignore from tqdm import tqdm
batch_iterator = tqdm(range(0, len(chunks), batch_size), desc="Computing embeddings", unit="batch")
except ImportError:
batch_iterator = range(0, len(chunks), batch_size)
for i in batch_iterator:
batch_chunks = chunks[i:i + batch_size]
# Tokenize all chunks in the batch
batch_token_ids = []
for chunk in batch_chunks:
token_ids = tokenizer.encode(chunk) # type: ignore
batch_token_ids.append(token_ids)
# Pad sequences to the same length for batch processing
max_length = max(len(ids) for ids in batch_token_ids)
padded_token_ids = []
for token_ids in batch_token_ids:
# Pad with tokenizer.pad_token_id or 0
padded = token_ids + [0] * (max_length - len(token_ids))
padded_token_ids.append(padded)
# Convert to MLX array with batch dimension
input_ids = mx.array(padded_token_ids)
# Convert to MLX array and add batch dimension # Get embeddings for the batch
input_ids = mx.array([token_ids])
# Get embeddings
embeddings = model(input_ids) embeddings = model(input_ids)
# Mean pooling (since we only have one sequence, just take the mean) # Mean pooling for each sequence in the batch
pooled = embeddings.mean(axis=1) # Shape: (1, hidden_size) pooled = embeddings.mean(axis=1) # Shape: (batch_size, hidden_size)
# Convert individual embedding to numpy via list (to handle bfloat16) # Convert batch embeddings to numpy
pooled_list = pooled[0].tolist() # Remove batch dimension and convert to list for j in range(len(batch_chunks)):
pooled_numpy = np.array(pooled_list, dtype=np.float32) pooled_list = pooled[j].tolist() # Convert to list
all_embeddings.append(pooled_numpy) pooled_numpy = np.array(pooled_list, dtype=np.float32)
all_embeddings.append(pooled_numpy)
# Stack numpy arrays # Stack numpy arrays
return np.stack(all_embeddings) return np.stack(all_embeddings)
@@ -294,6 +329,8 @@ class LeannBuilder:
self.dimensions = dimensions self.dimensions = dimensions
self.embedding_mode = embedding_mode self.embedding_mode = embedding_mode
self.backend_kwargs = backend_kwargs self.backend_kwargs = backend_kwargs
if 'mlx' in self.embedding_model:
self.embedding_mode = "mlx"
self.chunks: List[Dict[str, Any]] = [] self.chunks: List[Dict[str, Any]] = []
def add_text(self, text: str, metadata: Optional[Dict[str, Any]] = None): def add_text(self, text: str, metadata: Optional[Dict[str, Any]] = None):
@@ -318,7 +355,13 @@ class LeannBuilder:
offset_file = index_dir / f"{index_name}.passages.idx" offset_file = index_dir / f"{index_name}.passages.idx"
offset_map = {} offset_map = {}
with open(passages_file, "w", encoding="utf-8") as f: with open(passages_file, "w", encoding="utf-8") as f:
for chunk in self.chunks: try:
from tqdm import tqdm
chunk_iterator = tqdm(self.chunks, desc="Writing passages", unit="chunk")
except ImportError:
chunk_iterator = self.chunks
for chunk in chunk_iterator:
offset = f.tell() offset = f.tell()
json.dump( json.dump(
{ {

View File

@@ -175,7 +175,7 @@ class EmbeddingServerManager:
self.backend_module_name = backend_module_name self.backend_module_name = backend_module_name
self.server_process: Optional[subprocess.Popen] = None self.server_process: Optional[subprocess.Popen] = None
self.server_port: Optional[int] = None self.server_port: Optional[int] = None
# atexit.register(self.stop_server) atexit.register(self.stop_server)
def start_server(self, port: int, model_name: str, embedding_mode: str = "sentence-transformers", **kwargs) -> bool: def start_server(self, port: int, model_name: str, embedding_mode: str = "sentence-transformers", **kwargs) -> bool:
""" """

View File

@@ -264,7 +264,7 @@ def run_mlx_benchmark():
} }
config = BenchmarkConfig( config = BenchmarkConfig(
model_path="mlx-community/Qwen3-Embedding-0.6B-4bit-DWQ", model_path="mlx-community/all-MiniLM-L6-v2-4bit",
use_mlx=True use_mlx=True
) )