diff --git a/README.md b/README.md index d78b290..9da3f85 100755 --- a/README.md +++ b/README.md @@ -170,7 +170,7 @@ This demo showcases how to build a RAG system for PDF/md documents using Leann. - **🚀 High-throughput Embedding Pipeline** - Optimized batched processing for maximum efficiency - **🎯 Two-level Search** - Novel coarse-to-fine search overlap for accelerated query processing (optional) - **💾 Memory-mapped Indices** - Fast startup with raw text mapping to reduce memory overhead -- **🚀 MLX Support** - Ultra-fast recompute with quantized embedding models, accelerating building and search by 10-100x ([minimal example](test/build_mlx_index.py)) +- **🚀 MLX Support** - Ultra-fast recompute/build with quantized embedding models, accelerating building and search ([minimal example](test/build_mlx_index.py)) ### 🎨 Developer Experience diff --git a/examples/history_data/wechat_history.py b/examples/history_data/wechat_history.py index d28854c..c3aee02 100644 --- a/examples/history_data/wechat_history.py +++ b/examples/history_data/wechat_history.py @@ -190,16 +190,16 @@ class WeChatHistoryReader(BaseReader): return False - def _concatenate_messages(self, messages: List[Dict], min_length: int = 128, max_length: int = 1000, - time_window_minutes: int = 30) -> List[Dict]: + def _concatenate_messages(self, messages: List[Dict], max_length: int = 128, + time_window_minutes: int = 30, overlap_messages: int = 0) -> List[Dict]: """ Concatenate messages based on length and time rules. 
Args: messages: List of message dictionaries - min_length: Minimum length for concatenated message groups max_length: Maximum length for concatenated message groups time_window_minutes: Time window in minutes to group messages together + overlap_messages: Number of messages to overlap between consecutive groups Returns: List of concatenated message groups @@ -235,37 +235,46 @@ class WeChatHistoryReader(BaseReader): time_diff_minutes = (create_time - last_timestamp) / 60 if time_diff_minutes > time_window_minutes: # Time gap too large, start new group - if current_group and current_length >= min_length: + if current_group: concatenated_groups.append({ 'messages': current_group, 'total_length': current_length, 'start_time': current_group[0].get('createTime', 0), 'end_time': current_group[-1].get('createTime', 0) }) - current_group = [] - current_length = 0 + # Keep last few messages for overlap + if overlap_messages > 0 and len(current_group) > overlap_messages: + current_group = current_group[-overlap_messages:] + current_length = sum(len(self._extract_readable_text(msg.get('content', '')) or msg.get('message', '')) for msg in current_group) + else: + current_group = [] + current_length = 0 # Check length constraint message_length = len(readable_text) if current_length + message_length > max_length and current_group: # Current group would exceed max length, save it and start new - if current_length >= min_length: - concatenated_groups.append({ - 'messages': current_group, - 'total_length': current_length, - 'start_time': current_group[0].get('createTime', 0), - 'end_time': current_group[-1].get('createTime', 0) - }) - current_group = [] - current_length = 0 + concatenated_groups.append({ + 'messages': current_group, + 'total_length': current_length, + 'start_time': current_group[0].get('createTime', 0), + 'end_time': current_group[-1].get('createTime', 0) + }) + # Keep last few messages for overlap + if overlap_messages > 0 and len(current_group) > overlap_messages: 
+ current_group = current_group[-overlap_messages:] + current_length = sum(len(self._extract_readable_text(msg.get('content', '')) or msg.get('message', '')) for msg in current_group) + else: + current_group = [] + current_length = 0 # Add message to current group current_group.append(message) current_length += message_length last_timestamp = create_time - # Add the last group if it meets minimum length - if current_group and current_length >= min_length: + # Add the last group if it exists + if current_group: concatenated_groups.append({ 'messages': current_group, 'total_length': current_length, @@ -343,6 +352,12 @@ Contact: {contact_name} Time Range: {start_time_str} - {end_time_str} Messages ({len(messages)} messages, {message_group['total_length']} chars): +{concatenated_text} +""" + + doc_content = f""" +Contact: {contact_name} + {concatenated_text} """ return doc_content @@ -358,16 +373,15 @@ Messages ({len(messages)} messages, {message_group['total_length']} chars): wechat_export_dir (str): Custom path to WeChat export directory. include_non_text (bool): Whether to include non-text messages (images, emojis, etc.) concatenate_messages (bool): Whether to concatenate messages based on length rules. - min_length (int): Minimum length for concatenated message groups (default: 128). max_length (int): Maximum length for concatenated message groups (default: 1000). time_window_minutes (int): Time window in minutes to group messages together (default: 30). + overlap_messages (int): Number of messages to overlap between consecutive groups (default: 2). 
""" docs: List[Document] = [] max_count = load_kwargs.get('max_count', 1000) wechat_export_dir = load_kwargs.get('wechat_export_dir', None) include_non_text = load_kwargs.get('include_non_text', False) concatenate_messages = load_kwargs.get('concatenate_messages', False) - min_length = load_kwargs.get('min_length', 128) max_length = load_kwargs.get('max_length', 1000) time_window_minutes = load_kwargs.get('time_window_minutes', 30) @@ -417,9 +431,9 @@ Messages ({len(messages)} messages, {message_group['total_length']} chars): # Concatenate messages based on rules message_groups = self._concatenate_messages( readable_messages, - min_length=min_length, max_length=max_length, - time_window_minutes=time_window_minutes + time_window_minutes=time_window_minutes, + overlap_messages=2 # Keep 2 messages overlap between groups ) # Create documents from concatenated groups diff --git a/examples/wechat_history_reader_leann.py b/examples/wechat_history_reader_leann.py index f92dbb7..3d1147e 100644 --- a/examples/wechat_history_reader_leann.py +++ b/examples/wechat_history_reader_leann.py @@ -52,7 +52,7 @@ def create_leann_index_from_multiple_wechat_exports( documents = reader.load_data( wechat_export_dir=str(export_dir), max_count=max_count, - concatenate_messages=False, # Disable concatenation - one message per document + concatenate_messages=True, # Enable concatenation - group messages into multi-message documents ) if documents: print(f"Loaded {len(documents)} chat documents from {export_dir}") @@ -222,9 +222,9 @@ async def query_leann_index(index_path: str, query: str): print(f"You: {query}") chat_response = chat.ask( query, - top_k=5, + top_k=20, recompute_beighbor_embeddings=True, - complexity=32, + complexity=64, beam_width=1, llm_config={ "type": "openai", @@ -252,7 +252,7 @@ async def main(): parser.add_argument( "--index-dir", type=str, - default="./wechat_history_index_leann_test", + default="./wechat_history_june19_test", help="Directory to store the LEANN index (default:
./wechat_history_index_leann_test)", ) parser.add_argument( diff --git a/packages/leann-backend-diskann/leann_backend_diskann/embedding_server.py b/packages/leann-backend-diskann/leann_backend_diskann/embedding_server.py index e3f719a..1096ae5 100644 --- a/packages/leann-backend-diskann/leann_backend_diskann/embedding_server.py +++ b/packages/leann-backend-diskann/leann_backend_diskann/embedding_server.py @@ -600,7 +600,7 @@ def create_embedding_server_thread( chunk_ids = node_ids[i:end_idx] if embedding_mode == "mlx": - embeddings_chunk = compute_embeddings_mlx(chunk_texts, model_name) + embeddings_chunk = compute_embeddings_mlx(chunk_texts, model_name, batch_size=16) elif embedding_mode == "openai": embeddings_chunk = compute_embeddings_openai(chunk_texts, model_name) else: # sentence-transformers @@ -617,7 +617,7 @@ def create_embedding_server_thread( print(f"INFO: Combined embeddings shape: {hidden.shape}") else: if embedding_mode == "mlx": - hidden = compute_embeddings_mlx(texts, model_name) + hidden = compute_embeddings_mlx(texts, model_name, batch_size=16) elif embedding_mode == "openai": hidden = compute_embeddings_openai(texts, model_name) else: # sentence-transformers diff --git a/packages/leann-backend-hnsw/leann_backend_hnsw/hnsw_embedding_server.py b/packages/leann-backend-hnsw/leann_backend_hnsw/hnsw_embedding_server.py index db302fd..579f2bb 100644 --- a/packages/leann-backend-hnsw/leann_backend_hnsw/hnsw_embedding_server.py +++ b/packages/leann-backend-hnsw/leann_backend_hnsw/hnsw_embedding_server.py @@ -423,7 +423,7 @@ def create_hnsw_embedding_server( from leann.api import compute_embeddings # Compute embeddings using MLX - embeddings = compute_embeddings(texts_batch, model_name, use_mlx=True) + embeddings = compute_embeddings(texts_batch, model_name, mode="mlx", use_server=False) print( f"[leann_backend_hnsw.hnsw_embedding_server LOG]: MLX embeddings computed for {len(texts_batch)} texts" diff --git a/packages/leann-core/pyproject.toml 
b/packages/leann-core/pyproject.toml index 7936d53..7f64793 100644 --- a/packages/leann-core/pyproject.toml +++ b/packages/leann-core/pyproject.toml @@ -11,7 +11,8 @@ requires-python = ">=3.9" license = { text = "MIT" } dependencies = [ - "numpy>=1.20.0" + "numpy>=1.20.0", + "tqdm>=4.60.0" ] [tool.setuptools.packages.find] diff --git a/packages/leann-core/src/leann/api.py b/packages/leann-core/src/leann/api.py index c6e43ab..19546a6 100644 --- a/packages/leann-core/src/leann/api.py +++ b/packages/leann-core/src/leann/api.py @@ -21,7 +21,8 @@ def compute_embeddings( chunks: List[str], model_name: str, mode: str = "sentence-transformers", - use_server: bool = True + use_server: bool = True, + use_mlx: bool = False # Backward compatibility: if True, override mode to 'mlx' ) -> np.ndarray: """ Computes embeddings using different backends. @@ -38,12 +39,16 @@ def compute_embeddings( Returns: numpy array of embeddings """ + # Override mode for backward compatibility + if use_mlx: + mode = "mlx" + # Auto-detect mode based on model name if not explicitly set if mode == "sentence-transformers" and model_name.startswith("text-embedding-"): mode = "openai" if mode == "mlx": - return compute_embeddings_mlx(chunks, model_name) + return compute_embeddings_mlx(chunks, model_name, batch_size=16) elif mode == "openai": return compute_embeddings_openai(chunks, model_name) elif mode == "sentence-transformers": @@ -144,7 +149,7 @@ def _compute_embeddings_sentence_transformers_direct(chunks: List[str], model_na # Generate embeddings # give use an warning if OOM here means we need to turn down the batch size embeddings = model.encode( - chunks, convert_to_numpy=True, show_progress_bar=True, batch_size=8 + chunks, convert_to_numpy=True, show_progress_bar=True, batch_size=16 ) return embeddings @@ -173,9 +178,17 @@ def compute_embeddings_openai(chunks: List[str], model_name: str) -> np.ndarray: max_batch_size = 100 # Conservative batch size all_embeddings = [] - for i in range(0, 
len(chunks), max_batch_size): + try: + from tqdm import tqdm + total_batches = (len(chunks) + max_batch_size - 1) // max_batch_size + batch_range = range(0, len(chunks), max_batch_size) + batch_iterator = tqdm(batch_range, desc="Computing embeddings", unit="batch", total=total_batches) + except ImportError: + # Fallback without progress bar + batch_iterator = range(0, len(chunks), max_batch_size) + + for i in batch_iterator: batch_chunks = chunks[i:i + max_batch_size] - print(f"INFO: Processing batch {i//max_batch_size + 1}/{(len(chunks) + max_batch_size - 1)//max_batch_size}") try: response = client.embeddings.create( @@ -193,42 +206,64 @@ def compute_embeddings_openai(chunks: List[str], model_name: str) -> np.ndarray: return embeddings -def compute_embeddings_mlx(chunks: List[str], model_name: str) -> np.ndarray: +def compute_embeddings_mlx(chunks: List[str], model_name: str, batch_size: int = 16) -> np.ndarray: """Computes embeddings using an MLX model.""" try: import mlx.core as mx from mlx_lm.utils import load + from tqdm import tqdm except ImportError as e: raise RuntimeError( "MLX or related libraries not available. Install with: uv pip install mlx mlx-lm" ) from e print( - f"INFO: Computing embeddings for {len(chunks)} chunks using MLX model '{model_name}'..." + f"INFO: Computing embeddings for {len(chunks)} chunks using MLX model '{model_name}' with batch_size={batch_size}..." 
) # Load model and tokenizer model, tokenizer = load(model_name) - # Process each chunk + # Process chunks in batches with progress bar all_embeddings = [] - for chunk in chunks: - # Tokenize - token_ids = tokenizer.encode(chunk) # type: ignore + + try: + from tqdm import tqdm + batch_iterator = tqdm(range(0, len(chunks), batch_size), desc="Computing embeddings", unit="batch") + except ImportError: + batch_iterator = range(0, len(chunks), batch_size) + + for i in batch_iterator: + batch_chunks = chunks[i:i + batch_size] + + # Tokenize all chunks in the batch + batch_token_ids = [] + for chunk in batch_chunks: + token_ids = tokenizer.encode(chunk) # type: ignore + batch_token_ids.append(token_ids) + + # Pad sequences to the same length for batch processing + max_length = max(len(ids) for ids in batch_token_ids) + padded_token_ids = [] + for token_ids in batch_token_ids: + # Pad with a literal 0 (tokenizer.pad_token_id is not consulted here) + padded = token_ids + [0] * (max_length - len(token_ids)) + padded_token_ids.append(padded) + + # Convert to MLX array with batch dimension + input_ids = mx.array(padded_token_ids) - # Convert to MLX array and add batch dimension - input_ids = mx.array([token_ids]) - - # Get embeddings + # Get embeddings for the batch embeddings = model(input_ids) - # Mean pooling (since we only have one sequence, just take the mean) - pooled = embeddings.mean(axis=1) # Shape: (1, hidden_size) + # Mean pooling for each sequence in the batch; NOTE(review): no mask is applied, so padded positions appear to be averaged in - confirm + pooled = embeddings.mean(axis=1) # Shape: (batch_size, hidden_size) - # Convert individual embedding to numpy via list (to handle bfloat16) - pooled_list = pooled[0].tolist() # Remove batch dimension and convert to list - pooled_numpy = np.array(pooled_list, dtype=np.float32) - all_embeddings.append(pooled_numpy) + # Convert batch embeddings to numpy + for j in range(len(batch_chunks)): + pooled_list = pooled[j].tolist() # Convert to list + pooled_numpy = np.array(pooled_list, dtype=np.float32) + all_embeddings.append(pooled_numpy) #
Stack numpy arrays return np.stack(all_embeddings) @@ -294,6 +329,8 @@ class LeannBuilder: self.dimensions = dimensions self.embedding_mode = embedding_mode self.backend_kwargs = backend_kwargs + if 'mlx' in self.embedding_model: + self.embedding_mode = "mlx" self.chunks: List[Dict[str, Any]] = [] def add_text(self, text: str, metadata: Optional[Dict[str, Any]] = None): @@ -318,7 +355,13 @@ class LeannBuilder: offset_file = index_dir / f"{index_name}.passages.idx" offset_map = {} with open(passages_file, "w", encoding="utf-8") as f: - for chunk in self.chunks: + try: + from tqdm import tqdm + chunk_iterator = tqdm(self.chunks, desc="Writing passages", unit="chunk") + except ImportError: + chunk_iterator = self.chunks + + for chunk in chunk_iterator: offset = f.tell() json.dump( { diff --git a/packages/leann-core/src/leann/embedding_server_manager.py b/packages/leann-core/src/leann/embedding_server_manager.py index 303adac..2022262 100644 --- a/packages/leann-core/src/leann/embedding_server_manager.py +++ b/packages/leann-core/src/leann/embedding_server_manager.py @@ -175,7 +175,7 @@ class EmbeddingServerManager: self.backend_module_name = backend_module_name self.server_process: Optional[subprocess.Popen] = None self.server_port: Optional[int] = None - # atexit.register(self.stop_server) + atexit.register(self.stop_server) def start_server(self, port: int, model_name: str, embedding_mode: str = "sentence-transformers", **kwargs) -> bool: """ diff --git a/test/simple_mac_tpt_test.py b/test/simple_mac_tpt_test.py index 6aaac13..2b84cb4 100644 --- a/test/simple_mac_tpt_test.py +++ b/test/simple_mac_tpt_test.py @@ -264,7 +264,7 @@ def run_mlx_benchmark(): } config = BenchmarkConfig( - model_path="mlx-community/Qwen3-Embedding-0.6B-4bit-DWQ", + model_path="mlx-community/all-MiniLM-L6-v2-4bit", use_mlx=True )