Merge branch 'main' into readme-polish

2025-07-19 21:47:17 -07:00
parent 6e755f0402 e728449b8f
commit f83c97e6d1
11 changed files with 225 additions and 9667 deletions
@@ -363,6 +363,28 @@ If you find Leann useful, please cite:
 }
 ```

+## ✨ Features
+
+### 🔥 Core Features
+
+- **🔄 Real-time Embeddings** - Eliminate heavy embedding storage by computing embeddings on demand, using optimized ZMQ servers, a highly optimized embedding engine, and a search paradigm that overlaps and batches work
+- **📈 Scalable Architecture** - Handles millions of documents on consumer hardware; the larger your dataset, the more LEANN can save
+- **🎯 Graph Pruning** - Advanced techniques that shrink the storage overhead of vector search to a small footprint
+- **🏗️ Pluggable Backends** - DiskANN, HNSW/FAISS with unified API
+
+### 🛠️ Technical Highlights
+- **🔄 Recompute Mode** - Targets the highest-accuracy scenarios while eliminating vector storage overhead
+- **⚡ Zero-copy Operations** - Minimize IPC overhead by transferring distances instead of embeddings
+- **🚀 High-throughput Embedding Pipeline** - Optimized batched processing for maximum efficiency
+- **🎯 Two-level Search** - Novel coarse-to-fine search overlap for accelerated query processing (optional)
+- **💾 Memory-mapped Indices** - Fast startup with raw text mapping to reduce memory overhead
+- **🚀 MLX Support** - Ultra-fast recompute/build with quantized embedding models, accelerating building and search ([minimal example](test/build_mlx_index.py))
+
+### 🎨 Developer Experience
+
+- **Simple Python API** - Get started in minutes
+- **Extensible backend system** - Easy to add new algorithms
+- **Comprehensive examples** - From basic usage to production deployment

 ## 🤝 Contributing

@@ -190,16 +190,16 @@ class WeChatHistoryReader(BaseReader):
        
        return False
    
-    def _concatenate_messages(self, messages: List[Dict], min_length: int = 128, max_length: int = 1000, 
-                             time_window_minutes: int = 30) -> List[Dict]:
+    def _concatenate_messages(self, messages: List[Dict], max_length: int = 128, 
+                             time_window_minutes: int = 30, overlap_messages: int = 0) -> List[Dict]:
        """
        Concatenate messages based on length and time rules.
        
        Args:
            messages: List of message dictionaries
-            min_length: Minimum length for concatenated message groups
            max_length: Maximum length for concatenated message groups
            time_window_minutes: Time window in minutes to group messages together
+            overlap_messages: Number of messages to overlap between consecutive groups
            
        Returns:
            List of concatenated message groups
@@ -235,37 +235,46 @@ class WeChatHistoryReader(BaseReader):
                time_diff_minutes = (create_time - last_timestamp) / 60
                if time_diff_minutes > time_window_minutes:
                    # Time gap too large, start new group
-                    if current_group and current_length >= min_length:
+                    if current_group:
                        concatenated_groups.append({
                            'messages': current_group,
                            'total_length': current_length,
                            'start_time': current_group[0].get('createTime', 0),
                            'end_time': current_group[-1].get('createTime', 0)
                        })
-                    current_group = []
-                    current_length = 0
+                        # Keep last few messages for overlap
+                        if overlap_messages > 0 and len(current_group) > overlap_messages:
+                            current_group = current_group[-overlap_messages:]
+                            current_length = sum(len(self._extract_readable_text(msg.get('content', '')) or msg.get('message', '')) for msg in current_group)
+                        else:
+                            current_group = []
+                            current_length = 0
            
            # Check length constraint
            message_length = len(readable_text)
            if current_length + message_length > max_length and current_group:
                # Current group would exceed max length, save it and start new
-                if current_length >= min_length:
-                    concatenated_groups.append({
-                        'messages': current_group,
-                        'total_length': current_length,
-                        'start_time': current_group[0].get('createTime', 0),
-                        'end_time': current_group[-1].get('createTime', 0)
-                    })
-                current_group = []
-                current_length = 0
+                concatenated_groups.append({
+                    'messages': current_group,
+                    'total_length': current_length,
+                    'start_time': current_group[0].get('createTime', 0),
+                    'end_time': current_group[-1].get('createTime', 0)
+                })
+                # Keep last few messages for overlap
+                if overlap_messages > 0 and len(current_group) > overlap_messages:
+                    current_group = current_group[-overlap_messages:]
+                    current_length = sum(len(self._extract_readable_text(msg.get('content', '')) or msg.get('message', '')) for msg in current_group)
+                else:
+                    current_group = []
+                    current_length = 0
            
            # Add message to current group
            current_group.append(message)
            current_length += message_length
            last_timestamp = create_time
        
-        # Add the last group if it meets minimum length
-        if current_group and current_length >= min_length:
+        # Add the last group if it exists
+        if current_group:
            concatenated_groups.append({
                'messages': current_group,
                'total_length': current_length,
@@ -343,6 +352,12 @@ Contact: {contact_name}
 Time Range: {start_time_str} - {end_time_str}
 Messages ({len(messages)} messages, {message_group['total_length']} chars):

+{concatenated_text}
+"""
+        
+        doc_content = f"""
+Contact: {contact_name}
+
 {concatenated_text}
 """
        return doc_content
@@ -358,16 +373,15 @@ Messages ({len(messages)} messages, {message_group['total_length']} chars):
                wechat_export_dir (str): Custom path to WeChat export directory.
                include_non_text (bool): Whether to include non-text messages (images, emojis, etc.)
                concatenate_messages (bool): Whether to concatenate messages based on length rules.
-                min_length (int): Minimum length for concatenated message groups (default: 128).
                max_length (int): Maximum length for concatenated message groups (default: 1000).
                time_window_minutes (int): Time window in minutes to group messages together (default: 30).
+                overlap_messages (int): Number of messages to overlap between consecutive groups (the concatenation call below currently passes a fixed value of 2).
        """
        docs: List[Document] = []
        max_count = load_kwargs.get('max_count', 1000)
        wechat_export_dir = load_kwargs.get('wechat_export_dir', None)
        include_non_text = load_kwargs.get('include_non_text', False)
        concatenate_messages = load_kwargs.get('concatenate_messages', False)
-        min_length = load_kwargs.get('min_length', 128)
        max_length = load_kwargs.get('max_length', 1000)
        time_window_minutes = load_kwargs.get('time_window_minutes', 30)
        
@@ -417,9 +431,9 @@ Messages ({len(messages)} messages, {message_group['total_length']} chars):
                        # Concatenate messages based on rules
                        message_groups = self._concatenate_messages(
                            readable_messages, 
-                            min_length=min_length, 
                            max_length=max_length, 
-                            time_window_minutes=time_window_minutes
+                            time_window_minutes=time_window_minutes,
+                            overlap_messages=2  # Keep 2 messages overlap between groups
                        )
                        
                        # Create documents from concatenated groups
@@ -52,7 +52,7 @@ def create_leann_index_from_multiple_wechat_exports(
                documents = reader.load_data(
                    wechat_export_dir=str(export_dir),
                    max_count=max_count,
-                    concatenate_messages=False,  # Disable concatenation - one message per document
+                    concatenate_messages=True,  # Enable concatenation - group multiple messages into each document
                )
                if documents:
                    print(f"Loaded {len(documents)} chat documents from {export_dir}")
@@ -222,9 +222,9 @@ async def query_leann_index(index_path: str, query: str):
    print(f"You: {query}")
    chat_response = chat.ask(
        query,
-        top_k=5,
+        top_k=20,
        recompute_beighbor_embeddings=True,
-        complexity=32,
+        complexity=64,
        beam_width=1,
        llm_config={
            "type": "openai",
@@ -252,7 +252,7 @@ async def main():
    parser.add_argument(
        "--index-dir",
        type=str,
-        default="./wechat_history_index_leann_test",
+        default="./wechat_history_june19_test",
        help="Directory to store the LEANN index (default: ./wechat_history_index_leann_test)",
    )
    parser.add_argument(
@@ -175,13 +175,13 @@ def create_embedding_server_thread(
    enable_warmup: bool = False,
 ):
    """
-    在当前线程中创建并运行 embedding server
-    这个函数设计为在单独的线程中调用
+    Create and run embedding server in the current thread
+    This function is designed to be called in a separate thread
    """
    logger.info(f"Initializing embedding server thread on port {zmq_port}")
    
    try:
-        # 检查端口是否已被占用
+        # Check if port is already occupied
        import socket
        def check_port(port):
            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
@@ -212,11 +212,11 @@ def create_embedding_server_thread(
            cuda_available = False
            mps_available = False
        elif embedding_mode == "sentence-transformers":
-            # 初始化模型
+            # Initialize model
            tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
            import torch

-            # 选择设备
+            # Select device
            mps_available = hasattr(torch.backends, 'mps') and torch.backends.mps.is_available()
            cuda_available = torch.cuda.is_available()
            
@@ -230,11 +230,11 @@ def create_embedding_server_thread(
                device = torch.device("cpu")
                logger.info("Using CPU device")
            
-            # 加载模型
+            # Load model
            logger.info(f"Loading model {model_name}")
            model = AutoModel.from_pretrained(model_name).to(device).eval()

-            # 优化模型
+            # Optimize model
            if cuda_available or mps_available:
                try:
                    model = model.half()
@@ -324,7 +324,7 @@ def create_embedding_server_thread(
                print(f"Error during Protobuf ZMQ warmup: {e}")

        class DeviceTimer:
-            """设备计时器"""
+            """Device timer"""
            def __init__(self, name="", device=device):
                self.name = name
                self.device = device
@@ -369,60 +369,63 @@ def create_embedding_server_thread(
                    return self.end_time - self.start_time

            def print_elapsed(self):
-                print(f"Time taken for {self.name}: {self.elapsed_time():.6f} seconds")
+                elapsed = self.elapsed_time()
+                print(f"[{self.name}] Elapsed time: {elapsed:.3f}s")

        def process_batch_pytorch(texts_batch, ids_batch, missing_ids):
-            """处理文本批次"""
-            batch_size = len(texts_batch)
-            logger.info(f"Processing batch of size {batch_size}")
+            """Process text batch"""
+            if not texts_batch:
+                return np.array([])

-            tokenize_timer = DeviceTimer("tokenization (batch)", device)
-            to_device_timer = DeviceTimer("transfer to device (batch)", device)
-            embed_timer = DeviceTimer("embedding (batch)", device)
-            pool_timer = DeviceTimer("mean pooling (batch)", device)
+            # Filter out empty texts and their corresponding IDs
+            valid_texts = []
+            valid_ids = []
+            for i, text in enumerate(texts_batch):
+                if text.strip():  # Only include non-empty texts
+                    valid_texts.append(text)
+                    valid_ids.append(ids_batch[i])

-            with tokenize_timer.timing():
-                encoded_batch = tokenizer.batch_encode_plus(
-                    texts_batch,
-                    padding="max_length",
+            if not valid_texts:
+                print("WARNING: No valid texts in batch")
+                return np.array([])
+
+            # Tokenize
+            token_timer = DeviceTimer("tokenization")
+            with token_timer.timing():
+                inputs = tokenizer(
+                    valid_texts,
+                    padding=True,
                    truncation=True,
-                    max_length=256,
-                    return_tensors="pt",
-                    return_token_type_ids=False,
-                )
-            tokenize_timer.print_elapsed()
+                    max_length=512,
+                    return_tensors="pt"
+                ).to(device)

-            seq_length = encoded_batch["input_ids"].size(1)
-            print(f"Batch size: {batch_size}, Sequence length: {seq_length}")
-
-            with to_device_timer.timing():
-                enc = {k: v.to(device) for k, v in encoded_batch.items()}
-            to_device_timer.print_elapsed()
-
-            with torch.no_grad():
-                with embed_timer.timing():
-                    out = model(enc["input_ids"], enc["attention_mask"])
-                embed_timer.print_elapsed()
-
-                with pool_timer.timing():
-                    hidden_states = out.last_hidden_state if hasattr(out, "last_hidden_state") else out
-                    mask_expanded = enc["attention_mask"].unsqueeze(-1).expand(hidden_states.size()).float()
+            # Compute embeddings
+            embed_timer = DeviceTimer("embedding computation")
+            with embed_timer.timing():
+                with torch.no_grad():
+                    outputs = model(**inputs)
+                    hidden_states = outputs.last_hidden_state
+                    
+                    # Mean pooling
+                    attention_mask = inputs['attention_mask']
+                    mask_expanded = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
                    sum_embeddings = torch.sum(hidden_states * mask_expanded, 1)
                    sum_mask = torch.clamp(mask_expanded.sum(1), min=1e-9)
                    batch_embeddings = sum_embeddings / sum_mask
-                pool_timer.print_elapsed()
+                embed_timer.print_elapsed()

            return batch_embeddings.cpu().numpy()

-        # ZMQ server 主循环 - 修改为REP套接字
+        # ZMQ server main loop - uses a ROUTER socket
        context = zmq.Context()
-        socket = context.socket(zmq.ROUTER)  # 改为REP套接字
+        socket = context.socket(zmq.ROUTER)  # ROUTER socket (replies carry the client identity frame)
        socket.bind(f"tcp://127.0.0.1:{zmq_port}")
        print(f"INFO: ZMQ ROUTER server listening on port {zmq_port}")

-        # 设置超时
-        socket.setsockopt(zmq.RCVTIMEO, 5000)  # 5秒接收超时
-        socket.setsockopt(zmq.SNDTIMEO, 300000)  # 300秒发送超时
+        # Set timeouts
+        socket.setsockopt(zmq.RCVTIMEO, 5000)  # 5 second receive timeout
+        socket.setsockopt(zmq.SNDTIMEO, 300000)  # 300 second send timeout

        from . import embedding_pb2

@@ -442,18 +445,18 @@ def create_embedding_server_thread(
            try:
                parts = socket.recv_multipart()

-                # --- 恢复稳健的消息格式判断 ---
-                # 必须检查 parts 的长度，避免 IndexError
+                # --- Restore robust message format detection ---
+                # Must check parts length to avoid IndexError
                if len(parts) >= 3:
                    identity = parts[0]
-                    # empty = parts[1]  # 中间的空帧我们通常不关心
+                    # empty = parts[1]  # We usually don't care about the middle empty frame
                    message = parts[2]
                elif len(parts) == 2:
-                    # 也能处理没有空帧的情况
+                    # Can also handle cases without empty frame
                    identity = parts[0]
                    message = parts[1]
                else:
-                    # 如果收到格式错误的消息，打印警告并忽略它，而不是崩溃
+                    # If the received message is malformed, print a warning and ignore it instead of crashing
                    print(f"WARNING: Received unexpected message format with {len(parts)} parts. Ignoring.")
                    continue
                print(f"INFO: Received ZMQ request from client {identity.hex()[:8]}, size {len(message)} bytes")
@@ -555,17 +558,17 @@ def create_embedding_server_thread(
                e2e_start = time.time()
                lookup_timer = DeviceTimer("text lookup")

-                # 解析请求
+                # Parse request
                req_proto = embedding_pb2.NodeEmbeddingRequest()
                req_proto.ParseFromString(message)
                node_ids = req_proto.node_ids
                print(f"INFO: Request for {len(node_ids)} node embeddings: {list(node_ids)}")

-                # 添加调试信息
+                # Add debug information
                if len(node_ids) > 0:
                    print(f"DEBUG: Node ID range: {min(node_ids)} to {max(node_ids)}")
                
-                # 查找文本
+                # Look up texts
                texts = []
                missing_ids = []
                with lookup_timer.timing():
@@ -575,8 +578,8 @@ def create_embedding_server_thread(
                        if txt:
                            texts.append(txt)
                        else:
-                            # 如果文本为空，我们仍然需要一个占位符来进行批处理，
-                            # 但将其ID记录为缺失
+                            # If text is empty, we still need a placeholder for batch processing,
+                            # but record its ID as missing
                            texts.append("") 
                            missing_ids.append(nid)
                lookup_timer.print_elapsed()
@@ -584,7 +587,7 @@ def create_embedding_server_thread(
                if missing_ids:
                    print(f"WARNING: Missing passages for IDs: {missing_ids}")

-                # 处理批次
+                # Process batch
                total_size = len(texts)
                print(f"INFO: Total batch size: {total_size}, max_batch_size: {max_batch_size}")
                
@@ -600,7 +603,7 @@ def create_embedding_server_thread(
                        chunk_ids = node_ids[i:end_idx]
                        
                        if embedding_mode == "mlx":
-                            embeddings_chunk = compute_embeddings_mlx(chunk_texts, model_name)
+                            embeddings_chunk = compute_embeddings_mlx(chunk_texts, model_name, batch_size=16)
                        elif embedding_mode == "openai":
                            embeddings_chunk = compute_embeddings_openai(chunk_texts, model_name)
                        else:  # sentence-transformers
@@ -617,13 +620,13 @@ def create_embedding_server_thread(
                    print(f"INFO: Combined embeddings shape: {hidden.shape}")
                else:
                    if embedding_mode == "mlx":
-                        hidden = compute_embeddings_mlx(texts, model_name)
+                        hidden = compute_embeddings_mlx(texts, model_name, batch_size=16)
                    elif embedding_mode == "openai":
                        hidden = compute_embeddings_openai(texts, model_name)
                    else:  # sentence-transformers
                        hidden = process_batch_pytorch(texts, node_ids, missing_ids)

-                # 序列化响应
+                # Serialize response
                ser_start = time.time()

                resp_proto = embedding_pb2.NodeEmbeddingResponse()
@@ -635,7 +638,7 @@ def create_embedding_server_thread(

                response_data = resp_proto.SerializeToString()
                
-                # REP 套接字发送单个响应
+                # Send a single multipart reply: [identity, empty frame, payload]
                socket.send_multipart([identity, b'', response_data])

                ser_end = time.time()
@@ -656,11 +659,11 @@ def create_embedding_server_thread(
            except Exception as e:
                print(f"ERROR: Error in ZMQ server: {e}")
                try:
-                    # 发送空响应以维持REQ-REP状态
+                    # Send empty response to maintain REQ-REP state
                    empty_resp = embedding_pb2.NodeEmbeddingResponse()
                    socket.send(empty_resp.SerializeToString())
                except:
-                    # 如果发送失败，重新创建socket
+                    # If sending fails, recreate socket
                    socket.close()
                    socket = context.socket(zmq.REP)
                    socket.bind(f"tcp://127.0.0.1:{zmq_port}")
@@ -423,7 +423,7 @@ def create_hnsw_embedding_server(
            from leann.api import compute_embeddings

            # Compute embeddings using MLX
-            embeddings = compute_embeddings(texts_batch, model_name, use_mlx=True)
+            embeddings = compute_embeddings(texts_batch, model_name, mode="mlx", use_server=False)

            print(
                f"[leann_backend_hnsw.hnsw_embedding_server LOG]: MLX embeddings computed for {len(texts_batch)} texts"
@@ -11,7 +11,8 @@ requires-python = ">=3.9"
 license = { text = "MIT" }

 dependencies = [
-    "numpy>=1.20.0"
+    "numpy>=1.20.0",
+    "tqdm>=4.60.0"
 ]

 [tool.setuptools.packages.find]
@@ -22,6 +22,7 @@ def compute_embeddings(
    model_name: str,
    mode: str = "sentence-transformers",
    use_server: bool = True,
+    use_mlx: bool = False  # Backward compatibility: if True, override mode to 'mlx',
 ) -> np.ndarray:
    """
    Computes embeddings using different backends.
@@ -38,12 +39,16 @@ def compute_embeddings(
    Returns:
        numpy array of embeddings
    """
+    # Override mode for backward compatibility
+    if use_mlx:
+        mode = "mlx"
+
    # Auto-detect mode based on model name if not explicitly set
    if mode == "sentence-transformers" and model_name.startswith("text-embedding-"):
        mode = "openai"

    if mode == "mlx":
-        return compute_embeddings_mlx(chunks, model_name)
+        return compute_embeddings_mlx(chunks, model_name, batch_size=16)
    elif mode == "openai":
        return compute_embeddings_openai(chunks, model_name)
    elif mode == "sentence-transformers":
@@ -158,7 +163,7 @@ def _compute_embeddings_sentence_transformers_direct(
    # Generate embeddings
    # An OOM here means the batch size needs to be turned down
    embeddings = model.encode(
-        chunks, convert_to_numpy=True, show_progress_bar=True, batch_size=8
+        chunks, convert_to_numpy=True, show_progress_bar=True, batch_size=16
    )

    return embeddings
@@ -188,13 +193,19 @@ def compute_embeddings_openai(chunks: List[str], model_name: str) -> np.ndarray:
    # OpenAI has a limit on batch size and input length
    max_batch_size = 100  # Conservative batch size
    all_embeddings = []
-
-    for i in range(0, len(chunks), max_batch_size):
-        batch_chunks = chunks[i : i + max_batch_size]
-        print(
-            f"INFO: Processing batch {i // max_batch_size + 1}/{(len(chunks) + max_batch_size - 1) // max_batch_size}"
-        )
-
+    
+    try:
+        from tqdm import tqdm
+        total_batches = (len(chunks) + max_batch_size - 1) // max_batch_size
+        batch_range = range(0, len(chunks), max_batch_size)
+        batch_iterator = tqdm(batch_range, desc="Computing embeddings", unit="batch", total=total_batches)
+    except ImportError:
+        # Fallback without progress bar
+        batch_iterator = range(0, len(chunks), max_batch_size)
+    
+    for i in batch_iterator:
+        batch_chunks = chunks[i:i + max_batch_size]
+        
        try:
            response = client.embeddings.create(model=model_name, input=batch_chunks)
            batch_embeddings = [embedding.embedding for embedding in response.data]
@@ -210,42 +221,64 @@ def compute_embeddings_openai(chunks: List[str], model_name: str) -> np.ndarray:
    return embeddings


-def compute_embeddings_mlx(chunks: List[str], model_name: str) -> np.ndarray:
+def compute_embeddings_mlx(chunks: List[str], model_name: str, batch_size: int = 16) -> np.ndarray:
    """Computes embeddings using an MLX model."""
    try:
        import mlx.core as mx
        from mlx_lm.utils import load
+        from tqdm import tqdm
    except ImportError as e:
        raise RuntimeError(
            "MLX or related libraries not available. Install with: uv pip install mlx mlx-lm"
        ) from e

    print(
-        f"INFO: Computing embeddings for {len(chunks)} chunks using MLX model '{model_name}'..."
+        f"INFO: Computing embeddings for {len(chunks)} chunks using MLX model '{model_name}' with batch_size={batch_size}..."
    )

    # Load model and tokenizer
    model, tokenizer = load(model_name)

-    # Process each chunk
+    # Process chunks in batches with progress bar
    all_embeddings = []
-    for chunk in chunks:
-        # Tokenize
-        token_ids = tokenizer.encode(chunk)  # type: ignore
+    
+    try:
+        from tqdm import tqdm
+        batch_iterator = tqdm(range(0, len(chunks), batch_size), desc="Computing embeddings", unit="batch")
+    except ImportError:
+        batch_iterator = range(0, len(chunks), batch_size)
+    
+    for i in batch_iterator:
+        batch_chunks = chunks[i:i + batch_size]
+        
+        # Tokenize all chunks in the batch
+        batch_token_ids = []
+        for chunk in batch_chunks:
+            token_ids = tokenizer.encode(chunk)  # type: ignore
+            batch_token_ids.append(token_ids)
+        
+        # Pad sequences to the same length for batch processing
+        max_length = max(len(ids) for ids in batch_token_ids)
+        padded_token_ids = []
+        for token_ids in batch_token_ids:
+            # Pad with 0 (tokenizer.pad_token_id is not consulted here)
+            padded = token_ids + [0] * (max_length - len(token_ids))
+            padded_token_ids.append(padded)
+        
+        # Convert to MLX array with batch dimension
+        input_ids = mx.array(padded_token_ids)

-        # Convert to MLX array and add batch dimension
-        input_ids = mx.array([token_ids])
-
-        # Get embeddings
+        # Get embeddings for the batch
        embeddings = model(input_ids)

-        # Mean pooling (since we only have one sequence, just take the mean)
-        pooled = embeddings.mean(axis=1)  # Shape: (1, hidden_size)
+        # Mean pooling for each sequence in the batch
+        pooled = embeddings.mean(axis=1)  # Shape: (batch_size, hidden_size)

-        # Convert individual embedding to numpy via list (to handle bfloat16)
-        pooled_list = pooled[0].tolist()  # Remove batch dimension and convert to list
-        pooled_numpy = np.array(pooled_list, dtype=np.float32)
-        all_embeddings.append(pooled_numpy)
+        # Convert batch embeddings to numpy
+        for j in range(len(batch_chunks)):
+            pooled_list = pooled[j].tolist()  # Convert to list
+            pooled_numpy = np.array(pooled_list, dtype=np.float32)
+            all_embeddings.append(pooled_numpy)

    # Stack numpy arrays
    return np.stack(all_embeddings)
@@ -311,6 +344,8 @@ class LeannBuilder:
        self.dimensions = dimensions
        self.embedding_mode = embedding_mode
        self.backend_kwargs = backend_kwargs
+        if 'mlx' in self.embedding_model:
+            self.embedding_mode = "mlx"
        self.chunks: List[Dict[str, Any]] = []

    def add_text(self, text: str, metadata: Optional[Dict[str, Any]] = None):
@@ -340,7 +375,13 @@ class LeannBuilder:
        offset_file = index_dir / f"{index_name}.passages.idx"
        offset_map = {}
        with open(passages_file, "w", encoding="utf-8") as f:
-            for chunk in self.chunks:
+            try:
+                from tqdm import tqdm
+                chunk_iterator = tqdm(self.chunks, desc="Writing passages", unit="chunk")
+            except ImportError:
+                chunk_iterator = self.chunks
+            
+            for chunk in chunk_iterator:
                offset = f.tell()
                json.dump(
                    {
@@ -175,7 +175,7 @@ class EmbeddingServerManager:
        self.backend_module_name = backend_module_name
        self.server_process: Optional[subprocess.Popen] = None
        self.server_port: Optional[int] = None
-        # atexit.register(self.stop_server)
+        atexit.register(self.stop_server)

    def start_server(self, port: int, model_name: str, embedding_mode: str = "sentence-transformers", **kwargs) -> bool:
        """
@@ -23,7 +23,7 @@ g++ ./demo_reader.cpp -o ./demo_reader && ./demo_reader --stats \
  f.read(reinterpret_cast<char *>(&val), sizeof(uint32_t))
 #define SECTOR_SIZE 4096

-// 辅助：获取文件大小
+// Helper: Get file size
 static size_t get_file_size(const std::string &fname) {
  std::ifstream ifs(fname, std::ios::binary | std::ios::ate);
  if (ifs.fail() || !ifs.is_open()) {
@@ -32,7 +32,7 @@ static size_t get_file_size(const std::string &fname) {
  return static_cast<size_t>(ifs.tellg());
 }

-// 打印 sector 的前若干 hex，用于debug
+// Print first few hex of sector for debug
 static void print_hex(const char *buf, size_t len, size_t max_len = 64) {
  size_t show_len = (len < max_len) ? len : max_len;
  for (size_t i = 0; i < show_len; i++) {
@@ -46,19 +46,19 @@ static void print_hex(const char *buf, size_t len, size_t max_len = 64) {
 }

 /*
-  修正后的 demo_reader:
-  1) 从 partition.bin 读:
+  Corrected demo_reader:
+  1) Read from partition.bin:
      - C, partition_nums, nd
-      - graph_partitions[i]: 分区 i 的所有 nodeID
+      - graph_partitions[i]: all nodeIDs in partition i
      - id2partition[nodeID]: nodeID => partition i
-  2) 从 _disk_graph.index 读:
-      a) sector0 里先有 2个 int: meta_n, meta_dim
-      b) 再有 meta_n个 uint64_t
-         例如: [0]=nd, [1]=dim, [2]=??, [3]=max_node_len, [4]=C, [5]..??,
-  [8]=file_size... 具体位置要结合 relayout 的写法 c) graph_node_len =
-  max_node_len - dim_in_meta*sizeof(float) 3) 用户给定 target_node_id =>
+  2) Read from _disk_graph.index:
+      a) sector0 first has 2 ints: meta_n, meta_dim
+      b) then meta_n uint64_t
+         e.g.: [0]=nd, [1]=dim, [2]=??, [3]=max_node_len, [4]=C, [5]..??,
+         [8]=file_size... (the exact positions depend on how relayout writes them)
+      c) graph_node_len = max_node_len - dim_in_meta*sizeof(float)
+  3) Given a user-specified target_node_id =>
      partition_id= id2partition[node_id]
-      在 graph_partitions[partition_id] 里找 node 的下标 j
+      find node index j in graph_partitions[partition_id]
      offset = (partition_id+1)*4096 => sector
      adjacency_offset= j*graph_node_len => neighbor_count => neighbors
 */
@@ -105,7 +105,7 @@ int main(int argc, char **argv) {
              << "\n";
  }

-  // 1) 读取 partition.bin
+  // 1) Read partition.bin
  std::ifstream pf(partition_bin, std::ios::binary);
  if (!pf.is_open()) {
    std::cerr << "Cannot open partition.bin: " << partition_bin << std::endl;
@@ -119,8 +119,8 @@ int main(int argc, char **argv) {
            << ", partition_nums=" << partition_nums << ", nd=" << nd
            << std::endl;

-  // 读取分区节点列表
-  std::vector<std::vector<uint32_t>> graph_partitions(partition_nums);
+  // Read partition node lists
+  std::vector<std::vector<uint32_t> > graph_partitions(partition_nums);
  for (uint64_t i = 0; i < partition_nums; i++) {
    uint32_t psize;
    READ_U32(pf, psize);
@@ -128,7 +128,7 @@ int main(int argc, char **argv) {
    pf.read(reinterpret_cast<char *>(graph_partitions[i].data()),
            psize * sizeof(uint32_t));
  }
-  // 读取 _id2partition[node], 大小= nd
+  // Read _id2partition[node], size= nd
  std::vector<uint32_t> id2partition(nd);
  pf.read(reinterpret_cast<char *>(id2partition.data()), nd * sizeof(uint32_t));
  pf.close();
@@ -140,23 +140,23 @@ int main(int argc, char **argv) {
    return 1;
  }

-  // 2) 解析 _disk_graph.index
+  // 2) Parse _disk_graph.index
  std::ifstream gf(graph_index, std::ios::binary);
  if (!gf.is_open()) {
    std::cerr << "Cannot open disk_graph.index: " << graph_index << std::endl;
    return 1;
  }
-  // (a) sector0 => 先读 2个 int
+  // (a) sector0 => first read 2 ints
  int meta_n, meta_dim;
  gf.read((char *)&meta_n, sizeof(int));
  gf.read((char *)&meta_dim, sizeof(int));
  std::cout << "[debug] meta_n=" << meta_n << ", meta_dim=" << meta_dim << "\n";

-  // (b) 读 meta_n个 uint64_t
+  // (b) Read meta_n uint64_t
  std::vector<uint64_t> meta_info(meta_n);
  gf.read(reinterpret_cast<char *>(meta_info.data()),
          meta_n * sizeof(uint64_t));
-  // 打印
+  // Print
  for (int i = 0; i < meta_n; i++) {
    std::cout << " meta_info[" << i << "]= " << meta_info[i] << "\n";
  }
@@ -164,11 +164,11 @@ int main(int argc, char **argv) {
  size_t file_size = get_file_size(graph_index);
  std::cout << "[disk_graph.index size] " << file_size << " bytes\n";

-  // **根据 relayout log** 你说: meta_info[0]=nd=60450220, meta_info[1]=dim=769,
+  // **According to relayout log** you said: meta_info[0]=nd=60450220, meta_info[1]=dim=769,
  //    meta_info[2]=??(16495248?), meta_info[3]=max_node_len=3320,
  //    meta_info[4]=16 (C),
-  //    meta_info[8]= 15475261440(文件大小)
-  // 我们这里先手动解析:
+  //    meta_info[8]= 15475261440(file size)
+  // We manually parse here first:
  uint64_t nd_in_meta = meta_info[0];
  uint64_t dim_in_meta = meta_info[1];
  uint64_t max_node_len = meta_info[3];
@@ -182,7 +182,7 @@ int main(int argc, char **argv) {
            << ", c_in_meta= " << c_in_meta
            << ", entire_file_size= " << entire_file_sz << "\n";

-  // 计算 graph_node_len
+  // Calculate graph_node_len
  uint64_t dim_size = dim_in_meta * sizeof(float);
  uint64_t graph_node_len = max_node_len - dim_size;
  std::cout << " => graph_node_len= " << graph_node_len << "\n\n";
@@ -305,7 +305,7 @@ int main(int argc, char **argv) {
      // Error check pf_again if needed
    }

-    // 3) 找 target_node_id => partition_id => subIndex
+    // 3) Find target_node_id => partition_id => subIndex
    uint32_t partition_id = id2partition[target_node_id];
    if (partition_id >= partition_nums) {
      std::cerr << "Partition ID out-of-range for target node.\n";
@@ -264,7 +264,7 @@ def run_mlx_benchmark():
        }
    
    config = BenchmarkConfig(
-        model_path="mlx-community/Qwen3-Embedding-0.6B-4bit-DWQ",
+        model_path="mlx-community/all-MiniLM-L6-v2-4bit",
        use_mlx=True
    )