Initial commit

yichuan520030910320
2025-06-30 09:05:05 +00:00
commit 46f6cc100b
1231 changed files with 278432 additions and 0 deletions

View File

@@ -0,0 +1,12 @@
# Final simplified version
cmake_minimum_required(VERSION 3.24)
project(leann_backend_hnsw_wrapper)
set(FAISS_ENABLE_PYTHON ON CACHE BOOL "" FORCE)
set(FAISS_ENABLE_GPU OFF CACHE BOOL "" FORCE)
set(FAISS_ENABLE_EXTRAS OFF CACHE BOOL "" FORCE)
set(BUILD_TESTING OFF CACHE BOOL "" FORCE)
set(FAISS_ENABLE_C_API OFF CACHE BOOL "" FORCE)
set(FAISS_OPT_LEVEL "generic" CACHE STRING "" FORCE)
add_subdirectory(third_party/faiss)
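# For reference, a manual out-of-tree configure/build of this wrapper would
# look roughly like the following (scikit-build-core normally drives this
# during `pip install`, so the commands are illustrative only):
#   cmake -S . -B build -DCMAKE_BUILD_TYPE=Release
#   cmake --build build -j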

View File

@@ -0,0 +1 @@
from . import hnsw_backend

View File

@@ -0,0 +1,313 @@
import numpy as np
import os
import json
import struct
from pathlib import Path
from typing import Any, Dict
import contextlib
import threading
import time
import atexit
import socket
import subprocess
import sys
# File: packages/leann-backend-hnsw/leann_backend_hnsw/hnsw_backend.py
# ... (other imports unchanged) ...
from leann.registry import register_backend
from leann.interface import (
LeannBackendFactoryInterface,
LeannBackendBuilderInterface,
LeannBackendSearcherInterface
)
def get_metric_map():
from . import faiss
return {
"mips": faiss.METRIC_INNER_PRODUCT,
"l2": faiss.METRIC_L2,
"cosine": faiss.METRIC_INNER_PRODUCT, # Will need normalization
}
def _check_port(port: int) -> bool:
"""Check if a port is in use"""
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
return s.connect_ex(('localhost', port)) == 0
class HNSWEmbeddingServerManager:
"""
HNSW-specific embedding server manager that handles the lifecycle of the embedding server process.
Mirrors the DiskANN EmbeddingServerManager architecture.
"""
def __init__(self):
self.server_process = None
self.server_port = None
atexit.register(self.stop_server)
def start_server(self, port=5556, model_name="sentence-transformers/all-mpnet-base-v2", passages_file=None):
"""
Start the HNSW embedding server process.
Args:
port: ZMQ port for the server
model_name: Name of the embedding model to use
passages_file: Optional path to passages JSON file
"""
if self.server_process and self.server_process.poll() is None:
print(f"INFO: Reusing existing HNSW server process for this session (PID {self.server_process.pid})")
return True
# Check if port is already in use
if _check_port(port):
print(f"WARNING: Port {port} is already in use. Assuming an external HNSW server is running and connecting to it.")
return True
print(f"INFO: Starting session-level HNSW embedding server as a background process...")
try:
command = [
sys.executable,
"-m", "packages.leann-backend-hnsw.src.leann_backend_hnsw.hnsw_embedding_server",
"--zmq-port", str(port),
"--model-name", model_name
]
# Add passages file if provided
if passages_file:
command.extend(["--passages-file", str(passages_file)])
project_root = Path(__file__).parent.parent.parent.parent
print(f"INFO: Running HNSW command from project root: {project_root}")
self.server_process = subprocess.Popen(
command,
cwd=project_root,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
encoding='utf-8'
)
self.server_port = port
print(f"INFO: HNSW server process started with PID: {self.server_process.pid}")
max_wait, wait_interval = 30, 0.5
for _ in range(int(max_wait / wait_interval)):
if _check_port(port):
print(f"✅ HNSW embedding server is up and ready for this session.")
log_thread = threading.Thread(target=self._log_monitor, daemon=True)
log_thread.start()
return True
if self.server_process.poll() is not None:
print("❌ ERROR: HNSW server process terminated unexpectedly during startup.")
self._log_monitor()
return False
time.sleep(wait_interval)
print(f"❌ ERROR: HNSW server process failed to start listening within {max_wait} seconds.")
self.stop_server()
return False
except Exception as e:
print(f"❌ ERROR: Failed to start HNSW embedding server process: {e}")
return False
def _log_monitor(self):
"""Monitor server logs"""
if not self.server_process:
return
try:
if self.server_process.stdout:
for line in iter(self.server_process.stdout.readline, ''):
print(f"[HNSWEmbeddingServer LOG]: {line.strip()}")
self.server_process.stdout.close()
if self.server_process.stderr:
for line in iter(self.server_process.stderr.readline, ''):
print(f"[HNSWEmbeddingServer ERROR]: {line.strip()}")
self.server_process.stderr.close()
except Exception as e:
print(f"HNSW Log monitor error: {e}")
def stop_server(self):
"""Stop the HNSW embedding server process"""
if self.server_process and self.server_process.poll() is None:
print(f"INFO: Terminating HNSW session server process (PID: {self.server_process.pid})...")
self.server_process.terminate()
try:
self.server_process.wait(timeout=5)
print("INFO: HNSW server process terminated.")
except subprocess.TimeoutExpired:
print("WARNING: HNSW server process did not terminate gracefully, killing it.")
self.server_process.kill()
self.server_process = None
@register_backend("hnsw")
class HNSWBackend(LeannBackendFactoryInterface):
@staticmethod
def builder(**kwargs) -> LeannBackendBuilderInterface:
return HNSWBuilder(**kwargs)
@staticmethod
def searcher(index_path: str, **kwargs) -> LeannBackendSearcherInterface:
path = Path(index_path)
meta_path = path.parent / f"{path.stem}.hnsw.meta.json"
if not meta_path.exists():
raise FileNotFoundError(f"Leann metadata file not found at {meta_path}. Cannot infer vector dimension for searcher.")
with open(meta_path, 'r') as f:
meta = json.load(f)
try:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer(meta.get("embedding_model"))
dimensions = model.get_sentence_embedding_dimension()
kwargs['dimensions'] = dimensions
except ImportError:
raise ImportError("sentence-transformers is required to infer embedding dimensions. Please install it.")
except Exception as e:
raise RuntimeError(f"Could not load SentenceTransformer model to get dimension: {e}")
return HNSWSearcher(index_path, **kwargs)
class HNSWBuilder(LeannBackendBuilderInterface):
def __init__(self, **kwargs):
self.build_params = kwargs
def build(self, data: np.ndarray, index_path: str, **kwargs):
"""Build HNSW index using FAISS"""
from . import faiss
path = Path(index_path)
index_dir = path.parent
index_prefix = path.stem
index_dir.mkdir(parents=True, exist_ok=True)
if data.dtype != np.float32:
data = data.astype(np.float32)
if not data.flags['C_CONTIGUOUS']:
data = np.ascontiguousarray(data)
build_kwargs = {**self.build_params, **kwargs}
metric_str = build_kwargs.get("distance_metric", "mips").lower()
metric_enum = get_metric_map().get(metric_str)
if metric_enum is None:
raise ValueError(f"Unsupported distance_metric '{metric_str}'.")
# HNSW parameters
M = build_kwargs.get("M", 32) # Max connections per layer
efConstruction = build_kwargs.get("efConstruction", 200) # Size of the dynamic candidate list for construction
dim = data.shape[1]
print(f"INFO: Building HNSW index for {data.shape[0]} vectors with metric {metric_enum}...")
try:
# Create HNSW index
# Create HNSW index (IndexHNSWFlat takes the metric directly, so the
# same constructor serves both inner product and L2)
index = faiss.IndexHNSWFlat(dim, M, metric_enum)
# Set construction parameters
index.hnsw.efConstruction = efConstruction
# Normalize vectors if using cosine similarity
if metric_str == "cosine":
faiss.normalize_L2(data)
# Add vectors to index (the FAISS Python wrapper accepts the (n, d) array directly)
index.add(data)
# Save index
index_file = index_dir / f"{index_prefix}.index"
faiss.write_index(index, str(index_file))
print(f"✅ HNSW index built successfully at '{index_file}'")
except Exception as e:
print(f"💥 ERROR: HNSW index build failed. Exception: {e}")
raise
class HNSWSearcher(LeannBackendSearcherInterface):
def __init__(self, index_path: str, **kwargs):
from . import faiss
path = Path(index_path)
index_dir = path.parent
index_prefix = path.stem
metric_str = kwargs.get("distance_metric", "mips").lower()
metric_enum = get_metric_map().get(metric_str)
if metric_enum is None:
raise ValueError(f"Unsupported distance_metric '{metric_str}'.")
dimensions = kwargs.get("dimensions")
if not dimensions:
raise ValueError("Vector dimension not provided to HNSWSearcher.")
try:
# Load FAISS HNSW index
index_file = index_dir / f"{index_prefix}.index"
if not index_file.exists():
raise FileNotFoundError(f"HNSW index file not found at {index_file}")
self._index = faiss.read_index(str(index_file))
self.metric_str = metric_str
self.embedding_server_manager = HNSWEmbeddingServerManager()
print("✅ HNSW index loaded successfully.")
except Exception as e:
print(f"💥 ERROR: Failed to load HNSW index. Exception: {e}")
raise
def search(self, query: np.ndarray, top_k: int, **kwargs) -> Dict[str, Any]:
"""Search using HNSW index with optional recompute functionality"""
ef = kwargs.get("ef", 200) # Size of the dynamic candidate list for search
# Recompute parameters
recompute_neighbor_embeddings = kwargs.get("recompute_neighbor_embeddings", False)
zmq_port = kwargs.get("zmq_port", 5556)
embedding_model = kwargs.get("embedding_model", "sentence-transformers/all-mpnet-base-v2")
passages_file = kwargs.get("passages_file", None)
if recompute_neighbor_embeddings:
print(f"INFO: HNSW ZMQ mode enabled - ensuring embedding server is running")
if not self.embedding_server_manager.start_server(zmq_port, embedding_model, passages_file):
print(f"WARNING: Failed to start HNSW embedding server, falling back to standard search")
kwargs['recompute_neighbor_embeddings'] = False
if query.dtype != np.float32:
query = query.astype(np.float32)
if query.ndim == 1:
query = np.expand_dims(query, axis=0)
from . import faiss  # local import, matching the pattern used elsewhere in this module
# Normalize query if using cosine similarity
if self.metric_str == "cosine":
faiss.normalize_L2(query)
try:
# Set search parameter
self._index.hnsw.efSearch = ef
if recompute_neighbor_embeddings:
# A recompute-aware search would require custom HNSW traversal logic;
# fall back to the standard search for now.
print("WARNING: Recompute functionality for HNSW not yet implemented, using standard search")
# Standard FAISS search
distances, labels = self._index.search(query, top_k)
return {"labels": labels, "distances": distances}
except Exception as e:
print(f"💥 ERROR: HNSW search failed. Exception: {e}")
batch_size = query.shape[0]
return {"labels": np.full((batch_size, top_k), -1, dtype=np.int64),
"distances": np.full((batch_size, top_k), float('inf'), dtype=np.float32)}
def __del__(self):
if hasattr(self, 'embedding_server_manager'):
self.embedding_server_manager.stop_server()
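# A minimal usage sketch for this backend (illustrative only: the vector
# shapes and the "./demo.index" path are assumptions, and the searcher
# additionally expects a "demo.hnsw.meta.json" metadata file with an
# "embedding_model" entry next to the index):
#
#   import numpy as np
#   from leann_backend_hnsw.hnsw_backend import HNSWBackend
#
#   vectors = np.random.rand(1000, 768).astype(np.float32)
#   builder = HNSWBackend.builder(distance_metric="mips", M=32, efConstruction=200)
#   builder.build(vectors, "./demo.index")
#
#   searcher = HNSWBackend.searcher("./demo.index", distance_metric="mips")
#   query = np.random.rand(768).astype(np.float32)
#   result = searcher.search(query, top_k=10)
#   print(result["labels"], result["distances"])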

View File

@@ -0,0 +1,583 @@
#!/usr/bin/env python3
"""
HNSW-specific embedding server with the config.py dependencies removed.
Based on the DiskANN embedding server architecture.
"""
import pickle
import argparse
import threading
import time
from transformers import AutoTokenizer, AutoModel
import os
from contextlib import contextmanager
import zmq
import numpy as np
import msgpack
import json
from pathlib import Path
from typing import Dict, Any, Optional, Union
RED = "\033[91m"
RESET = "\033[0m"
def is_similarity_metric():
"""
Check if the metric type is similarity-based (like inner product).
0 = L2 (distance metric), 1 = Inner Product (similarity metric)
"""
return True  # Hardcoded for now: this server assumes METRIC_INNER_PRODUCT (1) in FAISS
# Function for E5-style average pooling
import torch
from torch import Tensor
import torch.nn.functional as F
def e5_average_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]
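# Shape sketch (illustrative): given last_hidden_states of shape
# (batch, seq_len, hidden) and attention_mask of shape (batch, seq_len),
# padded positions are zeroed out and the token sum is divided by the count
# of real tokens, yielding a (batch, hidden) mean-pooled embedding.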
class SimplePassageLoader:
"""
Simple passage loader that replaces config.py dependencies
"""
def __init__(self, passages_data: Optional[Dict[str, Any]] = None):
self.passages_data = passages_data or {}
def __getitem__(self, passage_id: Union[str, int]) -> Dict[str, str]:
"""Get passage by ID"""
str_id = str(passage_id)
if str_id in self.passages_data:
return {"text": self.passages_data[str_id]}
else:
# Return empty text for missing passages
return {"text": ""}
def __len__(self) -> int:
return len(self.passages_data)
def load_passages_from_file(passages_file: str) -> SimplePassageLoader:
"""
Load passages from a JSON file
Expected format: {"passage_id": "passage_text", ...}
"""
if not os.path.exists(passages_file):
print(f"Warning: Passages file {passages_file} not found. Using empty loader.")
return SimplePassageLoader()
try:
with open(passages_file, 'r', encoding='utf-8') as f:
passages_data = json.load(f)
print(f"Loaded {len(passages_data)} passages from {passages_file}")
return SimplePassageLoader(passages_data)
except Exception as e:
print(f"Error loading passages from {passages_file}: {e}")
return SimplePassageLoader()
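# Format sketch for the passages file (the "passages.json" filename is an
# illustrative assumption):
#
#   import json
#   with open("passages.json", "w", encoding="utf-8") as f:
#       json.dump({"0": "first passage text", "1": "second passage text"}, f)
#   loader = load_passages_from_file("passages.json")
#   assert loader["0"]["text"] == "first passage text"
#   assert loader["missing"]["text"] == ""  # unknown IDs return empty text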
def create_hnsw_embedding_server(
passages_file: Optional[str] = None,
passages_data: Optional[Dict[str, str]] = None,
embeddings_file: Optional[str] = None,
use_fp16: bool = True,
use_int8: bool = False,
use_cuda_graphs: bool = False,
zmq_port: int = 5555,
max_batch_size: int = 128,
model_name: str = "sentence-transformers/all-mpnet-base-v2",
custom_max_length_param: Optional[int] = None,
):
"""
Create and start a ZMQ-based embedding server for HNSW backend.
Args:
passages_file: Path to JSON file containing passage ID -> text mapping
passages_data: Direct passage data dict (alternative to passages_file)
embeddings_file: Path to pre-computed embeddings file (optional)
use_fp16: Whether to use FP16 precision
use_int8: Whether to use INT8 quantization
use_cuda_graphs: Whether to use CUDA graphs
zmq_port: ZMQ port to bind to
max_batch_size: Maximum batch size for processing
model_name: Transformer model name
custom_max_length_param: Custom max sequence length
"""
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
# Device setup
mps_available = hasattr(torch.backends, 'mps') and torch.backends.mps.is_available()
cuda_available = torch.cuda.is_available()
print(f"MPS available: {mps_available}")
print(f"CUDA available: {cuda_available}")
if cuda_available:
device = torch.device("cuda")
print("Using CUDA device")
elif mps_available:
device = torch.device("mps")
print("Using MPS device (Apple Silicon)")
else:
device = torch.device("cpu")
print("Using CPU device (no GPU acceleration available)")
# Load model to the appropriate device
print(f"Starting HNSW server on port {zmq_port} with model {model_name}")
model = AutoModel.from_pretrained(model_name).to(device).eval()
# Check port availability
import socket
def check_port(port):
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
return s.connect_ex(('localhost', port)) == 0
if check_port(zmq_port):
print(f"{RED}Port {zmq_port} is already in use{RESET}")
return
# Apply model optimizations (similar to DiskANN version)
if use_fp16 and (cuda_available or mps_available):
model = model.half()
model = torch.compile(model)
print(f"Using FP16 precision with model: {model_name}")
elif use_int8:
print("- Using TorchAO for Int8 dynamic activation and Int8 weight quantization")
from torchao.quantization import quantize_, Int8DynamicActivationInt8WeightConfig
quantize_(model, Int8DynamicActivationInt8WeightConfig())
model = torch.compile(model)
model.eval()
print("- Model successfully quantized and compiled")
# Load passages
if passages_data:
passages = SimplePassageLoader(passages_data)
print(f"Using provided passages data: {len(passages)} passages")
elif passages_file:
passages = load_passages_from_file(passages_file)
else:
passages = SimplePassageLoader()
print("No passages provided, using empty loader")
# Load embeddings if provided
_embeddings = None
if embeddings_file and os.path.exists(embeddings_file):
try:
with open(embeddings_file, "rb") as f:
_embeddings = pickle.load(f)
print(f"Loaded embeddings from {embeddings_file}")
except Exception as e:
print(f"Error loading embeddings: {e}")
class DeviceTimer:
"""Device event-based timer for accurate timing."""
def __init__(self, name="", device=device):
self.name = name
self.device = device
self.start_time = 0
self.end_time = 0
if cuda_available:
self.start_event = torch.cuda.Event(enable_timing=True)
self.end_event = torch.cuda.Event(enable_timing=True)
else:
self.start_event = None
self.end_event = None
@contextmanager
def timing(self):
self.start()
yield
self.end()
def start(self):
if cuda_available:
torch.cuda.synchronize()
self.start_event.record()
else:
if self.device.type == "mps":
torch.mps.synchronize()
self.start_time = time.time()
def end(self):
if cuda_available:
self.end_event.record()
torch.cuda.synchronize()
else:
if self.device.type == "mps":
torch.mps.synchronize()
self.end_time = time.time()
def elapsed_time(self):
if cuda_available:
return self.start_event.elapsed_time(self.end_event) / 1000.0
else:
return self.end_time - self.start_time
def print_elapsed(self):
return # Disabled for now
def process_batch(texts_batch, ids_batch, missing_ids):
"""Process a batch of texts and return embeddings"""
_is_e5_model = "e5" in model_name.lower()
batch_size = len(texts_batch)
# E5 model preprocessing
if _is_e5_model:
processed_texts_batch = [f"passage: {text}" for text in texts_batch]
else:
processed_texts_batch = texts_batch
# Set max length
if _is_e5_model:
current_max_length = custom_max_length_param if custom_max_length_param is not None else 512
else:
current_max_length = custom_max_length_param if custom_max_length_param is not None else 256
tokenize_timer = DeviceTimer("tokenization (batch)", device)
to_device_timer = DeviceTimer("transfer to device (batch)", device)
embed_timer = DeviceTimer("embedding (batch)", device)
pool_timer = DeviceTimer("pooling (batch)", device)
norm_timer = DeviceTimer("normalization (batch)", device)
with tokenize_timer.timing():
encoded_batch = tokenizer(
processed_texts_batch,
padding="max_length",
truncation=True,
max_length=current_max_length,
return_tensors="pt",
return_token_type_ids=False,
)
seq_length = encoded_batch["input_ids"].size(1)
with to_device_timer.timing():
enc = {k: v.to(device) for k, v in encoded_batch.items()}
with torch.no_grad():
with embed_timer.timing():
out = model(enc["input_ids"], enc["attention_mask"])
with pool_timer.timing():
if not hasattr(out, 'last_hidden_state'):
if isinstance(out, torch.Tensor) and len(out.shape) == 2:
pooled_embeddings = out
else:
print(f"{RED}ERROR: Cannot determine how to pool. Output shape: {out.shape if isinstance(out, torch.Tensor) else 'N/A'}{RESET}")
hidden_dim = getattr(model.config, 'hidden_size', 384 if _is_e5_model else 768)
pooled_embeddings = torch.zeros((batch_size, hidden_dim), device=device, dtype=torch.float32)  # embeddings are float, not the integer dtype of input_ids
elif _is_e5_model:
pooled_embeddings = e5_average_pool(out.last_hidden_state, enc['attention_mask'])
else:
hidden_states = out.last_hidden_state
mask_expanded = enc["attention_mask"].unsqueeze(-1).expand(hidden_states.size()).float()
sum_embeddings = torch.sum(hidden_states * mask_expanded, 1)
sum_mask = torch.clamp(mask_expanded.sum(1), min=1e-9)
pooled_embeddings = sum_embeddings / sum_mask
final_embeddings = pooled_embeddings
if _is_e5_model:
with norm_timer.timing():
final_embeddings = F.normalize(pooled_embeddings, p=2, dim=1)
if torch.isnan(final_embeddings).any() or torch.isinf(final_embeddings).any():
print(f"{RED}!!! In process_batch: NaN or Inf detected in final_embeddings! "
f"Model: {model_name}, E5: {_is_e5_model}. IDs (sample): {ids_batch[:5]}...{RESET}")
dim_size = final_embeddings.shape[-1]
error_output = torch.zeros((batch_size, dim_size), device='cpu', dtype=torch.float32).numpy()
print(f"{RED}Returning zero embeddings of shape ({batch_size}, {dim_size}) due to NaN/Inf.{RESET}")
return error_output
return final_embeddings.cpu().numpy()
def client_warmup(zmq_port):
"""Perform client-side warmup"""
time.sleep(2)
print(f"Performing client-side warmup with model {model_name}...")
sample_ids = ["1", "2", "3", "4", "5"]
try:
context = zmq.Context()
socket = context.socket(zmq.REQ)
socket.connect(f"tcp://localhost:{zmq_port}")
socket.setsockopt(zmq.RCVTIMEO, 30000)
socket.setsockopt(zmq.SNDTIMEO, 30000)
try:
ids_to_send = [int(x) for x in sample_ids]
except ValueError:
ids_to_send = []
if not ids_to_send:
print("Skipping warmup send.")
return
request_payload = [ids_to_send]
request_bytes = msgpack.packb(request_payload)
for i in range(3):
print(f"Sending warmup request {i+1}/3 via ZMQ (MessagePack)...")
socket.send(request_bytes)
response_bytes = socket.recv()
response_payload = msgpack.unpackb(response_bytes)
dimensions = response_payload[0]
embeddings_count = dimensions[0] if dimensions and len(dimensions) > 0 else 0
print(f"Warmup request {i+1}/3 successful, received {embeddings_count} embeddings")
time.sleep(0.1)
print("Client-side MessagePack ZMQ warmup complete")
socket.close()
context.term()
except Exception as e:
print(f"Error during MessagePack ZMQ warmup: {e}")
def zmq_server_thread():
"""ZMQ server thread"""
context = zmq.Context()
socket = context.socket(zmq.REP)
socket.bind(f"tcp://*:{zmq_port}")
print(f"HNSW ZMQ server listening on port {zmq_port}")
socket.setsockopt(zmq.RCVTIMEO, 300000)
socket.setsockopt(zmq.SNDTIMEO, 300000)
while True:
try:
message_bytes = socket.recv()
print(f"Received ZMQ request of size {len(message_bytes)} bytes")
e2e_start = time.time()
lookup_timer = DeviceTimer("text lookup", device)
try:
request_payload = msgpack.unpackb(message_bytes)
# Handle distance calculation requests
if isinstance(request_payload, list) and len(request_payload) == 2 and isinstance(request_payload[0], list) and isinstance(request_payload[1], list):
node_ids = request_payload[0]
query_vector = np.array(request_payload[1], dtype=np.float32)
print(f"Request for distance calculation: {len(node_ids)} nodes, query vector dim: {len(query_vector)}")
# Get embeddings for node IDs
texts = []
missing_ids = []
with lookup_timer.timing():
for nid in node_ids:
txtinfo = passages[nid]
if txtinfo is None or txtinfo["text"] == "":
print(f"Warning: Passage with ID {nid} not found")
missing_ids.append(nid)
txt = ""
else:
txt = txtinfo["text"]
texts.append(txt)
lookup_timer.print_elapsed()
# Process embeddings in chunks if needed
all_node_embeddings = []
total_size = len(texts)
if total_size > max_batch_size:
for i in range(0, total_size, max_batch_size):
end_idx = min(i + max_batch_size, total_size)
chunk_texts = texts[i:end_idx]
chunk_ids = node_ids[i:end_idx]
embeddings_chunk = process_batch(chunk_texts, chunk_ids, missing_ids)
all_node_embeddings.append(embeddings_chunk)
if cuda_available:
torch.cuda.empty_cache()
elif device.type == "mps":
torch.mps.empty_cache()
node_embeddings = np.vstack(all_node_embeddings)
else:
node_embeddings = process_batch(texts, node_ids, missing_ids)
# Calculate distances
query_tensor = torch.tensor(query_vector, device=device).float()
node_embeddings_tensor = torch.tensor(node_embeddings, device=device).float()
calc_timer = DeviceTimer("distance calculation", device)
with calc_timer.timing():
with torch.no_grad():
if is_similarity_metric():
node_embeddings_np = node_embeddings_tensor.cpu().numpy()
query_np = query_tensor.cpu().numpy()
distances = -np.dot(node_embeddings_np, query_np)
else:
node_embeddings_np = node_embeddings_tensor.cpu().numpy().astype(np.float32)
query_np = query_tensor.cpu().numpy().astype(np.float32)
distances = np.sum(np.square(node_embeddings_np - query_np.reshape(1, -1)), axis=1)
calc_timer.print_elapsed()
try:
response_payload = distances.flatten().tolist()
response_bytes = msgpack.packb([response_payload], use_single_float=True)
print(f"Sending distance response with {len(distances)} distances")
except Exception as pack_error:
print(f"Error packing MessagePack distance response: {pack_error}")
response_bytes = msgpack.packb([[]])
socket.send(response_bytes)
if device.type == "cuda":
torch.cuda.synchronize()
elif device.type == "mps":
torch.mps.synchronize()
e2e_end = time.time()
print(f"Distance calculation E2E time: {e2e_end - e2e_start:.6f} seconds")
continue
# Standard embedding request
if not isinstance(request_payload, list) or len(request_payload) != 1 or not isinstance(request_payload[0], list):
print(f"Error: Invalid MessagePack request format. Expected [[ids...]], got: {type(request_payload)}")
socket.send(msgpack.packb([[], []]))
continue
node_ids = request_payload[0]
print(f"Request for {len(node_ids)} node embeddings")
except Exception as unpack_error:
print(f"Error unpacking MessagePack request: {unpack_error}")
socket.send(msgpack.packb([[], []]))
continue
# Look up texts by node IDs
texts = []
missing_ids = []
with lookup_timer.timing():
for nid in node_ids:
txtinfo = passages[nid]
if txtinfo is None or txtinfo["text"] == "":
print(f"Warning: Passage with ID {nid} not found")
missing_ids.append(nid)
txt = ""
else:
txt = txtinfo["text"]
texts.append(txt)
lookup_timer.print_elapsed()
if missing_ids:
print(f"Missing passages for IDs: {missing_ids}")
# Process in chunks
total_size = len(texts)
print(f"Total batch size: {total_size}, max_batch_size: {max_batch_size}")
all_embeddings = []
if total_size > max_batch_size:
print(f"Splitting batch of size {total_size} into chunks of {max_batch_size}")
for i in range(0, total_size, max_batch_size):
end_idx = min(i + max_batch_size, total_size)
print(f"Processing chunk {i//max_batch_size + 1}/{(total_size + max_batch_size - 1)//max_batch_size}: items {i} to {end_idx-1}")
chunk_texts = texts[i:end_idx]
chunk_ids = node_ids[i:end_idx]
embeddings_chunk = process_batch(chunk_texts, chunk_ids, missing_ids)
all_embeddings.append(embeddings_chunk)
if cuda_available:
torch.cuda.empty_cache()
elif device.type == "mps":
torch.mps.empty_cache()
hidden = np.vstack(all_embeddings)
print(f"Combined embeddings shape: {hidden.shape}")
else:
hidden = process_batch(texts, node_ids, missing_ids)
# Serialization and response
ser_start = time.time()
print(f"DEBUG zmq_server_thread: Final 'hidden' array | Shape: {hidden.shape} | Dtype: {hidden.dtype} | Has NaN/Inf: {np.isnan(hidden).any() or np.isinf(hidden).any()}")
if np.isnan(hidden).any() or np.isinf(hidden).any():
print(f"{RED}!!! ERROR: NaN or Inf detected in final 'hidden' numpy array BEFORE sending! "
f"Requested IDs (sample): {node_ids[:5]}...{RESET}")
raise ValueError("NaN/Inf detected in final embeddings; aborting response")
try:
hidden_contiguous_f32 = np.ascontiguousarray(hidden, dtype=np.float32)
response_payload = [
list(hidden_contiguous_f32.shape),
hidden_contiguous_f32.flatten().tolist()
]
response_bytes = msgpack.packb(response_payload, use_single_float=True)
except Exception as pack_error:
print(f"Error packing MessagePack response: {pack_error}")
response_bytes = msgpack.packb([[], []])
socket.send(response_bytes)
ser_end = time.time()
print(f"Serialize time: {ser_end - ser_start:.6f} seconds")
if device.type == "cuda":
torch.cuda.synchronize()
elif device.type == "mps":
torch.mps.synchronize()
e2e_end = time.time()
print(f"ZMQ E2E time: {e2e_end - e2e_start:.6f} seconds")
except zmq.Again:
print("ZMQ socket timeout, continuing to listen")
continue
except Exception as e:
print(f"Error in ZMQ server loop: {e}")
import traceback
traceback.print_exc()
try:
socket.send(msgpack.packb([[], []]))
except Exception:
pass
# Start warmup and server threads
if len(passages) > 0:
warmup_thread = threading.Thread(target=client_warmup, args=(zmq_port,))
warmup_thread.daemon = True
warmup_thread.start()
zmq_thread = threading.Thread(target=zmq_server_thread, daemon=True)
zmq_thread.start()
print(f"Started HNSW ZMQ server thread on port {zmq_port}")
# Keep the main thread alive
try:
while True:
time.sleep(1)
except KeyboardInterrupt:
print("HNSW Server shutting down...")
return
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="HNSW Embedding service")
parser.add_argument("--zmq-port", type=int, default=5555, help="ZMQ port to run on")
parser.add_argument("--passages-file", type=str, help="JSON file containing passage ID to text mapping")
parser.add_argument("--embeddings-file", type=str, help="Pickle file containing pre-computed embeddings")
parser.add_argument("--use-fp16", action="store_true", default=False)
parser.add_argument("--use-int8", action="store_true", default=False)
parser.add_argument("--use-cuda-graphs", action="store_true", default=False)
parser.add_argument("--max-batch-size", type=int, default=128, help="Maximum batch size before splitting")
parser.add_argument("--model-name", type=str, default="sentence-transformers/all-mpnet-base-v2",
help="Embedding model name")
parser.add_argument("--custom-max-length", type=int, default=None, help="Override model's default max sequence length")
args = parser.parse_args()
# Create and start the HNSW embedding server
create_hnsw_embedding_server(
passages_file=args.passages_file,
embeddings_file=args.embeddings_file,
use_fp16=args.use_fp16,
use_int8=args.use_int8,
use_cuda_graphs=args.use_cuda_graphs,
zmq_port=args.zmq_port,
max_batch_size=args.max_batch_size,
model_name=args.model_name,
custom_max_length_param=args.custom_max_length,
)
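# For reference, a minimal client sketch for the wire protocol implemented
# above (MessagePack over a ZMQ REQ socket; the port and vector dimension are
# assumptions matching the defaults in this file):
#
#   import zmq, msgpack
#   ctx = zmq.Context()
#   sock = ctx.socket(zmq.REQ)
#   sock.connect("tcp://localhost:5555")
#   # Embedding request: [[ids]] -> [shape, flat_float32_values]
#   sock.send(msgpack.packb([[0, 1, 2]]))
#   shape, flat = msgpack.unpackb(sock.recv())
#   # Distance request: [[ids], query_vector] -> [distances]
#   sock.send(msgpack.packb([[0, 1], [0.0] * 768]))
#   (distances,) = msgpack.unpackb(sock.recv())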

View File

@@ -0,0 +1,18 @@
# File: packages/leann-backend-hnsw/pyproject.toml
[build-system]
requires = ["scikit-build-core>=0.10", "numpy", "swig"]
build-backend = "scikit_build_core.build"
[project]
name = "leann-backend-hnsw"
version = "0.1.0"
description = "Custom-built HNSW (Faiss) backend for the Leann toolkit."
dependencies = ["leann-core==0.1.0", "numpy"]
# Revert to the most standard scikit-build-core configuration
[tool.scikit-build]
wheel.packages = ["leann_backend_hnsw"]
editable.mode = "redirect"
cmake.build-type = "Debug"
build.verbose = true
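# With the configuration above, a local editable install is typically:
#   pip install -e packages/leann-backend-hnsw
# (the path is an assumption based on the file comment at the top)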

View File

@@ -0,0 +1,88 @@
---
AccessModifierOffset: -1
AlignAfterOpenBracket: AlwaysBreak
AlignConsecutiveAssignments: false
AlignConsecutiveDeclarations: false
AlignEscapedNewlinesLeft: true
AlignOperands: false
AlignTrailingComments: true
AllowAllParametersOfDeclarationOnNextLine: false
AllowShortBlocksOnASingleLine: false
AllowShortCaseLabelsOnASingleLine: false
AllowShortFunctionsOnASingleLine: Empty
AllowShortIfStatementsOnASingleLine: false
AllowShortLoopsOnASingleLine: false
AlwaysBreakAfterReturnType: None
AlwaysBreakBeforeMultilineStrings: true
AlwaysBreakTemplateDeclarations: true
BinPackArguments: false # at some point, set this to true
BinPackParameters: false # at some point, set this to true
BraceWrapping:
AfterClass: false
AfterControlStatement: false
AfterEnum: false
AfterFunction: false
AfterNamespace: false
AfterObjCDeclaration: false
AfterStruct: false
AfterUnion: false
BeforeCatch: false
BeforeElse: false
IndentBraces: false
BreakBeforeBinaryOperators: None
BreakBeforeBraces: Attach
BreakBeforeTernaryOperators: true
BreakConstructorInitializersBeforeComma: false
BreakAfterJavaFieldAnnotations: false
BreakStringLiterals: false
ColumnLimit: 80
CommentPragmas: '^ IWYU pragma:'
CompactNamespaces: false
ConstructorInitializerAllOnOneLineOrOnePerLine: true
ConstructorInitializerIndentWidth: 8
ContinuationIndentWidth: 8
Cpp11BracedListStyle: true
DerivePointerAlignment: false
DisableFormat: false
ForEachMacros: [ FOR_EACH_RANGE, FOR_EACH, ]
IncludeCategories:
- Regex: '^<.*\.h(pp)?>'
Priority: 1
- Regex: '^<.*'
Priority: 2
- Regex: '.*'
Priority: 3
IndentCaseLabels: true
IndentWidth: 4
IndentWrappedFunctionNames: false
KeepEmptyLinesAtTheStartOfBlocks: false
MacroBlockBegin: ''
MacroBlockEnd: ''
MaxEmptyLinesToKeep: 1
NamespaceIndentation: None
ObjCBlockIndentWidth: 4
ObjCSpaceAfterProperty: false
ObjCSpaceBeforeProtocolList: false
PenaltyBreakBeforeFirstCallParameter: 1
PenaltyBreakComment: 300
PenaltyBreakFirstLessLess: 120
PenaltyBreakString: 1000
PenaltyExcessCharacter: 1000000
PenaltyReturnTypeOnItsOwnLine: 2000000
PointerAlignment: Left
ReflowComments: true
SortIncludes: true
SpaceAfterCStyleCast: false
SpaceBeforeAssignmentOperators: true
SpaceBeforeParens: ControlStatements
SpaceInEmptyParentheses: false
SpacesBeforeTrailingComments: 1
SpacesInAngles: false
SpacesInContainerLiterals: true
SpacesInCStyleCastParentheses: false
SpacesInParentheses: false
SpacesInSquareBrackets: false
Standard: Cpp11
TabWidth: 8
UseTab: Never
...

View File

@@ -0,0 +1 @@
sift1M

View File

@@ -0,0 +1,33 @@
# Summary
<!-- Facebook has a bounty program for the safe disclosure of security bugs. In
those cases, please go through the process outlined on that page and do not
file a public issue. -->
# Platform
<!-- if the question/problem is not platform-specific, please ignore this -->
OS: <!-- e.g. macOS 10.13.3 -->
Faiss version: <!-- git commit, e.g. 56383610bcb982d6591e2e2bea3516cb7723e04a -->
Installed from: <!-- anaconda? compiled by yourself ? -->
Faiss compilation options: <!-- e.g. using MKL with compile flags ... -->
Running on:
- [ ] CPU
- [ ] GPU
Interface:
- [ ] C++
- [ ] Python
# Reproduction instructions
<!-- Please provide specific and comprehensive instructions to reproduce the
described behavior. -->
<!-- Please *do not* post screenshots of logs. They are not searchable. Copy/paste
the text or make a gist if the text is too bulky. -->

View File

@@ -0,0 +1,189 @@
name: Build cmake
inputs:
opt_level:
description: 'Compile options / optimization level.'
required: false
default: generic
gpu:
description: 'Enable GPU support.'
required: false
default: OFF
cuvs:
description: 'Enable cuVS support.'
required: false
default: OFF
rocm:
description: 'Enable ROCm support.'
required: false
default: OFF
runs:
using: composite
steps:
- name: Setup miniconda
uses: conda-incubator/setup-miniconda@v3
with:
python-version: '3.11'
miniforge-version: latest # ensures conda-forge channel is used.
channels: conda-forge
conda-remove-defaults: 'true'
# Set to aarch64 if we're on arm64 because there's no miniforge ARM64 package, just aarch64.
# They are the same thing, just named differently.
architecture: ${{ runner.arch == 'ARM64' && 'aarch64' || runner.arch }}
- name: Configure build environment
shell: bash
run: |
# initialize Conda
conda config --set solver libmamba
# Ensure starting packages are from conda-forge.
conda list --show-channel-urls
conda update -y -q conda
echo "$CONDA/bin" >> $GITHUB_PATH
conda install -y -q python=3.11 cmake=3.26 make=4.2 swig=4.0 "numpy<2" scipy=1.14 pytest=7.4 gflags=2.2
# install base packages for ARM64
if [ "${{ runner.arch }}" = "ARM64" ]; then
conda install -y -q -c conda-forge openblas=0.3.29 gxx_linux-aarch64=14.2 sysroot_linux-aarch64=2.17
fi
# install base packages for X86_64
if [ "${{ runner.arch }}" = "X64" ]; then
# TODO: merge this with ARM64
conda install -y -q -c conda-forge gxx_linux-64=14.2 sysroot_linux-64=2.17
conda install -y -q mkl=2022.2.1 mkl-devel=2022.2.1
fi
# no CUDA needed for ROCm so skip this
if [ "${{ inputs.rocm }}" = "ON" ]; then
:
# regular CUDA for GPU builds
elif [ "${{ inputs.gpu }}" = "ON" ] && [ "${{ inputs.cuvs }}" = "OFF" ]; then
conda install -y -q cuda-toolkit=12.4 -c "nvidia/label/cuda-12.4.0"
# and CUDA from cuVS channel for cuVS builds
elif [ "${{ inputs.cuvs }}" = "ON" ]; then
conda install -y -q libcuvs=24.12 'cuda-version>=12.0,<=12.5' cuda-toolkit=12.4.1 gxx_linux-64=12.4 -c rapidsai -c conda-forge
fi
# install test packages
if [ "${{ inputs.rocm }}" = "ON" ]; then
: # skip torch install via conda, we need to install via pip to get
# ROCm-enabled version until it's supported in conda by PyTorch
elif [ "${{ inputs.gpu }}" = "ON" ]; then
conda install -y -q "pytorch<2.5" pytorch-cuda=12.4 -c pytorch -c "nvidia/label/cuda-12.4.0"
else
conda install -y -q "pytorch<2.5" -c pytorch
fi
- name: ROCm - Install dependencies
if: inputs.rocm == 'ON'
shell: bash
run: |
# Update repos and install kmod, wget, gpg
sudo apt-get -qq update >/dev/null
sudo apt-get -qq install -y kmod wget gpg >/dev/null
# Get UBUNTU version name
UBUNTU_VERSION_NAME=`cat /etc/os-release | grep UBUNTU_CODENAME | awk -F= '{print $2}'`
# Set ROCm version
ROCM_VERSION="6.2"
# Download, prepare, and install the package signing key
mkdir --parents --mode=0755 /etc/apt/keyrings
wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | gpg --dearmor | sudo tee /etc/apt/keyrings/rocm.gpg > /dev/null
# Add rocm repository
wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | sudo apt-key add -
rocm_baseurl="http://repo.radeon.com/rocm/apt/${ROCM_VERSION}"
echo "deb [arch=amd64] ${rocm_baseurl} ${UBUNTU_VERSION_NAME} main" | sudo tee /etc/apt/sources.list.d/rocm.list
sudo apt-get -qq update --allow-insecure-repositories >/dev/null
sudo apt-get -qq install -y --allow-unauthenticated \
"rocm-dev${ROCM_VERSION}" "rocm-utils${ROCM_VERSION}" \
"rocm-libs${ROCM_VERSION}" >/dev/null
# Fake presence of MI200-class accelerators
echo "gfx90a" | sudo tee /opt/rocm/bin/target.lst
# Cleanup
sudo apt-get -qq autoclean >/dev/null
sudo apt-get -qq clean >/dev/null
sudo rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
- name: Symlink system dependencies
if: inputs.rocm == 'ON'
shell: bash
run: |
# symlink system libraries for HIP compiler
sudo ln -s /lib/x86_64-linux-gnu/libc.so.6 /lib64/libc.so.6
sudo ln -s /lib/x86_64-linux-gnu/libc_nonshared.a /usr/lib64/libc_nonshared.a
sudo ln -s /usr/lib/x86_64-linux-gnu/libpthread.so.0 /lib64/libpthread.so.0
sudo ln -s $HOME/miniconda3/x86_64-conda-linux-gnu/sysroot/usr/lib64/libpthread_nonshared.a /usr/lib64/libpthread_nonshared.a
- name: Build all targets
shell: bash
run: |
eval "$(conda shell.bash hook)"
conda activate
cmake -B build \
-DBUILD_TESTING=ON \
-DBUILD_SHARED_LIBS=ON \
-DFAISS_ENABLE_GPU=${{ inputs.gpu }} \
-DFAISS_ENABLE_CUVS=${{ inputs.cuvs }} \
-DFAISS_ENABLE_ROCM=${{ inputs.rocm }} \
-DFAISS_OPT_LEVEL=${{ inputs.opt_level }} \
-DFAISS_ENABLE_C_API=ON \
-DPYTHON_EXECUTABLE=$CONDA/bin/python \
-DCMAKE_BUILD_TYPE=Release \
-DBLA_VENDOR=${{ runner.arch == 'X64' && 'Intel10_64_dyn' || '' }} \
-DCMAKE_CUDA_FLAGS=${{ runner.arch == 'X64' && '"-gencode arch=compute_75,code=sm_75"' || '' }} \
.
make -k -C build -j$(nproc)
- name: C++ tests
shell: bash
run: |
export GTEST_OUTPUT="xml:$(realpath .)/test-results/googletest/"
make -C build test
- name: C++ perf benchmarks
shell: bash
if: inputs.rocm == 'OFF'
run: |
find ./build/perf_tests/ -executable -type f -name "bench*" -exec '{}' -v \;
- name: Install Python extension
shell: bash
working-directory: build/faiss/python
run: |
$CONDA/bin/python setup.py install
- name: ROCm - install ROCm-enabled torch via pip
if: inputs.rocm == 'ON'
shell: bash
run: |
pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.1
- name: Python tests (CPU only)
if: inputs.gpu == 'OFF'
shell: bash
run: |
pytest --junitxml=test-results/pytest/results.xml tests/test_*.py
pytest --junitxml=test-results/pytest/results-torch.xml tests/torch_*.py
- name: Python tests (CPU + GPU)
if: inputs.gpu == 'ON'
shell: bash
run: |
pytest --junitxml=test-results/pytest/results.xml tests/test_*.py
pytest --junitxml=test-results/pytest/results-torch.xml tests/torch_*.py
cp tests/common_faiss_tests.py faiss/gpu/test
pytest --junitxml=test-results/pytest/results-gpu.xml faiss/gpu/test/test_*.py
pytest --junitxml=test-results/pytest/results-gpu-torch.xml faiss/gpu/test/torch_*.py
- name: Test avx2 loading
if: inputs.opt_level == 'avx2'
shell: bash
run: |
FAISS_DISABLE_CPU_FEATURES=AVX2 LD_DEBUG=libs $CONDA/bin/python -c "import faiss" 2>&1 | grep faiss.so
LD_DEBUG=libs $CONDA/bin/python -c "import faiss" 2>&1 | grep faiss_avx2.so
- name: Upload test results
if: always()
uses: actions/upload-artifact@v4
with:
name: test-results-arch=${{ runner.arch }}-opt=${{ inputs.opt_level }}-gpu=${{ inputs.gpu }}-cuvs=${{ inputs.cuvs }}-rocm=${{ inputs.rocm }}
path: test-results
- name: Check installed packages channel
shell: bash
run: |
# Shows that all installed packages are from conda-forge.
conda list --show-channel-urls

View File

@@ -0,0 +1,107 @@
name: Conda build
description: Builds Faiss inside a Conda environment and uploads to repository when label is provided.
inputs:
label:
description: "The label to be used for uploads to Conda."
default: ""
required: false
cuda:
description: "CUDA toolkit version to use."
default: ""
required: false
cuvs:
description: "Enable cuVS support."
default: ""
required: false
runs:
using: composite
steps:
- name: Choose shell
shell: bash
id: choose_shell
run: |
# Use pwsh on Windows; bash everywhere else
if [ "${{ runner.os }}" != "Windows" ]; then
echo "shell=bash" >> "$GITHUB_OUTPUT"
else
echo "shell=pwsh" >> "$GITHUB_OUTPUT"
fi
- name: Setup miniconda
uses: conda-incubator/setup-miniconda@v3
with:
python-version: '3.11'
miniforge-version: latest # ensures conda-forge channel is used.
channels: conda-forge
conda-remove-defaults: 'true'
# Set to runner.arch=aarch64 if we're on arm64 because
# there's no miniforge ARM64 package, just aarch64.
# They are the same thing, just named differently.
# However there is an ARM64 for macOS, so exclude that.
architecture: ${{ (runner.arch == 'ARM64' && runner.os != 'macOS') && 'aarch64' || runner.arch }}
- name: Install conda build tools
shell: ${{ steps.choose_shell.outputs.shell }}
run: |
# Ensure starting packages are from conda-forge.
conda list --show-channel-urls
conda install -y -q "conda!=24.11.0"
conda install -y -q "conda-build!=24.11.0" "liblief=0.14.1"
conda list --show-channel-urls
- name: Enable anaconda uploads
if: inputs.label != ''
shell: ${{ steps.choose_shell.outputs.shell }}
env:
PACKAGE_TYPE: ${{ inputs.label }}
run: |
conda install -y -q anaconda-client
conda config --set anaconda_upload yes
- name: Conda build (CPU)
if: inputs.label == '' && inputs.cuda == ''
shell: ${{ steps.choose_shell.outputs.shell }}
working-directory: conda
run: |
conda build faiss --python 3.11 -c pytorch
- name: Conda build (CPU) w/ anaconda upload
if: inputs.label != '' && inputs.cuda == ''
shell: ${{ steps.choose_shell.outputs.shell }}
working-directory: conda
env:
PACKAGE_TYPE: ${{ inputs.label }}
run: |
conda build faiss --user pytorch --label ${{ inputs.label }} -c pytorch
- name: Conda build (GPU)
if: inputs.label == '' && inputs.cuda != '' && inputs.cuvs == ''
shell: ${{ steps.choose_shell.outputs.shell }}
working-directory: conda
run: |
conda build faiss-gpu --variants '{ "cudatoolkit": "${{ inputs.cuda }}" }' \
-c pytorch -c nvidia/label/cuda-${{ inputs.cuda }} -c nvidia
- name: Conda build (GPU) w/ anaconda upload
if: inputs.label != '' && inputs.cuda != '' && inputs.cuvs == ''
shell: ${{ steps.choose_shell.outputs.shell }}
working-directory: conda
env:
PACKAGE_TYPE: ${{ inputs.label }}
run: |
conda build faiss-gpu --variants '{ "cudatoolkit": "${{ inputs.cuda }}" }' \
--user pytorch --label ${{ inputs.label }} -c pytorch -c nvidia/label/cuda-${{ inputs.cuda }} -c nvidia
- name: Conda build (GPU w/ cuVS)
if: inputs.label == '' && inputs.cuda != '' && inputs.cuvs != ''
shell: ${{ steps.choose_shell.outputs.shell }}
working-directory: conda
run: |
conda build faiss-gpu-cuvs --variants '{ "cudatoolkit": "${{ inputs.cuda }}" }' \
-c pytorch -c rapidsai -c rapidsai-nightly -c conda-forge -c nvidia
- name: Conda build (GPU w/ cuVS) w/ anaconda upload
if: inputs.label != '' && inputs.cuda != '' && inputs.cuvs != ''
shell: ${{ steps.choose_shell.outputs.shell }}
working-directory: conda
env:
PACKAGE_TYPE: ${{ inputs.label }}
run: |
conda build faiss-gpu-cuvs --variants '{ "cudatoolkit": "${{ inputs.cuda }}" }' \
--user pytorch --label ${{ inputs.label }} -c pytorch -c rapidsai -c rapidsai-nightly -c conda-forge -c nvidia
- name: Check installed packages channel
shell: ${{ steps.choose_shell.outputs.shell }}
run: |
# Shows that all installed packages are from conda-forge.
conda list --show-channel-urls

View File

@@ -0,0 +1,23 @@
name: Close Inactive Issues
on:
schedule:
- cron: "30 1 * * *"
jobs:
close-issues:
runs-on: ubuntu-latest
permissions:
issues: write
pull-requests: write
steps:
- uses: actions/stale@v5
with:
only-labels: autoclose
days-before-issue-stale: 7
days-before-issue-close: 7
stale-issue-label: "stale"
stale-issue-message: "This issue is stale because it has been open for 7 days with no activity."
close-issue-message: "This issue was closed because it has been inactive for 7 days since being marked as stale."
days-before-pr-stale: -1
days-before-pr-close: -1
repo-token: ${{ secrets.GITHUB_TOKEN }}

View File

@@ -0,0 +1,169 @@
on:
workflow_call:
env:
OMP_NUM_THREADS: '10'
MKL_THREADING_LAYER: GNU
jobs:
format:
name: Format
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Install clang-format
run: |
sudo apt-get update -y
sudo apt-get install -y wget
sudo apt install -y lsb-release wget software-properties-common gnupg
wget https://apt.llvm.org/llvm.sh
chmod u+x llvm.sh
sudo ./llvm.sh 18
sudo apt-get install -y git-core clang-format-18
- name: Verify clang-format
run: |
git ls-files | grep -E '\.(cpp|h|cu|cuh)$' | xargs clang-format-18 -i
if git diff --quiet; then
echo "Formatting OK!"
else
echo "Formatting not OK!"
echo "------------------"
git --no-pager diff --color
exit 1
fi
linux-x86_64-cmake:
name: Linux x86_64 (cmake)
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Build and Test (cmake)
uses: ./.github/actions/build_cmake
linux-x86_64-AVX2-cmake:
name: Linux x86_64 AVX2 (cmake)
needs: linux-x86_64-cmake
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Build and Test (cmake)
uses: ./.github/actions/build_cmake
with:
opt_level: avx2
linux-x86_64-AVX512-cmake:
name: Linux x86_64 AVX512 (cmake)
needs: linux-x86_64-cmake
runs-on: faiss-aws-m7i.large
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Build and Test (cmake)
uses: ./.github/actions/build_cmake
with:
opt_level: avx512
linux-x86_64-AVX512_SPR-cmake:
name: Linux x86_64 AVX512_SPR (cmake)
needs: linux-x86_64-cmake
runs-on: faiss-aws-m7i.large
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Build and Test (cmake)
uses: ./.github/actions/build_cmake
with:
opt_level: avx512_spr
linux-x86_64-GPU-cmake:
name: Linux x86_64 GPU (cmake)
needs: linux-x86_64-cmake
runs-on: 4-core-ubuntu-gpu-t4
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Build and Test (cmake)
uses: ./.github/actions/build_cmake
with:
gpu: ON
linux-x86_64-GPU-w-CUVS-cmake:
name: Linux x86_64 GPU w/ cuVS (cmake)
needs: linux-x86_64-cmake
runs-on: 4-core-ubuntu-gpu-t4
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Build and Test (cmake)
uses: ./.github/actions/build_cmake
with:
gpu: ON
cuvs: ON
linux-x86_64-GPU-w-ROCm-cmake:
name: Linux x86_64 GPU w/ ROCm (cmake)
needs: linux-x86_64-cmake
runs-on: faiss-amd-MI200
container:
image: ubuntu:22.04
options: --device=/dev/kfd --device=/dev/dri --ipc=host --shm-size 16G --group-add video --cap-add=SYS_PTRACE --cap-add=SYS_ADMIN
steps:
- name: Container setup
run: |
if [ -f /.dockerenv ]; then
apt-get update && apt-get install -y sudo && apt-get install -y git
git config --global --add safe.directory '*'
else
echo 'Skipping. Current job is not running inside a container.'
fi
- name: Checkout
uses: actions/checkout@v4
- name: Build and Test (cmake)
uses: ./.github/actions/build_cmake
with:
gpu: ON
rocm: ON
linux-arm64-SVE-cmake:
name: Linux arm64 SVE (cmake)
needs: linux-x86_64-cmake
runs-on: faiss-aws-r8g.large
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Build and Test (cmake)
uses: ./.github/actions/build_cmake
with:
opt_level: sve
env:
# Context: https://github.com/facebookresearch/faiss/wiki/Troubleshooting#surprising-faiss-openmp-and-openblas-interaction
OPENBLAS_NUM_THREADS: '1'
linux-x86_64-conda:
name: Linux x86_64 (conda)
needs: linux-x86_64-cmake
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
fetch-tags: true
- name: Build and Package (conda)
uses: ./.github/actions/build_conda
windows-x86_64-conda:
name: Windows x86_64 (conda)
needs: linux-x86_64-cmake
runs-on: windows-2019
steps:
- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
fetch-tags: true
- name: Build and Package (conda)
uses: ./.github/actions/build_conda
linux-arm64-conda:
name: Linux arm64 (conda)
needs: linux-x86_64-cmake
runs-on: 2-core-ubuntu-arm
steps:
- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
fetch-tags: true
- name: Build and Package (conda)
uses: ./.github/actions/build_conda

View File

@@ -0,0 +1,144 @@
on:
workflow_call:
secrets:
ANACONDA_API_TOKEN:
required: true
env:
OMP_NUM_THREADS: '10'
MKL_THREADING_LAYER: GNU
jobs:
linux-x86_64-packages:
name: Linux x86_64 packages
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
fetch-tags: true
- name: Build and Package (conda)
uses: ./.github/actions/build_conda
env:
ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }}
with:
label: main
linux-x86_64-GPU-packages-CUDA-11-4-4:
name: Linux x86_64 GPU packages (CUDA 11.4.4)
runs-on: 4-core-ubuntu-gpu-t4
env:
CUDA_ARCHS: "60-real;61-real;62-real;70-real;72-real;75-real;80;86-real"
FAISS_FLATTEN_CONDA_INCLUDES: "1"
steps:
- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
fetch-tags: true
- name: Build and Package (conda)
uses: ./.github/actions/build_conda
env:
ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }}
with:
label: main
cuda: "11.4.4"
linux-x86_64-GPU-CUVS-packages-CUDA11-8-0:
name: Linux x86_64 GPU w/ cuVS packages (CUDA 11.8.0)
runs-on: 4-core-ubuntu-gpu-t4
env:
CUDA_ARCHS: "70-real;72-real;75-real;80;86-real"
steps:
- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
fetch-tags: true
- name: Build and Package (conda)
uses: ./.github/actions/build_conda
env:
ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }}
with:
label: main
cuvs: "ON"
cuda: "11.8.0"
linux-x86_64-GPU-packages-CUDA-12-1-1:
name: Linux x86_64 GPU packages (CUDA 12.1.1)
runs-on: 4-core-ubuntu-gpu-t4
env:
CUDA_ARCHS: "70-real;72-real;75-real;80;86-real"
steps:
- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
fetch-tags: true
- name: Build and Package (conda)
uses: ./.github/actions/build_conda
env:
ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }}
with:
label: main
cuda: "12.1.1"
linux-x86_64-GPU-CUVS-packages-CUDA12-4-0:
name: Linux x86_64 GPU w/ cuVS packages (CUDA 12.4.0)
runs-on: 4-core-ubuntu-gpu-t4
env:
CUDA_ARCHS: "70-real;72-real;75-real;80;86-real"
steps:
- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
fetch-tags: true
- name: Build and Package (conda)
uses: ./.github/actions/build_conda
env:
ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }}
with:
label: main
cuvs: "ON"
cuda: "12.4.0"
windows-x86_64-packages:
name: Windows x86_64 packages
runs-on: windows-2019
steps:
- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
fetch-tags: true
- name: Build and Package (conda)
uses: ./.github/actions/build_conda
env:
ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }}
with:
label: main
osx-arm64-packages:
name: OSX arm64 packages
runs-on: macos-14
steps:
- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
fetch-tags: true
- name: Build and Package (conda)
uses: ./.github/actions/build_conda
env:
ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }}
with:
label: main
linux-arm64-packages:
name: Linux arm64 packages
runs-on: 2-core-ubuntu-arm
steps:
- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
fetch-tags: true
- name: Build and Package (conda)
uses: ./.github/actions/build_conda
env:
ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }}
with:
label: main

View File

@@ -0,0 +1,17 @@
name: Build
on:
workflow_dispatch:
pull_request:
branches:
- main
push:
tags:
- 'v*'
jobs:
build-pull-request:
uses: ./.github/workflows/build-pull-request.yml
build-release:
uses: ./.github/workflows/build-release.yml
secrets:
ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }}
if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v')

View File

@@ -0,0 +1,148 @@
name: Nightly
on:
schedule:
- cron: '10 6 * * *'
env:
OMP_NUM_THREADS: '10'
MKL_THREADING_LAYER: GNU
jobs:
linux-x86_64-nightly:
name: Linux x86_64 nightlies
runs-on: 4-core-ubuntu
steps:
- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
fetch-tags: true
- uses: ./.github/actions/build_conda
env:
ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }}
with:
label: nightly
linux-x86_64-GPU-CUDA-11-4-4-nightly:
name: Linux x86_64 GPU nightlies (CUDA 11.4.4)
runs-on: 4-core-ubuntu-gpu-t4
env:
CUDA_ARCHS: "60-real;61-real;62-real;70-real;72-real;75-real;80;86-real"
FAISS_FLATTEN_CONDA_INCLUDES: "1"
steps:
- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
fetch-tags: true
- uses: ./.github/actions/build_conda
env:
ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }}
with:
label: nightly
cuda: "11.4.4"
linux-x86_64-GPU-CUVS-CUDA11-8-0-nightly:
name: Linux x86_64 GPU w/ cuVS nightlies (CUDA 11.8.0)
runs-on: 4-core-ubuntu-gpu-t4
env:
CUDA_ARCHS: "70-real;72-real;75-real;80;86-real"
steps:
- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
fetch-tags: true
- uses: ./.github/actions/build_conda
env:
ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }}
with:
label: nightly
cuvs: "ON"
cuda: "11.8.0"
linux-x86_64-GPU-CUDA-12-1-1-nightly:
name: Linux x86_64 GPU nightlies (CUDA 12.1.1)
runs-on: 4-core-ubuntu-gpu-t4
env:
CUDA_ARCHS: "70-real;72-real;75-real;80;86-real"
steps:
- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
fetch-tags: true
- uses: ./.github/actions/build_conda
env:
ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }}
with:
label: nightly
cuda: "12.1.1"
linux-x86_64-GPU-CUVS-CUDA12-4-0-nightly:
name: Linux x86_64 GPU w/ cuVS nightlies (CUDA 12.4.0)
runs-on: 4-core-ubuntu-gpu-t4
env:
CUDA_ARCHS: "70-real;72-real;75-real;80;86-real"
steps:
- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
fetch-tags: true
- uses: ./.github/actions/build_conda
env:
ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }}
with:
label: nightly
cuvs: "ON"
cuda: "12.4.0"
windows-x86_64-nightly:
name: Windows x86_64 nightlies
runs-on: windows-2019
steps:
- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
fetch-tags: true
- uses: ./.github/actions/build_conda
env:
ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }}
with:
label: nightly
osx-arm64-nightly:
name: OSX arm64 nightlies
runs-on: macos-14
steps:
- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
fetch-tags: true
- uses: ./.github/actions/build_conda
env:
ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }}
with:
label: nightly
linux-arm64-nightly:
name: Linux arm64 nightlies
runs-on: 2-core-ubuntu-arm
steps:
- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
fetch-tags: true
- uses: ./.github/actions/build_conda
env:
ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }}
with:
label: nightly
auto-retry:
name: Auto retry on failure
if: fromJSON(github.run_attempt) < 2
runs-on: ubuntu-latest
steps:
- name: Start rerun workflow
env:
GH_REPO: ${{ github.repository }}
GH_TOKEN: ${{ github.token }}
GH_DEBUG: api
run: |
gh workflow run retry_build.yml \
-F run_id=${{ github.run_id }}

View File

@@ -0,0 +1,44 @@
name: Publish Docs
on:
page_build:
branches:
- gh-pages
paths-ignore:
- 'docs/**'
workflow_run:
workflows: [update-doxygen]
types:
- completed
jobs:
build_and_publish:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- name: Set up Python
uses: actions/setup-python@v2
with:
python-version: 3.8
- name: Checkout gh-pages
run: |
git fetch origin gh-pages
git checkout gh-pages
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt
- name: Generate html
run: |
make html
git rm -rf docs
mv _build/html docs
touch docs/.nojekyll
- name: Push changes
run: |
git config --global user.email "$GITHUB_ACTOR@users.noreply.github.com"
git config --global user.name "$GITHUB_ACTOR"
git add docs
if [ -n "$(git status --porcelain)" ]
then
git commit docs -m "Sphinx rebuild ($(git rev-parse --short gh-pages))."
git push origin gh-pages
fi

View File

@@ -0,0 +1,33 @@
name: Retry Build
on:
workflow_dispatch:
inputs:
run_id:
required: true
jobs:
rerun-on-failure:
permissions: write-all
runs-on: ubuntu-latest
steps:
- name: rerun ${{ inputs.run_id }}
env:
GH_REPO: ${{ github.repository }}
GH_TOKEN: ${{ github.token }}
GH_DEBUG: api
run: |
# status can be one of "queued", "in_progress", "completed", "waiting", "requested", "pending"
# https://docs.github.com/en/rest/checks/runs
# while not completed, sleep for 10 minutes
while gh run view ${{ inputs.run_id }} --json status | grep -v completed
do
echo Workflow in progress - sleeping for 10 minutes then checking again
sleep 10m
done
# Only retry if there are failed jobs
if gh run view ${{ inputs.run_id }} --exit-status; then
echo Workflow succeeded - no retry necessary.
else
echo Workflow failed - initiating retry.
gh run rerun ${{ inputs.run_id }} --failed
fi

View File

@@ -0,0 +1,40 @@
name: Update Doxygen
on:
push:
branches:
- main
paths:
- 'faiss/**'
jobs:
doxygen:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- name: Set up Python
uses: actions/setup-python@v2
with:
python-version: 3.8
- name: Install dependencies
run: |
sudo apt-get install -y doxygen
python -m pip install --upgrade pip
pip install breathe
- name: Generate doxygen xml
run: doxygen
- name: Push changes
run: |
git config --global user.email "$GITHUB_ACTOR@users.noreply.github.com"
git config --global user.name "$GITHUB_ACTOR"
mkdir ./tmp
mv xml ./tmp/xml
git fetch origin gh-pages
git checkout gh-pages
git rm -rf xml cpp_api
mv ./tmp/xml ./xml
breathe-apidoc -o cpp_api xml
git add xml cpp_api
if [ -n "$(git status --porcelain)" ]
then
git commit -m "Update API docs ($(git rev-parse --short main))."
git push origin gh-pages
fi

View File

@@ -0,0 +1,26 @@
*.swp
*.swo
*.o
*.a
*.dSYM
*.so
*.dylib
*.pyc
*~
/build/
/config.*
/aclocal.m4
/autom4te.cache/
/makefile.inc
/bin/
/c_api/bin/
/c_api/gpu/bin/
/tests/test
/tests/gtest/
faiss/python/swigfaiss_avx2.swig
faiss/python/swigfaiss_avx512.swig
faiss/python/swigfaiss_avx512_spr.swig
faiss/python/swigfaiss_sve.swig
.cache/
compile_commands.json
sift/

View File

@@ -0,0 +1,19 @@
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "Build Demo",
"type": "lldb",
"request": "launch",
"program": "${workspaceFolder}/../.venv/bin/python",
"console": "integratedTerminal",
"cwd": "${workspaceFolder}",
"args": [
"${workspaceFolder}/demo/build_demo.py"
],
},
]
}

View File

@@ -0,0 +1,482 @@
# Changelog
All notable changes to this project will be documented in this file.
## [Unreleased]
## [1.10.0] - 2025-01-30
### Added
- Add desc_name to dataset descriptor (#3935)
- implement ST_norm_from_LUT for the ResidualQuantizer (#3917)
- Add example of how to build, link, and test an external SWIG module (#3922)
- add copyright header (#3948)
- Add some SVE implementations (#3933)
- Enable linting: lint config changes plus arc lint command (#3966)
- Re-add example of how to build, link, and test an external SWIG module (#3981)
- demo: IndexPQ: separate codes from codebook (#3987)
- add all wrapped indexes to the index_read (#3988)
- add validity check AlignedTableTightAlloc clear method (#3997)
- Add index binary to telemetry (#4001)
- Add VectorTransform read from filename to the C API (#3970)
- Added IndexLSH to the demo (#4009)
- write distributed_kmeans centroids and assignments to hive tables (#4017)
- introduce data splits in dataset descriptor (#4012)
- Faiss GPU: bfloat16 brute-force kNN support (#4018)
- ROCm support for bfloat16 (#4039)
- Unit tests for distances_simd.cpp (#4058)
- add cuda-toolkit for GPU (#4057)
- Add more unit testing for IndexHNSW [1/n] (#4054)
- Add more unit testing for IndexHNSW [2/n] (#4056)
- Add more unit testing for HNSW [3/n] (#4059)
- Add more unit testing for HNSW [4/n] (#4061)
- Add more unit tests for index_read and index_write (#4068)
- Add testing for utils/hamming.cpp (#4079)
- Test sa_decode method on IndexIVFFlat (#4098)
- Conditionally compile extras like benchmarks and demos (#4094)
- Add a new architecture mode: 'avx512_spr'. (#4025)
- Use _mm512_popcnt_epi64 to speedup hamming distance evaluation. (#4020)
- PQ with pytorch (#4116)
- add range_search() to IndexRefine (#4022)
- Expose accumulate_to_mem from faiss interface (#4099)
- Windows Arm64 support (#4087)
- add test to cover GPU (#4130)
- Added support for building without MKL (#4147)
### Changed
- Move train, build and search to their respective operators (#3934)
- PQFS into Index trainer (#3941)
- Place a useful cmake function 'link_to_faiss_lib' into a separate file (#3939)
- Cache device major version value to avoid multiple calls of getCudaDeviceProperties (#3950)
- Consolidate set_target_properties() calls in faiss/CMakeLists.txt (#3973)
- Removing Manual Hipify Build Step (#3962)
- Allow to replace graph structure for NSG graphs (#3975)
- Adjust nightly build (#3978)
- Update RAFT CI with pytorch 2.4.1 (#3980)
- Moved add_sa_codes, sa_code_size to Index, IndexBinary base classes (#3989)
- Update autoclose.yml (#4000)
- Migrate from RAFT to CUVS (#3549)
- Pin to numpy<2 (#4033)
- (1/n) - Preload datasets in manifold so that subsequent stages of training, indexing and search can use those instead of each trainer or indexer downloading data. (#4034)
- Constrain conda version for Windows build (#4040)
- Updates to faiss-gpu-cuvs nightly pkg (#4032)
- pin the dependencies version for x86_64 (#4046)
- pin arm64 dependency (#4060)
- Pin conda build (#4062)
- Improve naming due to codemod (#4063)
- Improve naming due to codemod (#4064)
- Improve naming due to codemod (#4065)
- separate the github build into two conditions (#4066)
- Improve naming due to codemod (#4070)
- improve naming due to codemod (#4067)
- improve naming due to codemod (#4071)
- improve naming due to codemod (#4072)
- fix nightly build (#4080)
- Change github action workflows name (#4083)
- Resolve Packaging Issues (#4044)
- Update __init__.py (#4086)
- Exhaustive IVF probing in scalar quantizer tests (#4075)
- Pin Nightlies with testing on PR (#4088)
- Update benchmarking library code to work for IdMap index as well (#4093)
- Update action.yml (#4100)
- Upgrade CUVS to 24.12 (#4021)
- Link cuVS Docs (#4084)
- Set KnnDescriptor.desc_name in the Benchmarking core framework in FAISS like other descriptors (#4109)
- enable quiet mode for conda install (#4112)
- Disable retry build (#4124)
- Add ngpu default argument to knn_ground_truth (#4123)
- Update code comment to reflect the range of IF from [1, k] (#4139)
- Reenable auto retry workflow (#4140)
- Migration off defaults to conda-forge channel (#4126)
- Benchmarking Scripts for cuVS Index, more docs updates (#4117)
### Fixed
- Fix total_rows (#3942)
- Fix INSTALL.md due to failure of conflict resolving (#3915)
- Back out "Add example of how to build, link, and test an external SWIG module" (#3954)
- Fix shadowed variable in faiss/IndexPQ.cpp (#3959)
- Fix shadowed variable in faiss/IndexIVFAdditiveQuantizer.cpp (#3958)
- Fix shadowed variable in faiss/impl/HNSW.cpp (#3961)
- Fix shadowed variable in faiss/impl/simd_result_handlers.h (#3960)
- Fix shadowed variable in faiss/utils/NeuralNet.cpp (#3952)
- Resolve "incorrect-portions-license" errors: add no license lint to top of GPU files with both licenses (#3965)
- Resolve "duplicate-license-header": Find and replace duplicate license headers (#3967)
- fix some more nvidia licenses that get erased (#3977)
- fix merge_flat_ondisk stress run failures (#3999)
- Fix reverse_index_factory formatting of ScalarQuantizers (#4003)
- Fix shadowed variable in faiss/IndexAdditiveQuantizer.cpp (#4011)
- facebook-unused-include-check in fbcode/faiss (#4029)
- fix linter (#4035)
- Some chore fixes (#4010)
- Fix unused variable compilation error (#4041)
- stop dealloc of coarse quantizer when it is deleted (#4045)
- Fix SCD Table test flakiness (#4069)
- Fix IndexIVFFastScan reconstruct_from_offset method (#4095)
- more fast-scan reconstruction (#4128)
- Fix nightly cuVS 11.8.0 failure (#4149)
- Correct capitalization of FAISS to Faiss (#4155)
- Fix cuVS 12.4.0 nightly failure (#4153)
### Deprecated
- Remove unused-variable in dumbo/backup/dumbo/service/tests/ChainReplicatorTests.cpp (#4024)
- remove inconsistent oom exception test (#4052)
- Remove unused(and wrong) io macro (#4122)
## [1.9.0] - 2024-10-04
### Added
- Add AVX-512 implementation for the distance and scalar quantizer functions. (#3853)
- Allow k and M suffixes in IVF indexes (#3812)
- add reconstruct support to additive quantizers (#3752)
- introduce options for reducing the overhead for a clustering procedure (#3731)
- Add hnsw search params for bounded queue option (#3748)
- ROCm support (#3462)
- Add sve targets (#2886)
- add get_version() for c_api (#3688)
- QINCo implementation in CPU Faiss (#3608)
- Add search functionality to FlatCodes (#3611)
- add dispatcher for VectorDistance and ResultHandlers (#3627)
- Add SQ8bit signed quantization (#3501)
- Add ABS_INNER_PRODUCT metric (#3524)
- Interop between CAGRA and HNSW (#3252)
- add skip_storage flag to HNSW (#3487)
- QT_bf16 for scalar quantizer for bfloat16 (#3444)
- Implement METRIC.NaNEuclidean (#3414)
- TimeoutCallback C++ and Python (#3417)
- support big-endian machines (#3361)
- Support for Remove ids from IVFPQFastScan index (#3354)
- Implement reconstruct_n for GPU IVFFlat indexes (#3338)
- Support of skip_ids in merge_from_multiple function of OnDiskInvertedLists (#3327)
- Add the ability to clone and read binary indexes to the C API. (#3318)
- AVX512 for PQFastScan (#3276)
### Changed
- faster hnsw CPU index training (#3822)
- Some small improvements. (#3692)
- First attempt at LSH matching with nbits (#3679)
- Set verbose before train (#3619)
- Remove duplicate NegativeDistanceComputer instances (#3450)
- interrupt for NNDescent (#3432)
- Get rid of redundant instructions in ScalarQuantizer (#3430)
- PowerPC, improve code generation for function fvec_L2sqr (#3416)
- Unroll loop in lookup_2_lanes (#3364)
- Improve filtering & search parameters propagation (#3304)
- Change index_cpu_to_gpu to throw for indices not implemented on GPU (#3336)
- Throw when attempting to move IndexPQ to GPU (#3328)
- Skip HNSWPQ sdc init with new io flag (#3250)
### Fixed
- Fix a bug for a non-simdlib code of ResidualQuantizer (#3868)
- assign_index should default to null (#3855)
- Fix an incorrectly counted number of computed distances for HNSW (#3840)
- Add error for overflowing nbits during PQ construction (#3833)
- Fix radius search with HNSW and IP (#3698)
- fix algorithm of spreading vectors over shards (#3374)
- Fix IndexBinary.assign Python method (#3384)
- Few fixes in bench_fw to enable IndexFromCodec (#3383)
- Fix the endianness issue in AIX while running the benchmark. (#3345)
- Fix faiss swig build with version > 4.2.x (#3315)
- Fix problems when using 64-bit integers. (#3322)
- Fix IVFPQFastScan decode function (#3312)
- Handling FaissException in few destructors of ResultHandler.h (#3311)
- Fix HNSW stats (#3309)
- AIX compilation fix for io classes (#3275)
## [1.8.0] - 2024-02-27
### Added
- Added a new conda package faiss-gpu-raft alongside faiss-cpu and faiss-gpu
- Integrated IVF-Flat and IVF-PQ implementations in faiss-gpu-raft from RAFT by Nvidia [thanks Corey Nolet and Tarang Jain]
- Added a context parameter to InvertedLists and InvertedListsIterator
- Added Faiss on Rocksdb demo showing how inverted lists can be persisted in a key-value store
- Introduced Offline IVF framework powered by Faiss big batch search
- Added SIMD NEON Optimization for QT_FP16 in Scalar Quantizer. [thanks Naveen Tatikonda]
- Generalized ResultHandler and supported range search for HNSW and FastScan
- Introduced avx512 optimization mode and FAISS_OPT_LEVEL env variable [thanks Alexandr Ghuzva]
- Added search parameters for IndexRefine::search() and IndexRefineFlat::search()
- Supported large two-level clustering
- Added support for Python 3.11 and 3.12
- Added support for CUDA 12
### Changed
- Used the benchmark to find Pareto optimal indices. Intentionally limited to IVF(Flat|HNSW),PQ|SQ indices
- Split off RQ encoding steps to another file
- Supported better NaN handling
- HNSW speedup + Distance 4 points [thanks Alexandr Ghuzva]
### Fixed
- Fixed DeviceVector reallocations in Faiss GPU
- Used efSearch from params if provided in HNSW search
- Fixed warp synchronous behavior in Faiss GPU CUDA 12
## [1.7.4] - 2023-04-12
### Added
- Added big batch IVF search for conducting efficient search with big batches of queries
- Checkpointing in big batch search support
- Precomputed centroids support
- Support for iterable inverted lists for eg. key value stores
- 64-bit indexing arithmetic support in FAISS GPU
- IndexIVFShards now handle IVF indexes with a common quantizer
- Jaccard distance support
- CodePacker for non-contiguous code layouts
- Approximate evaluation of top-k distances for ResidualQuantizer and IndexBinaryFlat
- Added support for 12-bit PQ / IVFPQ fine quantizer decoders for standalone vector codecs (faiss/cppcontrib)
- Conda packages for osx-arm64 (Apple M1) and linux-aarch64 (ARM64) architectures
- Support for Python 3.10
### Removed
- CUDA 10 is no longer supported in precompiled packages
- Removed Python 3.7 support for precompiled packages
- Removed constraint for using fine quantizer with no greater than 8 bits for IVFPQ; for example, it is now possible to use IVF256,PQ10x12 for a CPU index
### Changed
- Various performance optimizations for PQ / IVFPQ for AVX2 and ARM for training (fused distance+nearest kernel), search (faster kernels for distance_to_code() and scan_list_*()) and vector encoding
- An order of magnitude faster CPU code for LSQ/PLSQ training and vector encoding (reworked code)
- Performance improvements for Hamming Code computations for AVX2 and ARM (reworked code)
- Improved auto-vectorization support for IP and L2 distance computations (better handling of pragmas)
- Improved ResidualQuantizer vector encoding (pooling memory allocations, avoid r/w to a temporary buffer)
### Fixed
- HNSW bug fixed, which improves the recall rate! Special thanks to zh Wang @hhy3 for this.
- Faiss GPU IVF large query batch fix
- Faiss + Torch fixes, re-enable k = 2048
- Fix the number of distance computations to match max_codes parameter
- Fix decoding of large fast_scan blocks
## [1.7.3] - 2022-11-03
### Added
- Added sparse k-means routines and moved the generic kmeans to contrib
- Added FlatDistanceComputer for all FlatCodes indexes
- Support for fast accumulation of 4-bit LSQ and RQ
- Added product additive quantization
- Support per-query search parameters for many indexes + filtering by ids
- write_VectorTransform and read_vectorTransform were added to the public API (by @AbdelrahmanElmeniawy)
- Support for IDMap2 in index_factory by adding "IDMap2" to prefix or suffix of the input String (by @AbdelrahmanElmeniawy)
- Support for merging all IndexFlatCodes descendants (by @AbdelrahmanElmeniawy)
- Remove and merge features for IndexFastScan (by @AbdelrahmanElmeniawy)
- Performance improvements: 1) specialized the AVX2 pieces of code speeding up certain hotspots, 2) specialized kernels for vector codecs (this can be found in faiss/cppcontrib)
### Fixed
- Fixed memory leak in OnDiskInvertedLists::do_mmap when the file is not closed (by @AbdelrahmanElmeniawy)
- LSH correctly throws error for metric types other than METRIC_L2 (by @AbdelrahmanElmeniawy)
## [1.7.2] - 2021-12-15
### Added
- Support LSQ on GPU (by @KinglittleQ)
- Support for exact 1D kmeans (by @KinglittleQ)
## [1.7.1] - 2021-05-27
### Added
- Support for building C bindings through the `FAISS_ENABLE_C_API` CMake option.
- Serializing the indexes with the python pickle module
- Support for the NNDescent k-NN graph building method (by @KinglittleQ)
- Support for the NSG graph indexing method (by @KinglittleQ)
- Residual quantizers: support as codec and unoptimized search
- Support for 4-bit PQ implementation for ARM (by @vorj, @n-miyamoto-fixstars, @LWisteria, and @matsui528)
- Implementation of Local Search Quantization (by @KinglittleQ)
### Changed
- The order of xb and xq was different between `faiss.knn` and `faiss.knn_gpu`.
Also the metric argument was called distance_type.
- The typed vectors (LongVector, LongLongVector, etc.) of the SWIG interface have
been deprecated. They have been replaced with Int32Vector, Int64Vector, etc. (by h-vetinari)
### Fixed
- Fixed a bug causing kNN search functions for IndexBinaryHash and
IndexBinaryMultiHash to return results in a random order.
- Copy constructor of AlignedTable had a bug leading to crashes when cloning
IVFPQ indices.
## [1.7.0] - 2021-01-27
## [1.6.5] - 2020-11-22
## [1.6.4] - 2020-10-12
### Added
- Arbitrary dimensions per sub-quantizer now allowed for `GpuIndexIVFPQ`.
- Brute-force kNN on GPU (`bfKnn`) now accepts `int32` indices.
- Nightly conda builds now available (for CPU).
- Faiss is now supported on Windows.
## [1.6.3] - 2020-03-24
### Added
- Support alternative distances on GPU for GpuIndexFlat, including L1, Linf and
Lp metrics.
- Support METRIC_INNER_PRODUCT for GpuIndexIVFPQ.
- Support float16 coarse quantizer for GpuIndexIVFFlat and GpuIndexIVFPQ. GPU
Tensor Core operations (mixed-precision arithmetic) are enabled on supported
hardware when operating with float16 data.
- Support k-means clustering with encoded vectors. This makes it possible to
train on larger datasets without decompressing them in RAM, and is especially
useful for binary datasets (see https://github.com/facebookresearch/faiss/blob/main/tests/test_build_blocks.py#L92).
- Support weighted k-means. Weights can be associated to each training point
(see https://github.com/facebookresearch/faiss/blob/main/tests/test_build_blocks.py).
- Serialize callback in python, to write to pipes or sockets (see
https://github.com/facebookresearch/faiss/wiki/Index-IO,-cloning-and-hyper-parameter-tuning).
- Reconstruct arbitrary ids from IndexIVF + efficient remove of a small number
of ids. This avoids 2 inefficiencies: O(ntotal) removal of vectors and
IndexIDMap2 on top of indexIVF. Documentation here:
https://github.com/facebookresearch/faiss/wiki/Special-operations-on-indexes.
- Support inner product as a metric in IndexHNSW (see
https://github.com/facebookresearch/faiss/blob/main/tests/test_index.py#L490).
- Support PQ of sizes other than 8 bit in IndexIVFPQ.
- Demo on how to perform searches sequentially on an IVF index. This is useful
for an OnDisk index with a very large batch of queries. In that case, it is
worthwhile to scan the index sequentially (see
https://github.com/facebookresearch/faiss/blob/main/tests/test_ivflib.py#L62).
- Range search support for most binary indexes.
- Support for hashing-based binary indexes (see
https://github.com/facebookresearch/faiss/wiki/Binary-indexes).
### Changed
- Replaced obj table in Clustering object: now it is a ClusteringIterationStats
structure that contains additional statistics.
### Removed
- Removed support for useFloat16Accumulator for accumulators on GPU (all
accumulations are now done in float32, regardless of whether float16 or float32
input data is used).
### Fixed
- Some python3 fixes in benchmarks.
- Fixed GpuCloner (some fields were not copied, default to no precomputed tables
with IndexIVFPQ).
- Fixed support for new pytorch versions.
- Serialization bug with alternative distances.
- Removed test on multiple-of-4 dimensions when switching between blas and AVX
implementations.
## [1.6.2] - 2020-03-10
## [1.6.1] - 2019-12-04
## [1.6.0] - 2019-09-24
### Added
- Faiss as a codec: We introduce a new API within Faiss to encode fixed-size
vectors into fixed-size codes. The encoding is lossy and the tradeoff between
compression and reconstruction accuracy can be adjusted (see the sketch after this list).
- ScalarQuantizer support for GPU, see gpu/GpuIndexIVFScalarQuantizer.h. This is
particularly useful as GPU memory is often less abundant than CPU.
- Added easy-to-use serialization functions for indexes to byte arrays in Python
(faiss.serialize_index, faiss.deserialize_index).
- The Python KMeans object can be used to use the GPU directly, just add
gpu=True to the constructor; see gpu/test/test_gpu_index.py, test TestGPUKmeans.
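A minimal sketch of the codec API from the first entry above, assuming `faiss` and `numpy` are installed (the `PQ8` factory string and all sizes are illustrative, not the only choices):
``` python
import faiss
import numpy as np

d = 64
xt = np.random.rand(10000, d).astype("float32")  # training vectors
x = np.random.rand(100, d).astype("float32")     # vectors to encode

codec = faiss.index_factory(d, "PQ8")  # 8 bytes per vector after encoding
codec.train(xt)

codes = codec.sa_encode(x)      # lossy fixed-size codes, shape (100, 8)
x_rec = codec.sa_decode(codes)  # approximate reconstruction of x
```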
### Changed
- Change in the code layout: many C++ sources are now in subdirectories impl/
and utils/.
## [1.5.3] - 2019-06-24
### Added
- Basic support for 6 new metrics in CPU IndexFlat and IndexHNSW (https://github.com/facebookresearch/faiss/issues/848).
- Support for IndexIDMap/IndexIDMap2 with binary indexes (https://github.com/facebookresearch/faiss/issues/780).
### Changed
- Throw python exception for OOM (https://github.com/facebookresearch/faiss/issues/758).
- Make DistanceComputer available for all random access indexes.
- Gradually moving from long to uint64_t for portability.
### Fixed
- Slow scanning of inverted lists (https://github.com/facebookresearch/faiss/issues/836).
## [1.5.2] - 2019-05-28
### Added
- Support for searching several inverted lists in parallel (parallel_mode != 0).
- Better support for PQ codes where nbit != 8 or 16.
- IVFSpectralHash implementation: spectral hash codes inside an IVF.
- 6-bit per component scalar quantizer (4 and 8 bit were already supported).
- Combinations of inverted lists: HStackInvertedLists and VStackInvertedLists.
- Configurable number of threads for OnDiskInvertedLists prefetching (including
0=no prefetch).
- More test and demo code compatible with Python 3 (print with parentheses).
### Changed
- License was changed from BSD+Patents to MIT.
- Exceptions raised in sub-indexes of IndexShards and IndexReplicas are now
propagated.
- Refactored benchmark code: data loading is now in a single file.
## [1.5.1] - 2019-04-05
### Added
- MatrixStats object, which reports useful statistics about a dataset.
- Option to round coordinates during k-means optimization.
- An alternative option for search in HNSW.
- Support for range search in IVFScalarQuantizer.
- Support for direct uint_8 codec in ScalarQuantizer.
- Better support for PQ code assignment with external index.
- Support for IMI2x16 (4B virtual centroids).
- Support for k = 2048 search on GPU (instead of 1024).
- Support for renaming an ondisk invertedlists.
- Support for interrupting computations with interrupt signal (ctrl-C) in python.
- Simplified build system (with --with-cuda/--with-cuda-arch options).
### Changed
- Moved stats() and imbalance_factor() from IndexIVF to InvertedLists object.
- Renamed IndexProxy to IndexReplicas.
- Most CUDA mem alloc failures now throw exceptions instead of terminating on an
assertion.
- Updated example Dockerfile.
- Conda packages now depend on the cudatoolkit packages, which fixes some
interferences with pytorch. Consequentially, faiss-gpu should now be installed
by conda install -c pytorch faiss-gpu cudatoolkit=10.0.
## [1.5.0] - 2018-12-19
### Added
- New GpuIndexBinaryFlat index.
- New IndexBinaryHNSW index.
## [1.4.0] - 2018-08-30
### Added
- Automatic tracking of C++ references in Python.
- Support for non-intel platforms, some functions optimized for ARM.
- Support for overriding nprobe for concurrent searches.
- Support for floating-point quantizers in binary indices.
### Fixed
- No more segfaults due to Python's GC.
- GpuIndexIVFFlat issues for float32 with 64 / 128 dims.
- Sharding of flat indexes on GPU with index_cpu_to_gpu_multiple.
## [1.3.0] - 2018-07-10
### Added
- Support for binary indexes (IndexBinaryFlat, IndexBinaryIVF).
- Support fp16 encoding in scalar quantizer.
- Support for deduplication in IndexIVFFlat.
- Support for index serialization.
### Fixed
- MMAP bug for normal indices.
- Propagation of io_flags in read func.
- k-selection for CUDA 9.
- Race condition in OnDiskInvertedLists.
## [1.2.1] - 2018-02-28
### Added
- Support for on-disk storage of IndexIVF data.
- C bindings.
- Extended tutorial to GPU indices.
[Unreleased]: https://github.com/facebookresearch/faiss/compare/v1.10.0...HEAD
[1.10.0]: https://github.com/facebookresearch/faiss/compare/v1.9.0...v1.10.0
[1.9.0]: https://github.com/facebookresearch/faiss/compare/v1.8.0...v1.9.0
[1.8.0]: https://github.com/facebookresearch/faiss/compare/v1.7.4...v1.8.0
[1.7.4]: https://github.com/facebookresearch/faiss/compare/v1.7.3...v1.7.4
[1.7.3]: https://github.com/facebookresearch/faiss/compare/v1.7.2...v1.7.3
[1.7.2]: https://github.com/facebookresearch/faiss/compare/v1.7.1...v1.7.2
[1.7.1]: https://github.com/facebookresearch/faiss/compare/v1.7.0...v1.7.1
[1.7.0]: https://github.com/facebookresearch/faiss/compare/v1.6.5...v1.7.0
[1.6.5]: https://github.com/facebookresearch/faiss/compare/v1.6.4...v1.6.5
[1.6.4]: https://github.com/facebookresearch/faiss/compare/v1.6.3...v1.6.4
[1.6.3]: https://github.com/facebookresearch/faiss/compare/v1.6.2...v1.6.3
[1.6.2]: https://github.com/facebookresearch/faiss/compare/v1.6.1...v1.6.2
[1.6.1]: https://github.com/facebookresearch/faiss/compare/v1.6.0...v1.6.1
[1.6.0]: https://github.com/facebookresearch/faiss/compare/v1.5.3...v1.6.0
[1.5.3]: https://github.com/facebookresearch/faiss/compare/v1.5.2...v1.5.3
[1.5.2]: https://github.com/facebookresearch/faiss/compare/v1.5.1...v1.5.2
[1.5.1]: https://github.com/facebookresearch/faiss/compare/v1.5.0...v1.5.1
[1.5.0]: https://github.com/facebookresearch/faiss/compare/v1.4.0...v1.5.0
[1.4.0]: https://github.com/facebookresearch/faiss/compare/v1.3.0...v1.4.0
[1.3.0]: https://github.com/facebookresearch/faiss/compare/v1.2.1...v1.3.0
[1.2.1]: https://github.com/facebookresearch/faiss/releases/tag/v1.2.1

View File

@@ -0,0 +1,126 @@
# @lint-ignore-every LICENSELINT
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
# =============================================================================
# Copyright (c) 2023-2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
# in compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under the License
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
# or implied. See the License for the specific language governing permissions and limitations under
# the License.
# =============================================================================
cmake_minimum_required(VERSION 3.24.0 FATAL_ERROR)
set(FAISS_LANGUAGES CXX)
if(FAISS_ENABLE_GPU)
if (FAISS_ENABLE_ROCM)
list(APPEND FAISS_LANGUAGES HIP)
list(PREPEND CMAKE_MODULE_PATH "/opt/rocm/lib/cmake")
list(PREPEND CMAKE_PREFIX_PATH "/opt/rocm")
else()
list(APPEND FAISS_LANGUAGES CUDA)
endif()
endif()
if(FAISS_ENABLE_CUVS)
include(cmake/thirdparty/fetch_rapids.cmake)
include(rapids-cmake)
include(rapids-cpm)
include(rapids-cuda)
include(rapids-export)
include(rapids-find)
rapids_cuda_init_architectures(faiss)
rapids_cuda_init_architectures(pyfaiss)
rapids_cuda_init_architectures(faiss_c_library)
endif()
project(faiss
VERSION 1.10.0
DESCRIPTION "A library for efficient similarity search and clustering of dense vectors."
HOMEPAGE_URL "https://github.com/facebookresearch/faiss"
LANGUAGES ${FAISS_LANGUAGES})
include(GNUInstallDirs)
set(CMAKE_CXX_STANDARD 17)
list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")
# Valid values are "generic", "avx2", "avx512", "avx512_spr", "sve".
option(FAISS_OPT_LEVEL "" "generic")
option(FAISS_ENABLE_GPU "Enable support for GPU indexes." ON)
option(FAISS_ENABLE_CUVS "Enable cuVS for GPU indexes." OFF)
option(FAISS_ENABLE_ROCM "Enable ROCm for GPU indexes." OFF)
option(FAISS_ENABLE_MKL "Enable MKL." ON)
option(FAISS_ENABLE_PYTHON "Build Python extension." ON)
option(FAISS_ENABLE_C_API "Build C API." OFF)
option(FAISS_ENABLE_EXTRAS "Build extras like benchmarks and demos" ON)
option(FAISS_USE_LTO "Enable Link-Time optimization" OFF)
if(FAISS_ENABLE_GPU)
if(FAISS_ENABLE_ROCM)
enable_language(HIP)
add_definitions(-DUSE_AMD_ROCM)
find_package(HIP REQUIRED)
find_package(hipBLAS REQUIRED)
set(GPU_EXT_PREFIX "hip")
execute_process(COMMAND ${PROJECT_SOURCE_DIR}/faiss/gpu/hipify.sh)
else ()
set(CMAKE_CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER})
enable_language(CUDA)
set(GPU_EXT_PREFIX "cu")
endif()
endif()
if(FAISS_ENABLE_CUVS AND NOT TARGET cuvs::cuvs)
find_package(cuvs)
endif()
add_subdirectory(faiss)
if(FAISS_ENABLE_GPU)
if(FAISS_ENABLE_ROCM)
add_subdirectory(faiss/gpu-rocm)
else()
add_subdirectory(faiss/gpu)
endif()
endif()
if(FAISS_ENABLE_PYTHON)
add_subdirectory(faiss/python)
endif()
if(FAISS_ENABLE_C_API)
add_subdirectory(c_api)
endif()
if(FAISS_ENABLE_EXTRAS)
add_subdirectory(demos)
add_subdirectory(benchs)
add_subdirectory(tutorial/cpp)
endif()
# CTest must be included in the top level to enable `make test` target.
include(CTest)
if(BUILD_TESTING)
add_subdirectory(tests)
add_subdirectory(perf_tests)
if(FAISS_ENABLE_GPU)
if(FAISS_ENABLE_ROCM)
add_subdirectory(faiss/gpu-rocm/test)
else()
add_subdirectory(faiss/gpu/test)
endif()
endif()
endif()

View File

@@ -0,0 +1,2 @@
# Code of Conduct
Facebook has adopted a Code of Conduct that we expect project participants to adhere to. Please [read the full text](https://code.fb.com/codeofconduct) so that you can understand what actions will and will not be tolerated.

View File

@@ -0,0 +1,52 @@
# Contributing to Faiss
We want to make contributing to this project as easy and transparent as
possible.
## Our Development Process
We mainly develop Faiss within Facebook. Sometimes, we will sync the
github version of Faiss with the internal state.
## Pull Requests
We welcome pull requests that add significant value to Faiss. If you plan to do
a major development and contribute it back to Faiss, please contact us first before
putting too much effort into it.
1. Fork the repo and create your branch from `main`.
2. If you've added code that should be tested, add tests.
3. If you've changed APIs, update the documentation.
4. Ensure the test suite passes.
5. Make sure your code lints.
6. If you haven't already, complete the Contributor License Agreement ("CLA").
There is a Facebook internal test suite for Faiss, and we need to run
all changes to Faiss through it.
## Contributor License Agreement ("CLA")
In order to accept your pull request, we need you to submit a CLA. You only need
to do this once to work on any of Facebook's open source projects.
Complete your CLA here: <https://code.facebook.com/cla>
## Issues
We use GitHub issues to track public bugs. Please ensure your description is
clear and has sufficient instructions to be able to reproduce the issue.
Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe
disclosure of security bugs. In those cases, please go through the process
outlined on that page and do not file a public issue.
## Coding Style
* 4 spaces for indentation in C++ (no tabs)
* 80 character line length (both for C++ and Python)
* C++ language level: C++17
## License
By contributing to Faiss, you agree that your contributions will be licensed
under the LICENSE file in the root directory of this source tree.

View File

File diff suppressed because it is too large

View File

@@ -0,0 +1,325 @@
# Installing Faiss via conda
The supported way to install Faiss is through [conda](https://docs.conda.io).
Stable releases are pushed regularly to the pytorch conda channel, as well as
pre-release nightly builds.
- The CPU-only faiss-cpu conda package is currently available on Linux (x86-64 and aarch64), OSX (arm64 only), and Windows (x86-64)
- faiss-gpu, containing both CPU and GPU indices, is available on Linux (x86-64 only) for CUDA 11.4 and 12.1
- faiss-gpu-cuvs [^1], a package containing GPU indices provided by [NVIDIA cuVS](https://github.com/rapidsai/cuvs/) version 24.12, is available on Linux (x86-64 only) for CUDA 11.8 and 12.4.
To install the latest stable release:
``` shell
# CPU-only version
$ conda install -c pytorch faiss-cpu=1.10.0
# GPU(+CPU) version
$ conda install -c pytorch -c nvidia faiss-gpu=1.10.0
# GPU(+CPU) version with NVIDIA cuVS
$ conda install -c pytorch -c nvidia -c rapidsai -c conda-forge libnvjitlink faiss-gpu-cuvs=1.10.0
# GPU(+CPU) version using AMD ROCm not yet available
```
For faiss-gpu, the nvidia channel is required for CUDA, which is not published in the main anaconda channel.
For faiss-gpu-cuvs, the rapidsai, conda-forge and nvidia channels are required.
Nightly pre-release packages can be installed as follows:
``` shell
# CPU-only version
$ conda install -c pytorch/label/nightly faiss-cpu
# GPU(+CPU) version
$ conda install -c pytorch/label/nightly -c nvidia faiss-gpu=1.10.0
# GPU(+CPU) version with NVIDIA cuVS (package built with CUDA 12.4)
$ conda install -c pytorch -c rapidsai -c conda-forge -c nvidia pytorch/label/nightly::faiss-gpu-cuvs 'cuda-version>=12.0,<=12.5'
# GPU(+CPU) version with NVIDIA cuVS (package built with CUDA 11.8)
$ conda install -c pytorch -c rapidsai -c conda-forge -c nvidia pytorch/label/nightly::faiss-gpu-cuvs 'cuda-version>=11.4,<=11.8'
# GPU(+CPU) version using AMD ROCm not yet available
```
In the above commands, pytorch-cuda=11 or pytorch-cuda=12 would select a specific CUDA version, if it's required.
A combination of versions that installs GPU Faiss with CUDA and Pytorch (as of 2024-05-15):
```
conda create --name faiss_1.8.0
conda activate faiss_1.8.0
conda install -c pytorch -c nvidia faiss-gpu=1.8.0 pytorch=*=*cuda* pytorch-cuda=11 numpy
```
## Installing from conda-forge
Faiss is also being packaged by [conda-forge](https://conda-forge.org/), the
community-driven packaging ecosystem for conda. The packaging effort is
collaborating with the Faiss team to ensure high-quality package builds.
Due to the comprehensive infrastructure of conda-forge, it may even happen that
certain build combinations are supported in conda-forge that are not available
through the pytorch channel. To install, use
``` shell
# CPU version
$ conda install -c conda-forge faiss-cpu
# GPU version
$ conda install -c conda-forge faiss-gpu
# NVIDIA cuVS and AMD ROCm version not yet available
```
You can tell which channel your conda packages come from by using `conda list`.
If you are having problems using a package built by conda-forge, please raise
an [issue](https://github.com/conda-forge/faiss-split-feedstock/issues) on the
conda-forge package "feedstock".
# Building from source
Faiss can be built from source using CMake.
Faiss is supported on x86-64 machines on Linux, OSX, and Windows. It has been
found to run on other platforms as well, see
[other platforms](https://github.com/facebookresearch/faiss/wiki/Related-projects#bindings-to-other-languages-and-porting-to-other-platforms).
The basic requirements are:
- a C++17 compiler (with OpenMP support, version 2 or higher),
- a BLAS implementation (on Intel machines we strongly recommend using Intel MKL for best
performance).
The optional requirements are:
- for GPU indices:
- nvcc,
- the CUDA toolkit,
- for AMD GPUs:
- AMD ROCm,
- for using NVIDIA cuVS implementations:
- libcuvs=24.12
- for the python bindings:
- python 3,
- numpy,
- and swig.
Indications for specific configurations are available in the [troubleshooting
section of the wiki](https://github.com/facebookresearch/faiss/wiki/Troubleshooting).
### Building with NVIDIA cuVS
[cuVS](https://docs.rapids.ai/api/cuvs/nightly/) contains state-of-the-art implementations of several algorithms for running approximate nearest neighbors and clustering on the GPU. It is built on top of the [RAPIDS RAFT](https://github.com/rapidsai/raft) library of high performance machine learning primitives. Building Faiss with cuVS enabled allows a user to choose between regular GPU implementations in Faiss and cuVS implementations for specific algorithms.
The libcuvs dependency should be installed via conda:
1. With CUDA 12.0 - 12.5:
```
conda install -c rapidsai -c conda-forge -c nvidia libcuvs=24.12 'cuda-version>=12.0,<=12.5'
```
2. With CUDA 11.4 - 11.8
```
conda install -c rapidsai -c conda-forge -c nvidia libcuvs=24.12 'cuda-version>=11.4,<=11.8'
```
For more ways to install cuVS 24.12, refer to the [RAPIDS Installation Guide](https://docs.rapids.ai/install).
## Step 1: invoking CMake
``` shell
$ cmake -B build .
```
This generates the system-dependent configuration/build files in the `build/`
subdirectory.
Several options can be passed to CMake, among which:
- general options:
- `-DFAISS_ENABLE_GPU=OFF` in order to disable building GPU indices (possible
values are `ON` and `OFF`),
- `-DFAISS_ENABLE_PYTHON=OFF` in order to disable building python bindings
(possible values are `ON` and `OFF`),
- `-DFAISS_ENABLE_CUVS=ON` in order to use the NVIDIA cuVS implementations
of the IVF-Flat, IVF-PQ and [CAGRA](https://arxiv.org/pdf/2308.15136) GPU-accelerated indices (default is `OFF`, possible values are `ON` and `OFF`).
Note: `-DFAISS_ENABLE_GPU` must be set to `ON` when enabling this option.
- `-DBUILD_TESTING=OFF` in order to disable building C++ tests,
- `-DBUILD_SHARED_LIBS=ON` in order to build a shared library (possible values
are `ON` and `OFF`),
- `-DFAISS_ENABLE_C_API=ON` in order to enable building [C API](c_api/INSTALL.md) (possible values
are `ON` and `OFF`),
- optimization-related options:
- `-DCMAKE_BUILD_TYPE=Release` in order to enable generic compiler
optimization options (enables `-O3` on gcc for instance),
- `-DFAISS_OPT_LEVEL=avx2` in order to enable the required compiler flags to
generate code using optimized SIMD/Vector instructions. Possible values are below:
- On x86-64, `generic`, `avx2`, `avx512`, and `avx512_spr` (for avx512 features available since Intel(R) Sapphire Rapids), by increasing order of optimization,
- On aarch64, `generic` and `sve`, by increasing order of optimization,
- `-DFAISS_USE_LTO=ON` in order to enable [Link-Time Optimization](https://en.wikipedia.org/wiki/Link-time_optimization) (default is `OFF`, possible values are `ON` and `OFF`).
- BLAS-related options:
- `-DBLA_VENDOR=Intel10_64_dyn -DMKL_LIBRARIES=/path/to/mkl/libs` to use the
Intel MKL BLAS implementation, which is significantly faster than OpenBLAS
(more information about the values for the `BLA_VENDOR` option can be found in
the [CMake docs](https://cmake.org/cmake/help/latest/module/FindBLAS.html)),
- GPU-related options:
- `-DCUDAToolkit_ROOT=/path/to/cuda-10.1` in order to hint to the path of
the CUDA toolkit (for more information, see
[CMake docs](https://cmake.org/cmake/help/latest/module/FindCUDAToolkit.html)),
- `-DCMAKE_CUDA_ARCHITECTURES="75;72"` for specifying which GPU architectures
to build against (see [CUDA docs](https://developer.nvidia.com/cuda-gpus) to
determine which architecture(s) you should pick),
- `-DFAISS_ENABLE_ROCM=ON` in order to enable building GPU indices for AMD GPUs.
`-DFAISS_ENABLE_GPU` must be `ON` when using this option. (possible values are `ON` and `OFF`),
- python-related options:
- `-DPython_EXECUTABLE=/path/to/python3.7` in order to build a python
interface for a different python than the default one (see
[CMake docs](https://cmake.org/cmake/help/latest/module/FindPython.html)).
## Step 2: Invoking Make
``` shell
$ make -C build -j faiss
```
This builds the C++ library (`libfaiss.a` by default, and `libfaiss.so` if
`-DBUILD_SHARED_LIBS=ON` was passed to CMake).
The `-j` option enables parallel compilation of multiple units, leading to a
faster build, but increasing the chances of running out of memory, in which case
it is recommended to set the `-j` option to a fixed value (such as `-j4`).
If making use of optimization options, build the correct target before swigfaiss.
For AVX2:
``` shell
$ make -C build -j faiss_avx2
```
For AVX512:
``` shell
$ make -C build -j faiss_avx512
```
For AVX512 features available since Intel(R) Sapphire Rapids.
``` shell
$ make -C build -j faiss_avx512_spr
```
This will ensure the creation of necessary files when building and installing the python package.
## Step 3: Building the python bindings (optional)
``` shell
$ make -C build -j swigfaiss
$ (cd build/faiss/python && python setup.py install)
```
The first command builds the python bindings for Faiss, while the second one
generates and installs the python package.
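To verify the result, the freshly installed package can be smoke-tested from Python (a minimal sketch; the printed version and shapes are illustrative):
``` python
import faiss
import numpy as np

print(faiss.__version__)  # e.g. 1.10.0

index = faiss.IndexFlatL2(64)  # exact L2 index over 64-d vectors
index.add(np.random.rand(1000, 64).astype("float32"))
distances, ids = index.search(np.random.rand(5, 64).astype("float32"), 3)
print(ids.shape)  # (5, 3)
```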
## Step 4: Installing the C++ library and headers (optional)
``` shell
$ make -C build install
```
This will make the compiled library (either `libfaiss.a` or `libfaiss.so` on
Linux) available system-wide, as well as the C++ headers. This step is not
needed to install the python package only.
## Step 5: Testing (optional)
### Running the C++ test suite
To run the whole test suite, make sure that `cmake` was invoked with
`-DBUILD_TESTING=ON`, and run:
``` shell
$ make -C build test
```
### Running the python test suite
``` shell
$ (cd build/faiss/python && python setup.py build)
$ PYTHONPATH="$(ls -d ./build/faiss/python/build/lib*/)" pytest tests/test_*.py
```
### Basic example
A basic usage example is available in
[`demos/demo_ivfpq_indexing.cpp`](https://github.com/facebookresearch/faiss/blob/main/demos/demo_ivfpq_indexing.cpp).
It creates a small index, stores it and performs some searches. A normal runtime
is around 20s. With a fast machine and Intel MKL's BLAS it runs in 2.5s.
It can be built with
``` shell
$ make -C build demo_ivfpq_indexing
```
and subsequently run with
``` shell
$ ./build/demos/demo_ivfpq_indexing
```
### Basic GPU example
``` shell
$ make -C build demo_ivfpq_indexing_gpu
$ ./build/demos/demo_ivfpq_indexing_gpu
```
This produces the GPU code equivalent to the CPU `demo_ivfpq_indexing`. It also
shows how to translate indexes from/to a GPU.
### A real-life benchmark
A longer example runs and evaluates Faiss on the SIFT1M dataset. To run it,
please download the ANN_SIFT1M dataset from http://corpus-texmex.irisa.fr/
and unzip it to the subdirectory `sift1M` at the root of the source
directory for this repository.
Then compile and run the following (after ensuring you have installed faiss):
``` shell
$ make -C build demo_sift1M
$ ./build/demos/demo_sift1M
```
This is a demonstration of the high-level auto-tuning API. You can try
setting a different index_key to find the indexing structure that
gives the best performance.
### Real-life test
The following script extends the demo_sift1M test to several types of
indexes. This must be run from the root of the source directory for this
repository:
``` shell
$ mkdir tmp # graphs of the output will be written here
$ python demos/demo_auto_tune.py
```
It will cycle through a few types of indexes and find optimal
operating points. You can play around with the types of indexes.
### Real-life test on GPU
The example above also runs on GPU. Edit `demos/demo_auto_tune.py` at line 100
with the values
``` python
keys_to_test = keys_gpu
use_gpu = True
```
and you can run
``` shell
$ python demos/demo_auto_tune.py
```
to test the GPU code.
[^1]: The vector search and clustering algorithms in NVIDIA RAFT have been formally migrated to [NVIDIA cuVS](https://github.com/rapidsai/cuvs). This package is being renamed to `faiss-gpu-cuvs` in the next stable release, which will use these GPU implementations from the pre-compiled `libcuvs=24.12` binary.

View File

@@ -0,0 +1,21 @@
MIT License
Copyright (c) Facebook, Inc. and its affiliates.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@@ -0,0 +1,92 @@
# Faiss
Faiss is a library for efficient similarity search and clustering of dense vectors. It contains algorithms that search in sets of vectors of any size, up to ones that possibly do not fit in RAM. It also contains supporting code for evaluation and parameter tuning. Faiss is written in C++ with complete wrappers for Python/numpy. Some of the most useful algorithms are implemented on the GPU. It is developed primarily at Meta's [Fundamental AI Research](https://ai.facebook.com/) group.
## News
See [CHANGELOG.md](CHANGELOG.md) for detailed information about latest features.
## Introduction
Faiss contains several methods for similarity search. It assumes that the instances are represented as vectors and are identified by an integer, and that the vectors can be compared with L2 (Euclidean) distances or dot products. Vectors that are similar to a query vector are those that have the lowest L2 distance or the highest dot product with the query vector. It also supports cosine similarity, since this is a dot product on normalized vectors.
Some of the methods, like those based on binary vectors and compact quantization codes, solely use a compressed representation of the vectors and do not require keeping the original vectors. This generally comes at the cost of a less precise search, but these methods can scale to billions of vectors in main memory on a single server. Other methods, like HNSW and NSG, add an indexing structure on top of the raw vectors to make searching more efficient.
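As a minimal sketch of these comparison modes (assuming `faiss` and `numpy`; dimensions and data are illustrative), exact L2 search and cosine similarity via normalization look like this:
``` python
import faiss
import numpy as np

d = 128
xb = np.random.rand(10000, d).astype("float32")  # database vectors
xq = np.random.rand(5, d).astype("float32")      # query vectors

# L2 (Euclidean) search: lower distance means more similar.
index = faiss.IndexFlatL2(d)
index.add(xb)
distances, ids = index.search(xq, 10)  # 10 nearest neighbors per query

# Cosine similarity: inner product on L2-normalized vectors.
faiss.normalize_L2(xb)
faiss.normalize_L2(xq)
index_ip = faiss.IndexFlatIP(d)
index_ip.add(xb)
similarities, ids = index_ip.search(xq, 10)
```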
The GPU implementation can accept input from either CPU or GPU memory. On a server with GPUs, the GPU indexes can be used as a drop-in replacement for the CPU indexes (e.g., replace `IndexFlatL2` with `GpuIndexFlatL2`) and copies to/from GPU memory are handled automatically. Results will be faster, however, if both input and output remain resident on the GPU. Both single- and multi-GPU usage is supported.
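A minimal sketch of the drop-in GPU usage described above (assumes a faiss-gpu build and at least one CUDA device; data sizes are illustrative):
``` python
import faiss
import numpy as np

d = 128
xb = np.random.rand(10000, d).astype("float32")
xq = np.random.rand(5, d).astype("float32")

cpu_index = faiss.IndexFlatL2(d)
cpu_index.add(xb)

res = faiss.StandardGpuResources()                     # manages GPU scratch memory
gpu_index = faiss.index_cpu_to_gpu(res, 0, cpu_index)  # clone onto GPU 0
distances, ids = gpu_index.search(xq, 10)              # same API as the CPU index
```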
## Installing
Faiss comes with precompiled libraries for Anaconda in Python, see [faiss-cpu](https://anaconda.org/pytorch/faiss-cpu), [faiss-gpu](https://anaconda.org/pytorch/faiss-gpu) and [faiss-gpu-cuvs](https://anaconda.org/pytorch/faiss-gpu-cuvs). The library is mostly implemented in C++, the only dependency is a [BLAS](https://en.wikipedia.org/wiki/Basic_Linear_Algebra_Subprograms) implementation. Optional GPU support is provided via CUDA or AMD ROCm, and the Python interface is also optional. The backend GPU implementations of NVIDIA [cuVS](https://github.com/rapidsai/cuvs) can also be enabled optionally. It compiles with cmake. See [INSTALL.md](INSTALL.md) for details.
## How Faiss works
Faiss is built around an index type that stores a set of vectors, and provides a function to search in them with L2 and/or dot product vector comparison. Some index types are simple baselines, such as exact search. Most of the available indexing structures correspond to various trade-offs with respect to
- search time
- search quality
- memory used per index vector
- training time
- adding time
- need for external data for unsupervised training
The optional GPU implementation provides what is likely (as of March 2017) the fastest exact and approximate (compressed-domain) nearest neighbor search implementation for high-dimensional vectors, fastest Lloyd's k-means, and fastest small k-selection algorithm known. [The implementation is detailed here](https://arxiv.org/abs/1702.08734).
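Concretely, the trade-offs listed above are chosen by picking an index type; a compressed IVF+PQ index, for example, trades some search quality for much lower memory per vector (a minimal sketch, assuming `faiss` and `numpy`; the factory string and parameters are illustrative):
``` python
import faiss
import numpy as np

d = 128
xt = np.random.rand(50000, d).astype("float32")   # training vectors
xb = np.random.rand(100000, d).astype("float32")  # database vectors

# 1024 inverted lists + 32-byte PQ codes: low memory, approximate results,
# but requires a training pass before vectors can be added.
index = faiss.index_factory(d, "IVF1024,PQ32")
index.train(xt)
index.add(xb)

index.nprobe = 16  # search-time knob: probing more lists raises recall, costs time
distances, ids = index.search(np.random.rand(5, d).astype("float32"), 10)
```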
## Full documentation of Faiss
The following are entry points for documentation:
- the full documentation can be found on the [wiki page](http://github.com/facebookresearch/faiss/wiki), including a [tutorial](https://github.com/facebookresearch/faiss/wiki/Getting-started), a [FAQ](https://github.com/facebookresearch/faiss/wiki/FAQ) and a [troubleshooting section](https://github.com/facebookresearch/faiss/wiki/Troubleshooting)
- the [doxygen documentation](https://faiss.ai/) gives per-class information extracted from code comments
- to reproduce results from our research papers, [Polysemous codes](https://arxiv.org/abs/1609.01882) and [Billion-scale similarity search with GPUs](https://arxiv.org/abs/1702.08734), refer to the [benchmarks README](benchs/README.md). For [Link and code: Fast indexing with graphs and compact regression codes](https://arxiv.org/abs/1804.09996), see the [link_and_code README](benchs/link_and_code)
## Authors
The main authors of Faiss are:
- [Hervé Jégou](https://github.com/jegou) initiated the Faiss project and wrote its first implementation
- [Matthijs Douze](https://github.com/mdouze) implemented most of the CPU Faiss
- [Jeff Johnson](https://github.com/wickedfoo) implemented all of the GPU Faiss
- [Lucas Hosseini](https://github.com/beauby) implemented the binary indexes and the build system
- [Chengqi Deng](https://github.com/KinglittleQ) implemented NSG, NNdescent and much of the additive quantization code.
- [Alexandr Guzhva](https://github.com/alexanderguzhva) many optimizations: SIMD, memory allocation and layout, fast decoding kernels for vector codecs, etc.
- [Gergely Szilvasy](https://github.com/algoriddle) build system, benchmarking framework.
## Reference
References to cite when you use Faiss in a research paper:
```
@article{douze2024faiss,
title={The Faiss library},
author={Matthijs Douze and Alexandr Guzhva and Chengqi Deng and Jeff Johnson and Gergely Szilvasy and Pierre-Emmanuel Mazaré and Maria Lomeli and Lucas Hosseini and Hervé Jégou},
year={2024},
eprint={2401.08281},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
```
For the GPU version of Faiss, please cite:
```
@article{johnson2019billion,
title={Billion-scale similarity search with {GPUs}},
author={Johnson, Jeff and Douze, Matthijs and J{\'e}gou, Herv{\'e}},
journal={IEEE Transactions on Big Data},
volume={7},
number={3},
pages={535--547},
year={2019},
publisher={IEEE}
}
```
## Join the Faiss community
For public discussion of Faiss or for questions, visit https://github.com/facebookresearch/faiss/discussions.
We monitor the [issues page](http://github.com/facebookresearch/faiss/issues) of the repository.
You can report bugs, ask questions, etc.
## Legal
Faiss is MIT-licensed, refer to the [LICENSE file](https://github.com/facebookresearch/faiss/blob/main/LICENSE) in the top level directory.
Copyright © Meta Platforms, Inc.

View File

@@ -0,0 +1,10 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
add_executable(bench_ivf_selector EXCLUDE_FROM_ALL bench_ivf_selector.cpp)
target_link_libraries(bench_ivf_selector PRIVATE faiss)

View File

@@ -0,0 +1,361 @@
# Benchmarking scripts
This directory contains benchmarking scripts that can reproduce the
numbers reported in the two papers
```
@inproceedings{DJP16,
Author = {Douze, Matthijs and J{\'e}gou, Herv{\'e} and Perronnin, Florent},
Booktitle = "ECCV",
Organization = {Springer},
Title = {Polysemous codes},
Year = {2016}
}
```
and
```
@inproceedings{JDJ17,
Author = {Jeff Johnson and Matthijs Douze and Herv{\'e} J{\'e}gou},
journal = {arXiv:1702.08734},
Title = {Billion-scale similarity search with GPUs},
Year = {2017},
}
```
Note that the numbers (especially timings) change slightly due to changes in the implementation, different machines, etc.
The scripts are self-contained. They depend only on Faiss and external training data that should be stored in sub-directories.
## SIFT1M experiments
The script [`bench_polysemous_sift1m.py`](bench_polysemous_sift1m.py) reproduces the numbers in
Figure 3 from the "Polysemous" paper.
### Getting SIFT1M
To run it, please download the ANN_SIFT1M dataset from
http://corpus-texmex.irisa.fr/
and unzip it to the subdirectory sift1M.
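The `.fvecs` files in the archive store each vector as a little-endian `int32` dimension followed by that many `float32` components. A small NumPy helper along these lines (hypothetical, not part of the benchmark scripts; file names assume the standard archive layout) can be used to inspect the data:
``` python
import numpy as np

def read_fvecs(path):
    """Read an .fvecs file: each row is [int32 d][d float32 components]."""
    raw = np.fromfile(path, dtype="int32")
    d = raw[0]
    return raw.reshape(-1, d + 1)[:, 1:].copy().view("float32")

xb = read_fvecs("sift1M/sift_base.fvecs")   # database vectors, (1000000, 128)
xq = read_fvecs("sift1M/sift_query.fvecs")  # query vectors, (10000, 128)
```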
### Result
The output looks like:
```
PQ training on 100000 points, remains 0 points: training polysemous on centroids
add vectors to index
PQ baseline 7.517 ms per query, R@1 0.4474
Polysemous 64 9.875 ms per query, R@1 0.4474
Polysemous 62 8.358 ms per query, R@1 0.4474
Polysemous 58 5.531 ms per query, R@1 0.4474
Polysemous 54 3.420 ms per query, R@1 0.4478
Polysemous 50 2.182 ms per query, R@1 0.4475
Polysemous 46 1.621 ms per query, R@1 0.4408
Polysemous 42 1.448 ms per query, R@1 0.4174
Polysemous 38 1.331 ms per query, R@1 0.3563
Polysemous 34 1.334 ms per query, R@1 0.2661
Polysemous 30 1.272 ms per query, R@1 0.1794
```
## Experiments on 1B elements dataset
The script [`bench_polysemous_1bn.py`](bench_polysemous_1bn.py) reproduces a few experiments on
two datasets of size 1B from the "Polysemous codes" paper.
### Getting BIGANN
Download the four files of ANN_SIFT1B from
http://corpus-texmex.irisa.fr/ to subdirectory bigann/
### Getting Deep1B
The ground-truth and queries are available here
https://yadi.sk/d/11eDCm7Dsn9GA
For the learning and database vectors, use the script
https://github.com/arbabenko/GNOIMI/blob/master/downloadDeep1B.py
to download the data to subdirectory deep1b/, then concatenate the
database files to base.fvecs and the training files to learn.fvecs
### Running the experiments
These experiments are quite long. To support resuming, the script
stores the result of training to a temporary directory, `/tmp/bench_polysemous`.
The script `bench_polysemous_1bn.py` takes at least two arguments:
- the dataset name: SIFT1000M (aka SIFT1B, aka BIGANN) or Deep1B. SIFT1M, SIFT2M,... are also supported to make subsets for small experiments (note that SIFT1M as a subset of SIFT1B is not the same as the SIFT1M above)
- the type of index to build, which should be a valid [index_factory key](https://github.com/facebookresearch/faiss/wiki/High-level-interface-and-auto-tuning#index-factory) (see below for examples)
- the remaining arguments are parsed as search-time parameters.
### Experiments of Table 2
The `IMI*+PolyD+ADC` results in Table 2 can be reproduced with (for 16 bytes):
```
python bench_polysemous_1bn.py SIFT1000M IMI2x12,PQ16 nprobe=16,max_codes={10000,30000},ht={44..54}
```
Training takes about 2 minutes and adding vectors to the dataset
takes 3.1 h. These operations are multithreaded. Note that in the command
above, we use bash's [brace expansion](https://www.gnu.org/software/bash/manual/html_node/Brace-Expansion.html) to set a grid of parameters.
The search is *not* multithreaded, and the output looks like:
```
R@1 R@10 R@100 time %pass
nprobe=16,max_codes=10000,ht=44 0.1779 0.2994 0.3139 0.194 12.45
nprobe=16,max_codes=10000,ht=45 0.1859 0.3183 0.3339 0.197 14.24
nprobe=16,max_codes=10000,ht=46 0.1930 0.3366 0.3543 0.202 16.22
nprobe=16,max_codes=10000,ht=47 0.1993 0.3550 0.3745 0.209 18.39
nprobe=16,max_codes=10000,ht=48 0.2033 0.3694 0.3917 0.640 20.77
nprobe=16,max_codes=10000,ht=49 0.2070 0.3839 0.4077 0.229 23.36
nprobe=16,max_codes=10000,ht=50 0.2101 0.3949 0.4205 0.232 26.17
nprobe=16,max_codes=10000,ht=51 0.2120 0.4042 0.4310 0.239 29.21
nprobe=16,max_codes=10000,ht=52 0.2134 0.4113 0.4402 0.245 32.47
nprobe=16,max_codes=10000,ht=53 0.2157 0.4184 0.4482 0.250 35.96
nprobe=16,max_codes=10000,ht=54 0.2170 0.4240 0.4546 0.256 39.66
nprobe=16,max_codes=30000,ht=44 0.1882 0.3327 0.3555 0.226 11.29
nprobe=16,max_codes=30000,ht=45 0.1964 0.3525 0.3771 0.231 13.05
nprobe=16,max_codes=30000,ht=46 0.2039 0.3713 0.3987 0.236 15.01
nprobe=16,max_codes=30000,ht=47 0.2103 0.3907 0.4202 0.245 17.19
nprobe=16,max_codes=30000,ht=48 0.2145 0.4055 0.4384 0.251 19.60
nprobe=16,max_codes=30000,ht=49 0.2179 0.4198 0.4550 0.257 22.25
nprobe=16,max_codes=30000,ht=50 0.2208 0.4305 0.4681 0.268 25.15
nprobe=16,max_codes=30000,ht=51 0.2227 0.4402 0.4791 0.275 28.30
nprobe=16,max_codes=30000,ht=52 0.2241 0.4473 0.4884 0.284 31.70
nprobe=16,max_codes=30000,ht=53 0.2265 0.4544 0.4965 0.294 35.34
nprobe=16,max_codes=30000,ht=54 0.2278 0.4601 0.5031 0.303 39.20
```
The result reported in table 2 is the one for which the %pass (percentage of code comparisons that pass the Hamming check) is around 20%, which occurs for Hamming threshold `ht=48`.
The 8-byte results can be reproduced with the factory key `IMI2x12,PQ8`
### Experiments of the appendix
The experiments in the appendix are only in the ArXiv version of the paper (table 3).
```
python bench_polysemous_1bn.py SIFT1000M OPQ8_64,IMI2x13,PQ8 nprobe={1,2,4,8,16,32,64,128},ht={20,24,26,28,30}
R@1 R@10 R@100 time %pass
nprobe=1,ht=20 0.0351 0.0616 0.0751 0.158 19.01
...
nprobe=32,ht=28 0.1256 0.3563 0.5026 0.561 52.61
...
```
Here again the runs are not exactly the same but the original result was obtained from nprobe=32,ht=28.
For Deep1B, we used a simple version of [auto-tuning](https://github.com/facebookresearch/faiss/wiki/High-level-interface-and-auto-tuning/_edit#auto-tuning-the-runtime-parameters) to sweep through the set of operating points:
```
python bench_polysemous_1bn.py Deep1B OPQ20_80,IMI2x14,PQ20 autotune
...
Done in 4067.555 s, available OPs:
Parameters 1-R@1 time
0.0000 0.000
nprobe=1,ht=22,max_codes=256 0.0215 3.115
nprobe=1,ht=30,max_codes=256 0.0381 3.120
...
nprobe=512,ht=68,max_codes=524288 0.4478 36.903
nprobe=1024,ht=80,max_codes=131072 0.4557 46.363
nprobe=1024,ht=78,max_codes=262144 0.4616 61.939
...
```
The original results were obtained with `nprobe=1024,ht=66,max_codes=262144`.
## GPU experiments
The benchmarks below run on 1 or 4 Titan X GPUs and reproduce the results of the "GPU paper". They are also a good starting point on how to use GPU Faiss.
### Search on SIFT1M
See above on how to get SIFT1M into subdirectory sift1M/. The script [`bench_gpu_sift1m.py`](bench_gpu_sift1m.py) reproduces the "exact k-NN time" plot in the ArXiv paper, and the SIFT1M numbers.
The output is:
```
============ Exact search
add vectors to index
warmup
benchmark
k=1 0.715 s, R@1 0.9914
k=2 0.729 s, R@1 0.9935
k=4 0.731 s, R@1 0.9935
k=8 0.732 s, R@1 0.9935
k=16 0.742 s, R@1 0.9935
k=32 0.737 s, R@1 0.9935
k=64 0.753 s, R@1 0.9935
k=128 0.761 s, R@1 0.9935
k=256 0.799 s, R@1 0.9935
k=512 0.975 s, R@1 0.9935
k=1024 1.424 s, R@1 0.9935
============ Approximate search
train
WARNING clustering 100000 points to 4096 centroids: please provide at least 159744 training points
add vectors to index
WARN: increase temp memory to avoid cudaMalloc, or decrease query/add size (alloc 256000000 B, highwater 256000000 B)
warmup
benchmark
nprobe= 1 0.043 s recalls= 0.3909 0.4312 0.4312
nprobe= 2 0.040 s recalls= 0.5041 0.5636 0.5636
nprobe= 4 0.048 s recalls= 0.6048 0.6897 0.6897
nprobe= 8 0.064 s recalls= 0.6879 0.8028 0.8028
nprobe= 16 0.088 s recalls= 0.7534 0.8940 0.8940
nprobe= 32 0.134 s recalls= 0.7957 0.9549 0.9550
nprobe= 64 0.224 s recalls= 0.8125 0.9833 0.9834
nprobe= 128 0.395 s recalls= 0.8205 0.9953 0.9954
nprobe= 256 0.717 s recalls= 0.8227 0.9993 0.9994
nprobe= 512 1.348 s recalls= 0.8228 0.9999 1.0000
```
The run produces two warnings:
- the clustering complains that it does not have enough training data; there is not much we can do about this.
- the add() function complains about an inefficient memory allocation, but this is a concern only when it happens often, and we are not benchmarking the add time anyway.
To index small datasets, it is more efficient to use a `GpuIVFFlat`, which just stores the full vectors in the inverted lists. We did not mention this in the paper because it is not as scalable. To experiment with this setting, change the `index_factory` string from "IVF4096,PQ64" to "IVF16384,Flat". This gives:
```
nprobe= 1 0.025 s recalls= 0.4084 0.4105 0.4105
nprobe= 2 0.033 s recalls= 0.5235 0.5264 0.5264
nprobe= 4 0.033 s recalls= 0.6332 0.6367 0.6367
nprobe= 8 0.040 s recalls= 0.7358 0.7403 0.7403
nprobe= 16 0.049 s recalls= 0.8273 0.8324 0.8324
nprobe= 32 0.068 s recalls= 0.8957 0.9024 0.9024
nprobe= 64 0.104 s recalls= 0.9477 0.9549 0.9549
nprobe= 128 0.174 s recalls= 0.9760 0.9837 0.9837
nprobe= 256 0.299 s recalls= 0.9866 0.9944 0.9944
nprobe= 512 0.527 s recalls= 0.9907 0.9987 0.9987
```
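As a hedged sketch of this change (not the full script; assumes training vectors `xt` and database `xb` are loaded), the index construction becomes:
```
import faiss

res = faiss.StandardGpuResources()
index = faiss.index_factory(128, "IVF16384,Flat")  # instead of "IVF4096,PQ64"
index = faiss.index_cpu_to_gpu(res, 0, index)      # move to GPU 0
index.train(xt)
index.add(xb)
```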
### Clustering on MNIST8m
To get the "infinite MNIST dataset", follow the instructions on [Léon Bottou's website](http://leon.bottou.org/projects/infimnist). The script assumes the file `mnist8m-patterns-idx3-ubyte` is in subdirectory `mnist8m`
The script [`kmeans_mnist.py`](kmeans_mnist.py) produces the following output:
```
python kmeans_mnist.py 1 256
...
Clustering 8100000 points in 784D to 256 clusters, redo 1 times, 20 iterations
Preprocessing in 7.94526 s
Iteration 19 (131.697 s, search 114.78 s): objective=1.44881e+13 imbalance=1.05963 nsplit=0
final objective: 1.449e+13
total runtime: 140.615 s
```
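The same clustering can be run directly with the `faiss.Kmeans` helper; a minimal sketch (assuming `x` is an `(n, 784)` float32 array of MNIST8m patterns):
```
import faiss

km = faiss.Kmeans(d=784, k=256, niter=20, verbose=True, gpu=True)
km.train(x)
centroids = km.centroids  # (256, 784) array of cluster centers
```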
### Search on SIFT1B
The script [`bench_gpu_1bn.py`](bench_gpu_1bn.py) runs multi-gpu searches on the two 1-billion vector datasets we considered. It is more complex than the previous scripts, because it supports many search options and decomposes the dataset build process in Python to exploit the best possible CPU/GPU parallelism and GPU distribution.
Even on multiple GPUs, building the indexes for the 1B-vector datasets can take several hours. It is often a good idea to validate that everything is working fine on smaller datasets like SIFT1M, SIFT2M, etc.
The search results on SIFT1B in the "GPU paper" can be obtained with
<!-- see P57124181 -->
```
python bench_gpu_1bn.py SIFT1000M OPQ8_32,IVF262144,PQ8 -nnn 10 -ngpu 1 -tempmem $[1536*1024*1024]
...
0/10000 (0.024 s) probe=1 : 0.161 s 1-R@1: 0.0752 1-R@10: 0.1924
0/10000 (0.005 s) probe=2 : 0.150 s 1-R@1: 0.0964 1-R@10: 0.2693
0/10000 (0.005 s) probe=4 : 0.153 s 1-R@1: 0.1102 1-R@10: 0.3328
0/10000 (0.005 s) probe=8 : 0.170 s 1-R@1: 0.1220 1-R@10: 0.3827
0/10000 (0.005 s) probe=16 : 0.196 s 1-R@1: 0.1290 1-R@10: 0.4151
0/10000 (0.006 s) probe=32 : 0.244 s 1-R@1: 0.1314 1-R@10: 0.4345
0/10000 (0.006 s) probe=64 : 0.353 s 1-R@1: 0.1332 1-R@10: 0.4461
0/10000 (0.005 s) probe=128: 0.587 s 1-R@1: 0.1341 1-R@10: 0.4502
0/10000 (0.006 s) probe=256: 1.160 s 1-R@1: 0.1342 1-R@10: 0.4511
```
We use the `-tempmem` option to reduce the temporary memory allocation to 1.5 GiB; otherwise the dataset does not fit in GPU memory.
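Programmatically, the same cap corresponds to a setting on the GPU resources object; a hedged sketch:
```
import faiss

res = faiss.StandardGpuResources()
res.setTempMemory(1536 * 1024 * 1024)  # limit scratch space to 1.5 GiB
```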
### Search on Deep1B
The same script generates the GPU search results on Deep1B.
```
python bench_gpu_1bn.py Deep1B OPQ20_80,IVF262144,PQ20 -nnn 10 -R 2 -ngpu 4 -altadd -noptables -tempmem $[1024*1024*1024]
...
0/10000 (0.115 s) probe=1 : 0.239 s 1-R@1: 0.2387 1-R@10: 0.3420
0/10000 (0.006 s) probe=2 : 0.103 s 1-R@1: 0.3110 1-R@10: 0.4623
0/10000 (0.005 s) probe=4 : 0.105 s 1-R@1: 0.3772 1-R@10: 0.5862
0/10000 (0.005 s) probe=8 : 0.116 s 1-R@1: 0.4235 1-R@10: 0.6889
0/10000 (0.005 s) probe=16 : 0.133 s 1-R@1: 0.4517 1-R@10: 0.7693
0/10000 (0.005 s) probe=32 : 0.168 s 1-R@1: 0.4713 1-R@10: 0.8281
0/10000 (0.005 s) probe=64 : 0.238 s 1-R@1: 0.4841 1-R@10: 0.8649
0/10000 (0.007 s) probe=128: 0.384 s 1-R@1: 0.4900 1-R@10: 0.8816
0/10000 (0.005 s) probe=256: 0.736 s 1-R@1: 0.4933 1-R@10: 0.8912
```
Here we are a bit tight on memory, so we disable precomputed tables (`-noptables`) and restrict the amount of temporary memory. The `-altadd` option avoids GPU memory overflows during add.
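The precomputed-table setting maps onto the cloner options used when spreading an index over several GPUs; a hedged sketch (assuming `cpu_index` is the trained index):
```
import faiss

co = faiss.GpuMultipleClonerOptions()
co.usePrecomputed = False  # the equivalent of -noptables
co.shard = True            # split the database across the GPUs
gpu_index = faiss.index_cpu_to_all_gpus(cpu_index, co, ngpu=4)
```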
### KNN-graph on Deep1B
The same script generates the KNN-graph on Deep1B. Note that the inverted file from above will not be re-used because the training sets are different. For the KNN-graph, the script first does a pass over the whole dataset to compute the ground-truth k-NN for a subset of 10k nodes, for evaluation (a sketch of this ground-truth step follows the output below).
```
python bench_gpu_1bn.py Deep1B OPQ20_80,IVF262144,PQ20 -nnn 10 -altadd -knngraph -R 2 -noptables -tempmem $[1<<30] -ngpu 4
...
CPU index contains 1000000000 vectors, move to GPU
Copy CPU index to 2 sharded GPU indexes
dispatch to GPUs 0:2
IndexShards shard 0 indices 0:500000000
IndexIVFPQ size 500000000 -> GpuIndexIVFPQ indicesOptions=0 usePrecomputed=0 useFloat16=0 reserveVecs=0
IndexShards shard 1 indices 500000000:1000000000
IndexIVFPQ size 500000000 -> GpuIndexIVFPQ indicesOptions=0 usePrecomputed=0 useFloat16=0 reserveVecs=0
dispatch to GPUs 2:4
IndexShards shard 0 indices 0:500000000
IndexIVFPQ size 500000000 -> GpuIndexIVFPQ indicesOptions=0 usePrecomputed=0 useFloat16=0 reserveVecs=0
IndexShards shard 1 indices 500000000:1000000000
IndexIVFPQ size 500000000 -> GpuIndexIVFPQ indicesOptions=0 usePrecomputed=0 useFloat16=0 reserveVecs=0
move to GPU done in 151.535 s
search...
999997440/1000000000 (8389.961 s, 0.3379) probe=1 : 8389.990 s rank-10 intersection results: 0.3379
999997440/1000000000 (9205.934 s, 0.4079) probe=2 : 9205.966 s rank-10 intersection results: 0.4079
999997440/1000000000 (9741.095 s, 0.4722) probe=4 : 9741.128 s rank-10 intersection results: 0.4722
999997440/1000000000 (10830.420 s, 0.5256) probe=8 : 10830.455 s rank-10 intersection results: 0.5256
999997440/1000000000 (12531.716 s, 0.5603) probe=16 : 12531.758 s rank-10 intersection results: 0.5603
999997440/1000000000 (15922.519 s, 0.5825) probe=32 : 15922.571 s rank-10 intersection results: 0.5825
999997440/1000000000 (22774.153 s, 0.5950) probe=64 : 22774.220 s rank-10 intersection results: 0.5950
999997440/1000000000 (36717.207 s, 0.6015) probe=128: 36717.309 s rank-10 intersection results: 0.6015
999997440/1000000000 (70616.392 s, 0.6047) probe=256: 70616.581 s rank-10 intersection results: 0.6047
```
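The ground-truth pass amounts to an exact k-NN computation on the subset; as a sketch (assuming `xb` holds the database vectors):
```
import faiss

sub = xb[:10000]                      # evaluation subset
D_gt, I_gt = faiss.knn(sub, xb, 10)   # brute-force 10-NN ground truth
# note: each point's nearest neighbor is itself, since sub is drawn from xb
```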
# Additional benchmarks
This directory also contains additional benchmarks (which also serve as a source of examples of how to use the Faiss code).
Some of these tests/benchmarks might be outdated.
* bench_6bit_codec.cpp - tests vector codecs for SQ6 quantization on a synthetic dataset
* bench_cppcontrib_sa_decode.cpp - benchmarks specialized kernels for vector codecs for PQ, IVFPQ and Residual+PQ on a synthetic dataset
* bench_for_interrupt.py - evaluates the impact of the interrupt callback handler (which can be triggered from Python code)
* bench_hamming_computer.cpp - specialized implementations for Hamming distance computations
* bench_heap_replace.cpp - benchmarks different implementations of certain calls for a Heap data structure
* bench_hnsw.py - benchmarks HNSW against other index types on the SIFT1M dataset
* bench_index_flat.py - benchmarks IndexFlatL2 on a synthetic dataset
* bench_index_pq.py - benchmarks PQ on SIFT1M dataset
* bench_ivf_fastscan_single_query.py - benchmarks a single query for different nprobe levels for IVF{nlist},PQ{M}x4fs on BIGANN dataset
* bench_ivf_fastscan.py - compares IVF{nlist},PQ{M}x4fs against other indices on SIFT1M dataset
* bench_ivf_selector.cpp - checks the possible overhead when using faiss::IDSelectorAll interface
* bench_pairwise_distances.py - benchmarks pairwise distance computation between two synthetic datasets
* bench_partition.py - benchmarks partitioning functions
* bench_pq_tables.py - benchmarks ProductQuantizer.compute_inner_prod_tables() and ProductQuantizer.compute_distance_tables() calls
* bench_quantizer.py - benchmarks various quantizers for SIFT1M, Deep1B, BigANN datasets
* bench_scalar_quantizer.py - benchmarks IVF+SQ on the SIFT1M dataset
* bench_vector_ops.py - benchmarks dot product and distances computations on a synthetic dataset

View File

@@ -0,0 +1,82 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#include <omp.h>
#include <cstdio>
#include <benchmark/benchmark.h>
#include <faiss/impl/ScalarQuantizer.h>
#include <faiss/utils/distances.h>
#include <faiss/utils/random.h>
#include <faiss/utils/utils.h>
using namespace faiss;
static void bench(benchmark::State& state) {
int d = 128;
int n = 2000;
state.SetLabel(faiss::get_compile_options());
std::vector<float> x(d * n);
float_rand(x.data(), d * n, 12345);
// (the encode/decode/encode sequence below checks that encoding is idempotent)
ScalarQuantizer sq(d, ScalarQuantizer::QT_6bit);
omp_set_num_threads(1);
sq.train(n, x.data());
size_t code_size = sq.code_size;
state.counters["code_size"] = sq.code_size;
// encode
std::vector<uint8_t> codes(code_size * n);
sq.compute_codes(x.data(), codes.data(), n);
// decode
std::vector<float> x2(d * n);
sq.decode(codes.data(), x2.data(), n);
state.counters["sql2_recons_error"] =
fvec_L2sqr(x.data(), x2.data(), n * d) / n;
// encode again
std::vector<uint8_t> codes2(code_size * n);
sq.compute_codes(x2.data(), codes2.data(), n);
size_t ndiff = 0;
for (size_t i = 0; i < codes.size(); i++) {
if (codes[i] != codes2[i])
ndiff++;
}
state.counters["ndiff_for_idempotence"] = ndiff;
state.counters["code_size_two"] = codes.size();
std::unique_ptr<ScalarQuantizer::SQDistanceComputer> dc(
sq.get_distance_computer());
dc->codes = codes.data();
dc->code_size = sq.code_size;
state.counters["code_size_three"] = dc->code_size;
for (auto _ : state) {
float sum_dis = 0;
for (int i = 0; i < n; i++) {
dc->set_query(&x[i * d]);
for (int j = 0; j < n; j++) {
benchmark::DoNotOptimize(sum_dis += (*dc)(j));
}
}
}
}
// n and d could be made input arguments so the benchmark
// generalizes beyond this fixed problem size.
BENCHMARK(bench)->Iterations(20);
BENCHMARK_MAIN();

View File

@@ -0,0 +1,20 @@
# Benchmark of IVF variants
This is a benchmark of IVF index variants, looking at compression vs. speed vs. accuracy.
The results are in [this wiki chapter](https://github.com/facebookresearch/faiss/wiki/Indexing-1G-vectors)
The code is organized as:
- `datasets.py`: code to access the datafiles, compute the ground-truth and report accuracies
- `bench_all_ivf.py`: evaluate one type of inverted file
- `run_on_cluster_generic.bash`: call `bench_all_ivf.py` for all tested types of indices.
Since the number of experiments is quite large, the script is structured so that the benchmark can be run on a cluster.
- `parse_bench_all_ivf.py`: make nice tradeoff plots from all the results.
The code depends on Faiss and can use 1 to 8 GPUs to do the k-means clustering for large vocabularies.
It was run in October 2018 for the results in the wiki.
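As an illustration of the helpers in `datasets.py` (a hedged sketch using names from that file):
```
from datasets import load_dataset, sanitize

ds = load_dataset("deep1M")
xq = sanitize(ds.get_queries())   # contiguous float32 queries
gt = ds.get_groundtruth(k=100)
xb = ds.get_database()
```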

View File

@@ -0,0 +1,567 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import argparse
import os
import sys
import time
import json
import faiss
import numpy as np
try:
import datasets_fb as datasets
except ModuleNotFoundError:
import datasets_oss as datasets
sanitize = datasets.sanitize
def unwind_index_ivf(index):
if isinstance(index, faiss.IndexPreTransform):
assert index.chain.size() == 1
vt = index.chain.at(0)
index_ivf, vt2 = unwind_index_ivf(faiss.downcast_index(index.index))
assert vt2 is None
if vt is None:
vt = lambda x: x
else:
vt = faiss.downcast_VectorTransform(vt)
return index_ivf, vt
if hasattr(faiss, "IndexRefine") and isinstance(index, faiss.IndexRefine):
return unwind_index_ivf(faiss.downcast_index(index.base_index))
if isinstance(index, faiss.IndexIVF):
return index, None
else:
return None, None
def apply_AQ_options(index, args):
# if not(
# isinstance(index, faiss.IndexAdditiveQuantize) or
# isinstance(index, faiss.IndexIVFAdditiveQuantizer)):
# return
if args.RQ_train_default:
print("set default training for RQ")
index.rq.train_type  # check that the attribute exists
index.rq.train_type = faiss.ResidualQuantizer.Train_default
if args.RQ_beam_size != -1:
print("set RQ beam size to", args.RQ_beam_size)
index.rq.max_beam_size  # check that the attribute exists
index.rq.max_beam_size = args.RQ_beam_size
if args.LSQ_encode_ils_iters != -1:
print("set LSQ ils iterations to", args.LSQ_encode_ils_iters)
index.lsq.encode_ils_iters  # check that the attribute exists
index.lsq.encode_ils_iters = args.LSQ_encode_ils_iters
if args.RQ_use_beam_LUT != -1:
print("set RQ beam LUT to", args.RQ_use_beam_LUT)
index.rq.use_beam_LUT  # check that the attribute exists
index.rq.use_beam_LUT = args.RQ_use_beam_LUT
def eval_setting(index, xq, gt, k, inter, min_time):
""" evaluate searching in terms of precision vs. speed """
nq = xq.shape[0]
ivf_stats = faiss.cvar.indexIVF_stats
ivf_stats.reset()
nrun = 0
t0 = time.time()
while True:
D, I = index.search(xq, k)
nrun += 1
t1 = time.time()
if t1 - t0 > min_time:
break
ms_per_query = ((t1 - t0) * 1000.0 / nq / nrun)
res = {
"ms_per_query": ms_per_query,
"nrun": nrun
}
res["n"] = ms_per_query
if inter:
rank = k
inter_measure = faiss.eval_intersection(gt[:, :rank], I[:, :rank]) / (nq * rank)
print("%.4f" % inter_measure, end=' ')
res["inter_measure"] = inter_measure
else:
res["recalls"] = {}
for rank in 1, 10, 100:
recall = (I[:, :rank] == gt[:, :1]).sum() / float(nq)
print("%.4f" % recall, end=' ')
res["recalls"][rank] = recall
print(" %9.5f " % ms_per_query, end=' ')
print("%12d " % (ivf_stats.ndis / nrun), end=' ')
print(nrun)
res["ndis"] = ivf_stats.ndis / nrun
return res
######################################################
# Training
######################################################
def run_train(args, ds, res):
nq, d = ds.nq, ds.d
nb, d = ds.nb, ds.d
print("build index, key=", args.indexkey)
index = faiss.index_factory(
d, args.indexkey, faiss.METRIC_L2 if ds.metric == "L2" else
faiss.METRIC_INNER_PRODUCT
)
index_ivf, vec_transform = unwind_index_ivf(index)
if args.by_residual != -1:
by_residual = args.by_residual == 1
print("setting by_residual = ", by_residual)
index_ivf.by_residual # check if field exists
index_ivf.by_residual = by_residual
if index_ivf:
print("Update add-time parameters")
# adjust default parameters used at add time for quantizers
# because otherwise the assignment is inaccurate
quantizer = faiss.downcast_index(index_ivf.quantizer)
if isinstance(quantizer, faiss.IndexRefine):
print(" update quantizer k_factor=", quantizer.k_factor, end=" -> ")
quantizer.k_factor = 32 if index_ivf.nlist < 1e6 else 64
print(quantizer.k_factor)
base_index = faiss.downcast_index(quantizer.base_index)
if isinstance(base_index, faiss.IndexIVF):
print(" update quantizer nprobe=", base_index.nprobe, end=" -> ")
base_index.nprobe = (
16 if base_index.nlist < 1e5 else
32 if base_index.nlist < 4e6 else
64)
print(base_index.nprobe)
elif isinstance(quantizer, faiss.IndexHNSW):
hnsw = quantizer.hnsw
print(
f" update HNSW quantizer options, before: "
f"{hnsw.efSearch=:} {hnsw.efConstruction=:}"
)
hnsw.efSearch = 40 if index_ivf.nlist < 4e6 else 64
hnsw.efConstruction = 200
print(f" after: {hnsw.efSearch=:} {hnsw.efConstruction=:}")
apply_AQ_options(index_ivf or index, args)
if index_ivf:
index_ivf.verbose = True
index_ivf.quantizer.verbose = True
index_ivf.cp.verbose = True
else:
index.verbose = True
maxtrain = args.maxtrain
if maxtrain == 0:
if 'IMI' in args.indexkey:
maxtrain = int(256 * 2 ** (np.log2(index_ivf.nlist) / 2))
elif index_ivf:
maxtrain = 50 * index_ivf.nlist
else:
# just guess...
maxtrain = 256 * 100
maxtrain = max(maxtrain, 256 * 100)
print("setting maxtrain to %d" % maxtrain)
try:
xt2 = ds.get_train(maxtrain=maxtrain)
except NotImplementedError:
print("No training set: training on database")
xt2 = ds.get_database()[:maxtrain]
print("train, size", xt2.shape)
assert np.all(np.isfinite(xt2))
if (isinstance(vec_transform, faiss.OPQMatrix) and
isinstance(index_ivf, faiss.IndexIVFPQFastScan)):
print(" Forcing OPQ training PQ to PQ4")
ref_pq = index_ivf.pq
training_pq = faiss.ProductQuantizer(
ref_pq.d, ref_pq.M, ref_pq.nbits
)
vec_transform.pq  # check that the attribute exists
vec_transform.pq = training_pq
if args.get_centroids_from == '':
if args.clustering_niter >= 0:
print(("setting nb of clustering iterations to %d" %
args.clustering_niter))
index_ivf.cp.niter = args.clustering_niter
if args.train_on_gpu:
print("add a training index on GPU")
train_index = faiss.index_cpu_to_all_gpus(
faiss.IndexFlatL2(index_ivf.d))
index_ivf.clustering_index = train_index
else:
print("Getting centroids from", args.get_centroids_from)
src_index = faiss.read_index(args.get_centroids_from)
src_quant = faiss.downcast_index(src_index.quantizer)
centroids = src_quant.reconstruct_n()
print(" centroid table shape", centroids.shape)
if isinstance(vec_transform, faiss.VectorTransform):
print(" training vector transform")
vec_transform.train(xt2)
print(" transform centroids")
centroids = vec_transform.apply_py(centroids)
if not index_ivf.quantizer.is_trained:
print(" training quantizer")
index_ivf.quantizer.train(centroids)
print(" add centroids to quantizer")
index_ivf.quantizer.add(centroids)
del src_index
t0 = time.time()
index.train(xt2)
res.train_time = time.time() - t0
print(" train in %.3f s" % res.train_time)
return index
######################################################
# Populating index
######################################################
def run_add(args, ds, index, res):
print("adding")
t0 = time.time()
if args.add_bs == -1:
assert args.split == [1, 0], "split not supported with full batch add"
index.add(sanitize(ds.get_database()))
else:
totn = ds.nb // args.split[0] # approximate
i0 = 0
print(f"Adding in block sizes {args.add_bs} with split {args.split}")
for xblock in ds.database_iterator(bs=args.add_bs, split=args.split):
i1 = i0 + len(xblock)
print(" adding %d:%d / %d [%.3f s, RSS %d kiB] " % (
i0, i1, totn, time.time() - t0,
faiss.get_mem_usage_kb()))
index.add(xblock)
i0 = i1
res.t_add = time.time() - t0
print(f" add in {res.t_add:.3f} s index size {index.ntotal}")
######################################################
# Search
######################################################
def run_search(args, ds, index, res):
index_ivf, vec_transform = unwind_index_ivf(index)
if args.no_precomputed_tables:
if isinstance(index_ivf, faiss.IndexIVFPQ):
print("disabling precomputed table")
index_ivf.use_precomputed_table = -1
index_ivf.precomputed_table.clear()
if args.indexfile:
print("index size on disk: ", os.stat(args.indexfile).st_size)
if hasattr(index, "code_size"):
print("vector code_size", index.code_size)
if hasattr(index_ivf, "code_size"):
print("vector code_size (IVF)", index_ivf.code_size)
print("current RSS:", faiss.get_mem_usage_kb() * 1024)
precomputed_table_size = 0
if hasattr(index_ivf, 'precomputed_table'):
precomputed_table_size = index_ivf.precomputed_table.size() * 4
print("precomputed tables size:", precomputed_table_size)
# Index is ready
xq = sanitize(ds.get_queries())
nq, d = xq.shape
gt = ds.get_groundtruth(k=args.k)
if not args.accept_short_gt: # Deep1B has only a single NN per query
assert gt.shape[1] == args.k
if args.searchthreads != -1:
print("Setting nb of threads to", args.searchthreads)
faiss.omp_set_num_threads(args.searchthreads)
else:
print("nb search threads: ", faiss.omp_get_max_threads())
ps = faiss.ParameterSpace()
ps.initialize(index)
parametersets = args.searchparams
if args.inter:
header = (
'%-40s inter@%3d time(ms/q) nb distances #runs' %
("parameters", args.k)
)
else:
header = (
'%-40s R@1 R@10 R@100 time(ms/q) nb distances #runs' %
"parameters"
)
res.search_results = {}
if parametersets == ['autotune']:
ps.n_experiments = args.n_autotune
ps.min_test_duration = args.min_test_duration
for kv in args.autotune_max:
k, vmax = kv.split(':')
vmax = float(vmax)
print("limiting %s to %g" % (k, vmax))
pr = ps.add_range(k)
values = faiss.vector_to_array(pr.values)
values = np.array([v for v in values if v < vmax])
faiss.copy_array_to_vector(values, pr.values)
for kv in args.autotune_range:
k, vals = kv.split(':')
vals = np.fromstring(vals, sep=',')
print("setting %s to %s" % (k, vals))
pr = ps.add_range(k)
faiss.copy_array_to_vector(vals, pr.values)
# setup the Criterion object
if args.inter:
print("Optimize for intersection @ ", args.k)
crit = faiss.IntersectionCriterion(nq, args.k)
else:
print("Optimize for 1-recall @ 1")
crit = faiss.OneRecallAtRCriterion(nq, 1)
# by default, the criterion will request only 1 NN
crit.nnn = args.k
crit.set_groundtruth(None, gt.astype('int64'))
# then we let Faiss find the optimal parameters by itself
print("exploring operating points, %d threads" % faiss.omp_get_max_threads());
ps.display()
t0 = time.time()
op = ps.explore(index, xq, crit)
res.t_explore = time.time() - t0
print("Done in %.3f s, available OPs:" % res.t_explore)
op.display()
print("Re-running evaluation on selected OPs")
print(header)
opv = op.optimal_pts
maxw = max(max(len(opv.at(i).key) for i in range(opv.size())), 40)
for i in range(opv.size()):
opt = opv.at(i)
ps.set_index_parameters(index, opt.key)
print(opt.key.ljust(maxw), end=' ')
sys.stdout.flush()
res_i = eval_setting(index, xq, gt, args.k, args.inter, args.min_test_duration)
res.search_results[opt.key] = res_i
else:
print(header)
for param in parametersets:
print("%-40s " % param, end=' ')
sys.stdout.flush()
ps.set_index_parameters(index, param)
res_i = eval_setting(index, xq, gt, args.k, args.inter, args.min_test_duration)
res.search_results[param] = res_i
######################################################
# Driver function
######################################################
def main():
parser = argparse.ArgumentParser()
def aa(*args, **kwargs):
group.add_argument(*args, **kwargs)
group = parser.add_argument_group('general options')
aa('--nthreads', default=-1, type=int,
help='nb of threads to use at train and add time')
aa('--json', default=False, action="store_true",
help="output stats in JSON format at the end")
aa('--todo', default=["check_files"],
choices=["train", "add", "search", "check_files"],
nargs="+", help='what to do (check_files means decide depending on which index files exist)')
group = parser.add_argument_group('dataset options')
aa('--db', default='deep1M', help='dataset')
aa('--compute_gt', default=False, action='store_true',
help='compute and store the groundtruth')
aa('--force_IP', default=False, action="store_true",
help='force IP search instead of L2')
aa('--accept_short_gt', default=False, action='store_true',
help='work around a problem with Deep1B GT')
group = parser.add_argument_group('index construction')
aa('--indexkey', default='HNSW32', help='index_factory type')
aa('--trained_indexfile', default='',
help='file to read or write a trained index from')
aa('--maxtrain', default=256 * 256, type=int,
help='maximum number of training points (0 to set automatically)')
aa('--indexfile', default='', help='file to read or write index from')
aa('--split', default=[1, 0], type=int, nargs=2, help="database split")
aa('--add_bs', default=-1, type=int,
help='add elements index by batches of this size')
group = parser.add_argument_group('IVF options')
aa('--by_residual', default=-1, type=int,
help="set if index should use residuals (default=unchanged)")
aa('--no_precomputed_tables', action='store_true', default=False,
help='disable precomputed tables (uses less memory)')
aa('--get_centroids_from', default='',
help='get the centroids from this index (to speed up training)')
aa('--clustering_niter', default=-1, type=int,
help='number of clustering iterations (-1 = leave default)')
aa('--train_on_gpu', default=False, action='store_true',
help='do training on GPU')
group = parser.add_argument_group('index-specific options')
aa('--M0', default=-1, type=int, help='size of base level for HNSW')
aa('--RQ_train_default', default=False, action="store_true",
help='disable progressive dim training for RQ')
aa('--RQ_beam_size', default=-1, type=int,
help='set beam size at add time')
aa('--LSQ_encode_ils_iters', default=-1, type=int,
help='ILS iterations for LSQ')
aa('--RQ_use_beam_LUT', default=-1, type=int,
help='use beam LUT at add time')
group = parser.add_argument_group('searching')
aa('--k', default=100, type=int, help='nb of nearest neighbors')
aa('--inter', default=False, action='store_true',
help='use intersection measure instead of 1-recall as metric')
aa('--searchthreads', default=-1, type=int,
help='nb of threads to use at search time')
aa('--searchparams', nargs='+', default=['autotune'],
help="search parameters to use (can be autotune or a list of params)")
aa('--n_autotune', default=500, type=int,
help="max nb of autotune experiments")
aa('--autotune_max', default=[], nargs='*',
help='set max value for autotune variables format "var:val" (exclusive)')
aa('--autotune_range', default=[], nargs='*',
help='set complete autotune range, format "var:val1,val2,..."')
aa('--min_test_duration', default=3.0, type=float,
help='run test at least for so long to avoid jitter')
aa('--indexes_to_merge', default=[], nargs="*",
help="load these indexes to search and merge them before searching")
args = parser.parse_args()
if args.todo == ["check_files"]:
if os.path.exists(args.indexfile):
args.todo = ["search"]
elif os.path.exists(args.trained_indexfile):
args.todo = ["add", "search"]
else:
args.todo = ["train", "add", "search"]
print("setting todo to", args.todo)
print("args:", args)
os.system('echo -n "nb processors "; '
'cat /proc/cpuinfo | grep ^processor | wc -l; '
'cat /proc/cpuinfo | grep ^"model name" | tail -1')
# object to collect results
res = argparse.Namespace()
res.args = args.__dict__
res.cpu_model = [
l for l in open("/proc/cpuinfo", "r")
if "model name" in l][0]
print("Load dataset")
ds = datasets.load_dataset(
dataset=args.db, compute_gt=args.compute_gt)
if args.force_IP:
ds.metric = "IP"
print(ds)
if args.nthreads != -1:
print("Set nb of threads to", args.nthreads)
faiss.omp_set_num_threads(args.nthreads)
else:
print("nb threads: ", faiss.omp_get_max_threads())
index = None
if "train" in args.todo:
print("================== Training index")
index = run_train(args, ds, res)
if args.trained_indexfile:
print("storing trained index", args.trained_indexfile)
faiss.write_index(index, args.trained_indexfile)
if "add" in args.todo:
if not index:
assert args.trained_indexfile
print("reading trained index", args.trained_indexfile)
index = faiss.read_index(args.trained_indexfile)
print("================== Adding vectors to index")
run_add(args, ds, index, res)
if args.indexfile:
print("storing", args.indexfile)
faiss.write_index(index, args.indexfile)
if "search" in args.todo:
if not index:
if args.indexfile:
print("reading index", args.indexfile)
index = faiss.read_index(args.indexfile)
elif args.indexes_to_merge:
print(f"Merging {len(args.indexes_to_merge)} indexes")
sz = 0
for fname in args.indexes_to_merge:
print(f" reading {fname} (current size {sz})")
index_i = faiss.read_index(fname)
if index is None:
index = index_i
else:
index.merge_from(index_i, index.ntotal)
sz = index.ntotal
else:
assert False, "provide --indexfile"
print("================== Searching")
run_search(args, ds, index, res)
if args.json:
print("JSON results:", json.dumps(res.__dict__))
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,116 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import os
import numpy as np
import faiss
import argparse
import datasets
from datasets import sanitize
######################################################
# Command-line parsing
######################################################
parser = argparse.ArgumentParser()
def aa(*args, **kwargs):
group.add_argument(*args, **kwargs)
group = parser.add_argument_group('dataset options')
aa('--db', default='deep1M', help='dataset')
aa('--nt', default=65536, type=int)
aa('--nb', default=100000, type=int)
aa('--nt_sample', default=0, type=int)
group = parser.add_argument_group('kmeans options')
aa('--k', default=256, type=int)
aa('--seed', default=12345, type=int)
aa('--pcadim', default=-1, type=int, help='PCA to this dimension')
aa('--niter', default=25, type=int)
aa('--eval_freq', default=100, type=int)
args = parser.parse_args()
print("args:", args)
os.system('echo -n "nb processors "; '
'cat /proc/cpuinfo | grep ^processor | wc -l; '
'cat /proc/cpuinfo | grep ^"model name" | tail -1')
ngpu = faiss.get_num_gpus()
print("nb GPUs:", ngpu)
######################################################
# Load dataset
######################################################
xt, xb, xq, gt = datasets.load_data(dataset=args.db)
if args.nt_sample == 0:
xt_pca = xt[args.nt:args.nt + 10000]
xt = xt[:args.nt]
else:
xt_pca = xt[args.nt_sample:args.nt_sample + 10000]
rs = np.random.RandomState(args.seed)
idx = rs.choice(args.nt_sample, size=args.nt, replace=False)
xt = xt[idx]
xb = xb[:args.nb]
d = xb.shape[1]
if args.pcadim != -1:
print("training PCA: %d -> %d" % (d, args.pcadim))
pca = faiss.PCAMatrix(d, args.pcadim)
pca.train(sanitize(xt_pca))
xt = pca.apply_py(sanitize(xt))
xb = pca.apply_py(sanitize(xb))
d = xb.shape[1]
######################################################
# Run clustering
######################################################
index = faiss.IndexFlatL2(d)
if ngpu > 0:
print("moving index to GPU")
index = faiss.index_cpu_to_all_gpus(index)
clustering = faiss.Clustering(d, args.k)
clustering.verbose = True
clustering.seed = args.seed
clustering.max_points_per_centroid = 10**6
clustering.min_points_per_centroid = 1
centroids = None
for iter0 in range(0, args.niter, args.eval_freq):
iter1 = min(args.niter, iter0 + args.eval_freq)
clustering.niter = iter1 - iter0
if iter0 > 0:
faiss.copy_array_to_vector(centroids.ravel(), clustering.centroids)
clustering.train(sanitize(xt), index)
index.reset()
centroids = faiss.vector_to_array(clustering.centroids).reshape(args.k, d)
index.add(centroids)
_, I = index.search(sanitize(xb), 1)
error = ((xb - centroids[I.ravel()]) ** 2).sum()
print("iter1=%d quantization error on test: %.4f" % (iter1, error))

View File

@@ -0,0 +1,307 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import time
import sys
import os
import argparse
import numpy as np
def eval_recalls(name, I, gt, times):
k = I.shape[1]
s = "%-40s recall" % name
nq = len(gt)
for rank in 1, 10, 100, 1000:
if rank > k:
break
recall = (I[:, :rank] == gt[:, :1]).sum() / nq
s += "@%d: %.4f " % (rank, recall)
s += "time: %.4f s (± %.4f)" % (np.mean(times), np.std(times))
print(s)
def eval_inters(name, I, gt, times):
k = I.shape[1]
s = "%-40s inter" % name
nq = len(gt)
for rank in 1, 10, 100, 1000:
if rank > k:
break
ninter = 0
for i in range(nq):
ninter += np.intersect1d(I[i, :rank], gt[i, :rank]).size
inter = ninter / (nq * rank)
s += "@%d: %.4f " % (rank, inter)
s += "time: %.4f s (± %.4f)" % (np.mean(times), np.std(times))
print(s)
def main():
parser = argparse.ArgumentParser()
def aa(*args, **kwargs):
group.add_argument(*args, **kwargs)
group = parser.add_argument_group('dataset options')
aa('--db', default='deep1M', help='dataset')
aa('--measure', default="1-recall",
help="perf measure to use: 1-recall or inter")
aa('--download', default=False, action="store_true")
aa('--lib', default='faiss', help='library to use (faiss or scann)')
aa('--thenscann', default=False, action="store_true")
aa('--base_dir', default='/checkpoint/matthijs/faiss_improvements/cmp_ivf_scan_2')
group = parser.add_argument_group('searching')
aa('--k', default=10, type=int, help='nb of nearest neighbors')
aa('--pre_reorder_k', default="0,10,100,1000", help='values for reorder_k')
aa('--nprobe', default="1,2,5,10,20,50,100,200", help='values for nprobe')
aa('--nrun', default=5, type=int, help='nb of runs to perform')
args = parser.parse_args()
print("args:", args)
pre_reorder_k_tab = [int(x) for x in args.pre_reorder_k.split(',')]
nprobe_tab = [int(x) for x in args.nprobe.split(',')]
os.system('echo -n "nb processors "; '
'cat /proc/cpuinfo | grep ^processor | wc -l; '
'cat /proc/cpuinfo | grep ^"model name" | tail -1')
cache_dir = args.base_dir + "/" + args.db + "/"
k = args.k
nrun = args.nrun
if not os.path.exists(cache_dir + "xb.npy"):
# prepare cache
from datasets import load_dataset
ds = load_dataset(args.db, download=args.download)
print(ds)
# store for SCANN
os.system(f"rm -rf {cache_dir}; mkdir -p {cache_dir}")
tosave = dict(
xb = ds.get_database(),
xq = ds.get_queries(),
gt = ds.get_groundtruth()
)
for name, v in tosave.items():
fname = cache_dir + "/" + name + ".npy"
print("save", fname)
np.save(fname, v)
open(cache_dir + "metric", "w").write(ds.metric)
dataset = {}
for kn in "xb xq gt".split():
fname = cache_dir + "/" + kn + ".npy"
print("load", fname)
dataset[kn] = np.load(fname)
xb = dataset["xb"]
xq = dataset["xq"]
gt = dataset["gt"]
distance_measure = open(cache_dir + "metric").read()
if args.lib == "faiss":
import faiss
name1_to_metric = {
"IP": faiss.METRIC_INNER_PRODUCT,
"L2": faiss.METRIC_L2
}
index_fname = cache_dir + "index.faiss"
if not os.path.exists(index_fname):
index = faiss_make_index(
xb, name1_to_metric[distance_measure], index_fname)
else:
index = faiss.read_index(index_fname)
faiss_eval_search(
index, xq, xb, nprobe_tab, pre_reorder_k_tab, k, gt,
nrun, args.measure
)
if args.lib == "scann":
from scann.scann_ops.py import scann_ops_pybind
name1_to_name2 = {
"IP": "dot_product",
"L2": "squared_l2"
}
scann_dir = cache_dir + "/scann1.1.1_serialized"
if os.path.exists(scann_dir + "/scann_config.pb"):
searcher = scann_ops_pybind.load_searcher(scann_dir)
else:
searcher = scann_make_index(xb, name1_to_name2[distance_measure], scann_dir, 0)
scann_dir = cache_dir + "/scann1.1.1_serialized_reorder"
if os.path.exists(scann_dir + "/scann_config.pb"):
searcher_reo = scann_ops_pybind.load_searcher(scann_dir)
else:
searcher_reo = scann_make_index(xb, name1_to_name2[distance_measure], scann_dir, 100)
scann_eval_search(
searcher, searcher_reo,
xq, xb, nprobe_tab, pre_reorder_k_tab, k, gt,
nrun, args.measure
)
if args.lib != "scann" and args.thenscann:
# just append --lib scann, that will override the previous cmdline
# options
cmdline = " ".join(sys.argv) + " --lib scann"
cmdline = (
". ~/anaconda3/etc/profile.d/conda.sh ; " +
"conda activate scann_1.1.1; "
"python -u " + cmdline)
print("running", cmdline)
os.system(cmdline)
###############################################################
# SCANN
###############################################################
def scann_make_index(xb, distance_measure, scann_dir, reorder_k):
import scann
print("build index")
if distance_measure == "dot_product":
thr = 0.2
else:
thr = 0
k = 10
sb = scann.scann_ops_pybind.builder(xb, k, distance_measure)
sb = sb.tree(num_leaves=2000, num_leaves_to_search=100, training_sample_size=250000)
sb = sb.score_ah(2, anisotropic_quantization_threshold=thr)
if reorder_k > 0:
sb = sb.reorder(reorder_k)
searcher = sb.build()
print("done")
print("write index to", scann_dir)
os.system(f"rm -rf {scann_dir}; mkdir -p {scann_dir}")
# os.mkdir(scann_dir)
searcher.serialize(scann_dir)
return searcher
def scann_eval_search(
searcher, searcher_reo,
xq, xb, nprobe_tab, pre_reorder_k_tab, k, gt,
nrun, measure):
# warmup
for _run in range(5):
searcher.search_batched(xq)
for nprobe in nprobe_tab:
for pre_reorder_k in pre_reorder_k_tab:
times = []
for _run in range(nrun):
if pre_reorder_k == 0:
t0 = time.time()
I, D = searcher.search_batched(
xq, leaves_to_search=nprobe, final_num_neighbors=k
)
t1 = time.time()
else:
t0 = time.time()
I, D = searcher_reo.search_batched(
xq, leaves_to_search=nprobe, final_num_neighbors=k,
pre_reorder_num_neighbors=pre_reorder_k
)
t1 = time.time()
times.append(t1 - t0)
header = "SCANN nprobe=%4d reo=%4d" % (nprobe, pre_reorder_k)
if measure == "1-recall":
eval_recalls(header, I, gt, times)
else:
eval_inters(header, I, gt, times)
###############################################################
# Faiss
###############################################################
def faiss_make_index(xb, metric_type, fname):
import faiss
d = xb.shape[1]
M = d // 2
index = faiss.index_factory(d, f"IVF2000,PQ{M}x4fs", metric_type)
# if not by_residual:
# print("setting no residual")
# index.by_residual = False
print("train")
index.train(xb[:250000])
print("add")
index.add(xb)
print("write index", fname)
faiss.write_index(index, fname)
return index
def faiss_eval_search(
index, xq, xb, nprobe_tab, pre_reorder_k_tab,
k, gt, nrun, measure
):
import faiss
print("use precomputed table=", index.use_precomputed_table,
"by residual=", index.by_residual)
print("adding a refine index")
index_refine = faiss.IndexRefineFlat(index, faiss.swig_ptr(xb))
print("set single thread")
faiss.omp_set_num_threads(1)
print("warmup")
for _run in range(5):
index.search(xq, k)
print("run timing")
for nprobe in nprobe_tab:
for pre_reorder_k in pre_reorder_k_tab:
index.nprobe = nprobe
times = []
for _run in range(nrun):
if pre_reorder_k == 0:
t0 = time.time()
D, I = index.search(xq, k)
t1 = time.time()
else:
index_refine.k_factor = pre_reorder_k / k
t0 = time.time()
D, I = index_refine.search(xq, k)
t1 = time.time()
times.append(t1 - t0)
header = "Faiss nprobe=%4d reo=%4d" % (nprobe, pre_reorder_k)
if measure == "1-recall":
eval_recalls(header, I, gt, times)
else:
eval_inters(header, I, gt, times)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,136 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
"""
Common functions to load datasets and compute their ground-truth
"""
import time
import numpy as np
import faiss
from faiss.contrib import datasets as faiss_datasets
print("path:", faiss_datasets.__file__)
faiss_datasets.dataset_basedir = '/checkpoint/matthijs/simsearch/'
def sanitize(x):
return np.ascontiguousarray(x, dtype='float32')
#################################################################
# Dataset
#################################################################
class DatasetCentroids(faiss_datasets.Dataset):
def __init__(self, ds, indexfile):
self.d = ds.d
self.metric = ds.metric
self.nq = ds.nq
self.xq = ds.get_queries()
# get the xb set
src_index = faiss.read_index(indexfile)
src_quant = faiss.downcast_index(src_index.quantizer)
centroids = faiss.vector_to_array(src_quant.xb)
self.xb = centroids.reshape(-1, self.d)
self.nb = self.nt = len(self.xb)
def get_queries(self):
return self.xq
def get_database(self):
return self.xb
def get_train(self, maxtrain=None):
return self.xb
def get_groundtruth(self, k=100):
return faiss.knn(
self.xq, self.xb, k,
faiss.METRIC_L2 if self.metric == 'L2' else faiss.METRIC_INNER_PRODUCT
)[1]
def load_dataset(dataset='deep1M', compute_gt=False, download=False):
print("load data", dataset)
if dataset == 'sift1M':
return faiss_datasets.DatasetSIFT1M()
elif dataset.startswith('bigann'):
dbsize = 1000 if dataset == "bigann1B" else int(dataset[6:-1])
return faiss_datasets.DatasetBigANN(nb_M=dbsize)
elif dataset.startswith("deep_centroids_"):
ncent = int(dataset[len("deep_centroids_"):])
centdir = "/checkpoint/matthijs/bench_all_ivf/precomputed_clusters"
return DatasetCentroids(
faiss_datasets.DatasetDeep1B(nb=1000000),
f"{centdir}/clustering.dbdeep1M.IVF{ncent}.faissindex"
)
elif dataset.startswith("deep"):
szsuf = dataset[4:]
if szsuf[-1] == 'M':
dbsize = 10 ** 6 * int(szsuf[:-1])
elif szsuf == '1B':
dbsize = 10 ** 9
elif szsuf[-1] == 'k':
dbsize = 1000 * int(szsuf[:-1])
else:
assert False, "did not recognize suffix " + szsuf
return faiss_datasets.DatasetDeep1B(nb=dbsize)
elif dataset == "music-100":
return faiss_datasets.DatasetMusic100()
elif dataset == "glove":
return faiss_datasets.DatasetGlove(download=download)
else:
assert False
#################################################################
# Evaluation
#################################################################
def evaluate_DI(D, I, gt):
nq = gt.shape[0]
k = I.shape[1]
rank = 1
while rank <= k:
recall = (I[:, :rank] == gt[:, :1]).sum() / float(nq)
print("R@%d: %.4f" % (rank, recall), end=' ')
rank *= 10
def evaluate(xq, gt, index, k=100, endl=True):
t0 = time.time()
D, I = index.search(xq, k)
t1 = time.time()
nq = xq.shape[0]
print("\t %8.4f ms per query, " % (
(t1 - t0) * 1000.0 / nq), end=' ')
rank = 1
while rank <= k:
recall = (I[:, :rank] == gt[:, :1]).sum() / float(nq)
print("R@%d: %.4f" % (rank, recall), end=' ')
rank *= 10
if endl:
print()
return D, I

View File

@@ -0,0 +1,27 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import logging
# https://stackoverflow.com/questions/7016056/python-logging-not-outputting-anything
logging.basicConfig()
logger = logging.getLogger('faiss.contrib.exhaustive_search')
logger.setLevel(logging.INFO)
from faiss.contrib import datasets
from faiss.contrib.exhaustive_search import knn_ground_truth
from faiss.contrib import vecs_io
ds = datasets.DatasetDeep1B(nb=int(1e9))
print("computing GT matches for", ds)
D, I = knn_ground_truth(
ds.get_queries(),
ds.database_iterator(bs=65536),
k=100
)
vecs_io.ivecs_write("/tmp/tt.ivecs", I)

View File

@@ -0,0 +1,502 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import os
import numpy as np
from collections import defaultdict
from matplotlib import pyplot
import re
from argparse import Namespace
from faiss.contrib.factory_tools import get_code_size as unitsize
def dbsize_from_name(dbname):
sufs = {
'1B': 10**9,
'100M': 10**8,
'10M': 10**7,
'1M': 10**6,
}
for s in sufs:
if dbname.endswith(s):
return sufs[s]
else:
assert False
def keep_latest_stdout(fnames):
fnames = [fname for fname in fnames if fname.endswith('.stdout')]
fnames.sort()
n = len(fnames)
fnames2 = []
for i, fname in enumerate(fnames):
if i + 1 < n and fnames[i + 1][:-8] == fname[:-8]:
continue
fnames2.append(fname)
return fnames2
def parse_result_file(fname):
# print fname
st = 0
res = []
keys = []
stats = {}
stats['run_version'] = fname[-8]
indexkey = None
for l in open(fname):
if l.startswith("srun:"):
# looks like a crash...
if indexkey is None:
raise RuntimeError("instant crash")
break
elif st == 0:
if l.startswith("dataset in dimension"):
fi = l.split()
stats["d"] = int(fi[3][:-1])
stats["nq"] = int(fi[9])
stats["nb"] = int(fi[11])
stats["nt"] = int(fi[13])
if l.startswith('index size on disk:'):
stats['index_size'] = int(l.split()[-1])
if l.startswith('current RSS:'):
stats['RSS'] = int(l.split()[-1])
if l.startswith('precomputed tables size:'):
stats['tables_size'] = int(l.split()[-1])
if l.startswith('Setting nb of threads to'):
stats['n_threads'] = int(l.split()[-1])
if l.startswith(' add in'):
stats['add_time'] = float(l.split()[-2])
if l.startswith("vector code_size"):
stats['code_size'] = float(l.split()[-1])
if l.startswith('args:'):
args = eval(l[l.find(' '):])
indexkey = args.indexkey
elif "time(ms/q)" in l:
# result header
if 'R@1 R@10 R@100' in l:
stats["measure"] = "recall"
stats["ranks"] = [1, 10, 100]
elif 'I@1 I@10 I@100' in l:
stats["measure"] = "inter"
stats["ranks"] = [1, 10, 100]
elif 'inter@' in l:
stats["measure"] = "inter"
fi = l.split()
if fi[1] == "inter@":
rank = int(fi[2])
else:
rank = int(fi[1][len("inter@"):])
stats["ranks"] = [rank]
else:
assert False
st = 1
elif 'index size on disk:' in l:
stats["index_size"] = int(l.split()[-1])
elif st == 1:
st = 2
elif st == 2:
fi = l.split()
if l[0] == " ":
# means there are 0 parameters
fi = [""] + fi
keys.append(fi[0])
res.append([float(x) for x in fi[1:]])
return indexkey, np.array(res), keys, stats
# the directory used in run_on_cluster.bash
basedir = "/checkpoint/matthijs/bench_all_ivf/"
logdir = basedir + 'logs/'
def collect_results_for(db='deep1M', prefix="autotune."):
# run parsing
allres = {}
allstats = {}
missing = []
fnames = keep_latest_stdout(os.listdir(logdir))
# print fnames
# filenames are in the form <key>.x.stdout
# where x is a version number (from a to z)
# keep only latest version of each name
for fname in fnames:
if not (
'db' + db in fname and
fname.startswith(prefix) and
fname.endswith('.stdout')
):
continue
print("parse", fname, end=" ", flush=True)
try:
indexkey, res, _, stats = parse_result_file(logdir + fname)
except RuntimeError as e:
print("FAIL %s" % e)
res = np.zeros((2, 0))
except Exception as e:
print("PARSE ERROR " + e)
res = np.zeros((2, 0))
else:
print(len(res), "results")
if res.size == 0:
missing.append(fname)
else:
if indexkey in allres:
if allstats[indexkey]['run_version'] > stats['run_version']:
# don't use this run
continue
allres[indexkey] = res
allstats[indexkey] = stats
return allres, allstats
def extract_pareto_optimal(allres, keys, recall_idx=0, times_idx=3):
bigtab = []
for i, k in enumerate(keys):
v = allres[k]
perf = v[:, recall_idx]
times = v[:, times_idx]
bigtab.append(
np.vstack((
np.ones(times.size) * i,
perf, times
))
)
if bigtab == []:
return [], np.zeros((3, 0))
bigtab = np.hstack(bigtab)
# sort by perf
perm = np.argsort(bigtab[1, :])
bigtab_sorted = bigtab[:, perm]
best_times = np.minimum.accumulate(bigtab_sorted[2, ::-1])[::-1]
selection, = np.where(bigtab_sorted[2, :] == best_times)
selected_keys = [
keys[i] for i in
np.unique(bigtab_sorted[0, selection].astype(int))
]
ops = bigtab_sorted[:, selection]
return selected_keys, ops
def plot_subset(
allres, allstats, selected_methods, recall_idx, times_idx=3,
report=["overhead", "build time"]):
# important methods
for k in selected_methods:
v = allres[k]
stats = allstats[k]
d = stats["d"]
dbsize = stats["nb"]
if "index_size" in stats and "tables_size" in stats:
tot_size = stats['index_size'] + stats['tables_size']
else:
tot_size = -1
id_size = 8 # 64 bit
addt = ''
if 'add_time' in stats:
add_time = stats['add_time']
if add_time > 7200:
add_min = add_time / 60
addt = ', %dh%02d' % (add_min / 60, add_min % 60)
else:
add_sec = int(add_time)
addt = ', %dm%02d' % (add_sec / 60, add_sec % 60)
code_size = unitsize(d, k)
label = k
if "code_size" in report:
label += " %d bytes" % code_size
tight_size = (code_size + id_size) * dbsize
if tot_size < 0 or "overhead" not in report:
pass # don't know what the index size is
elif tot_size > 10 * tight_size:
label += " overhead x%.1f" % (tot_size / tight_size)
else:
label += " overhead+%.1f%%" % (
tot_size / tight_size * 100 - 100)
if "build time" in report:
label += " " + addt
linestyle = (':' if 'Refine' in k or 'RFlat' in k else
'-.' if 'SQ' in k else
'-' if '4fs' in k else
'-')
print(k, linestyle)
pyplot.semilogy(v[:, recall_idx], 1000 / v[:, times_idx], label=label,
linestyle=linestyle,
marker='o' if '4fs' in k else '+')
recall_rank = stats["ranks"][recall_idx]
if stats["measure"] == "recall":
pyplot.xlabel('1-recall at %d' % recall_rank)
elif stats["measure"] == "inter":
pyplot.xlabel('inter @ %d' % recall_rank)
else:
assert False
pyplot.ylabel('QPS (%d threads)' % stats["n_threads"])
def plot_tradeoffs(db, allres, allstats, code_size, recall_rank):
stat0 = next(iter(allstats.values()))
d = stat0["d"]
n_threads = stat0["n_threads"]
recall_idx = stat0["ranks"].index(recall_rank)
# times come after the perf measure
times_idx = len(stat0["ranks"])
if type(code_size) == int:
if code_size == 0:
code_size = [0, 1e50]
code_size_name = "any code size"
else:
code_size_name = "code_size=%d" % code_size
code_size = [code_size, code_size]
elif type(code_size) == tuple:
code_size_name = "code_size in [%d, %d]" % code_size
else:
assert False
names_maxperf = []
for k in sorted(allres):
v = allres[k]
if v.ndim != 2: continue
us = unitsize(d, k)
if not code_size[0] <= us <= code_size[1]: continue
names_maxperf.append((v[-1, recall_idx], k))
# sort from lowest to highest topline accuracy
names_maxperf.sort()
names = [name for mp, name in names_maxperf]
selected_methods, optimal_points = \
extract_pareto_optimal(allres, names, recall_idx, times_idx)
not_selected = list(set(names) - set(selected_methods))
print("methods without an optimal OP: ", not_selected)
pyplot.title('database ' + db + ' ' + code_size_name)
# grayed out lines
for k in not_selected:
v = allres[k]
if v.ndim != 2: continue
us = unitsize(d, k)
if not code_size[0] <= us <= code_size[1]: continue
linestyle = (':' if 'PQ' in k else
'-.' if 'SQ4' in k else
'--' if 'SQ8' in k else '-')
pyplot.semilogy(v[:, recall_idx], 1000 / v[:, times_idx], label=None,
linestyle=linestyle,
marker='o' if 'HNSW' in k else '+',
color='#cccccc', linewidth=0.2)
plot_subset(allres, allstats, selected_methods, recall_idx, times_idx)
if len(not_selected) == 0:
om = ''
else:
om = '\nomitted:'
nc = len(om)
for m in not_selected:
if nc > 80:
om += '\n'
nc = 0
om += ' ' + m
nc += len(m) + 1
# pyplot.semilogy(optimal_points[1, :], optimal_points[2, :], marker="s")
# print(optimal_points[0, :])
pyplot.xlabel('1-recall at %d %s' % (recall_rank, om) )
pyplot.ylabel('QPS (%d threads)' % n_threads)
pyplot.legend()
pyplot.grid()
return selected_methods, not_selected
if __name__ == "__main__xx":
# tests on centroids indexing (v1)
for k in 1, 32, 128:
pyplot.gcf().set_size_inches(15, 10)
i = 1
for ncent in 65536, 262144, 1048576, 4194304:
db = f'deep_centroids_{ncent}.k{k}.'
allres, allstats = collect_results_for(
db=db, prefix="cent_index.")
pyplot.subplot(2, 2, i)
plot_subset(
allres, allstats, list(allres.keys()),
recall_idx=0,
times_idx=1,
report=["code_size"]
)
i += 1
pyplot.title(f"{ncent} centroids")
pyplot.legend()
pyplot.xlim([0.95, 1])
pyplot.grid()
pyplot.savefig('figs/deep1B_centroids_k%d.png' % k)
if __name__ == "__main__xx":
# centroids plot per k
pyplot.gcf().set_size_inches(15, 10)
i=1
for ncent in 65536, 262144, 1048576, 4194304:
xyd = defaultdict(list)
for k in 1, 4, 8, 16, 32, 64, 128, 256:
db = f'deep_centroids_{ncent}.k{k}.'
allres, allstats = collect_results_for(db=db, prefix="cent_index.")
for indexkey, res in allres.items():
idx, = np.where(res[:, 0] >= 0.99)
if idx.size > 0:
xyd[indexkey].append((k, 1000 / res[idx[0], 1]))
pyplot.subplot(2, 2, i)
i += 1
for indexkey, xy in xyd.items():
xy = np.array(xy)
pyplot.loglog(xy[:, 0], xy[:, 1], 'o-', label=indexkey)
pyplot.title(f"{ncent} centroids")
pyplot.xlabel("k")
xt = 2**np.arange(9)
pyplot.xticks(xt, ["%d" % x for x in xt])
pyplot.ylabel("QPS (32 threads)")
pyplot.legend()
pyplot.grid()
pyplot.savefig('../plots/deep1B_centroids_min99.png')
if __name__ == "__main__xx":
# main indexing plots
i = 0
for db in 'bigann10M', 'deep10M', 'bigann100M', 'deep100M', 'deep1B', 'bigann1B':
allres, allstats = collect_results_for(
db=db, prefix="autotune.")
for cs in 8, 16, 32, 64:
pyplot.figure(i)
i += 1
pyplot.gcf().set_size_inches(15, 10)
cs_range = (
(0, 8) if cs == 8 else (cs // 2 + 1, cs)
)
plot_tradeoffs(
db, allres, allstats, code_size=cs_range, recall_rank=1)
pyplot.savefig('../plots/tradeoffs_%s_cs%d_r1.png' % (
db, cs))
if __name__ == "__main__":
# 1M indexes
i = 0
for db in "glove", "music-100":
pyplot.figure(i)
pyplot.gcf().set_size_inches(15, 10)
i += 1
allres, allstats = collect_results_for(db=db, prefix="autotune.")
plot_tradeoffs(db, allres, allstats, code_size=0, recall_rank=1)
pyplot.savefig('../plots/1M_tradeoffs_' + db + ".png")
for db in "sift1M", "deep1M":
allres, allstats = collect_results_for(db=db, prefix="autotune.")
pyplot.figure(i)
pyplot.gcf().set_size_inches(15, 10)
i += 1
plot_tradeoffs(db, allres, allstats, code_size=(0, 64), recall_rank=1)
pyplot.savefig('../plots/1M_tradeoffs_' + db + "_small.png")
pyplot.figure(i)
pyplot.gcf().set_size_inches(15, 10)
i += 1
plot_tradeoffs(db, allres, allstats, code_size=(65, 10000), recall_rank=1)
pyplot.savefig('../plots/1M_tradeoffs_' + db + "_large.png")
if __name__ == "__main__xx":
db = 'sift1M'
allres, allstats = collect_results_for(db=db, prefix="autotune.")
pyplot.gcf().set_size_inches(15, 10)
keys = [
"IVF1024,PQ32x8",
"IVF1024,PQ64x4",
"IVF1024,PQ64x4fs",
"IVF1024,PQ64x4fsr",
"IVF1024,SQ4",
"IVF1024,SQ8"
]
plot_subset(allres, allstats, keys, recall_idx=0, report=["code_size"])
pyplot.legend()
pyplot.title(db)
pyplot.xlabel("1-recall@1")
pyplot.ylabel("QPS (32 threads)")
pyplot.grid()
pyplot.savefig('../plots/ivf1024_variants.png')
pyplot.figure(2)
pyplot.gcf().set_size_inches(15, 10)
keys = [
"HNSW32",
"IVF1024,PQ64x4fs",
"IVF1024,PQ64x4fsr",
"IVF1024,PQ64x4fs,RFlat",
"IVF1024,PQ64x4fs,Refine(SQfp16)",
"IVF1024,PQ64x4fs,Refine(SQ8)",
]
plot_subset(allres, allstats, keys, recall_idx=0, report=["code_size"])
pyplot.legend()
pyplot.title(db)
pyplot.xlabel("1-recall@1")
pyplot.ylabel("QPS (32 threads)")
pyplot.grid()
pyplot.savefig('../plots/ivf1024_rerank.png')

View File

@@ -0,0 +1,603 @@
set -e
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
# @nolint
# This script launches the experiments on a cluster
# It assumes two shell functions are defined:
#
# run_on_1machine: runs a command on one (full) machine on a cluster
#
# run_on_8gpu: runs a command on one machine with 8 GPUs
#
# the two functions are called as:
#
# run_on_1machine <name> <command>
#
# the stdout of the command should be stored in $logdir/<name>.stdout
function run_on ()
{
sys="$1"
shift
name="$1"
shift
script="$logdir/$name.sh"
if [ -e "$script" ]; then
echo script "$script" exists
return
fi
# srun handles special characters fine, but the shell interpreter
# does not
escaped_cmd=$( printf "%q " "$@" )
cat > $script <<EOF
#! /bin/bash
srun $escaped_cmd
EOF
echo -n "$logdir/$name.stdout "
sbatch -n1 -J "$name" \
$sys \
--comment='priority is the only one that works' \
--output="$logdir/$name.stdout" \
"$script"
}
function run_on_1machine {
run_on "--cpus-per-task=80 --gres=gpu:0 --mem=500G --time=70:00:00 --partition=priority" "$@"
}
function run_on_1machine_1h {
run_on "--cpus-per-task=80 --gres=gpu:2 --mem=100G --time=1:00:00 --partition=priority" "$@"
}
function run_on_1machine_3h {
run_on "--cpus-per-task=80 --gres=gpu:2 --mem=100G --time=3:00:00 --partition=priority" "$@"
}
function run_on_4gpu_3h {
run_on "--cpus-per-task=40 --gres=gpu:4 --mem=100G --time=3:00:00 --partition=priority" "$@"
}
function run_on_8gpu () {
run_on "--cpus-per-task=80 --gres=gpu:8 --mem=100G --time=70:00:00 --partition=priority" "$@"
}
# prepare output directories
# set to some directory where all indexes, can be written.
basedir=/checkpoint/matthijs/bench_all_ivf
logdir=$basedir/logs
indexdir=$basedir/indexes
centdir=$basedir/precomputed_clusters
mkdir -p $logdir $indexdir
# adds an option to use a pretrained quantizer
function add_precomputed_quantizer () {
local db="$1"
local coarse="$2"
case $db in
bigann*) rname=bigann ;;
deep*) rname=deep ;;
sift1M) return;;
music-100) return ;;
glove) return ;;
*) echo "bad db"; exit 1;;
esac
case $coarse in
IVF65536*)
cname=clustering.db${rname}1M.IVF65536.faissindex
copt="--get_centroids_from $centdir/$cname"
;;
IVF262144*)
cname=clustering.db${rname}1M.IVF262144.faissindex
copt="--get_centroids_from $centdir/$cname"
;;
IVF1048576*)
cname=clustering.db${rname}1M.IVF1048576.faissindex
copt="--get_centroids_from $centdir/$cname"
;;
IVF4194304*)
cname=clustering.db${rname}1M.IVF4194304.faissindex
copt="--get_centroids_from $centdir/$cname"
;;
*)
copt="" ;;
esac
echo $copt
}
function get_db_dim () {
local db="$1"
case $db in
sift1M) dim=128;;
bigann*) dim=128;;
deep*) dim=96;;
music-100) dim=100;;
glove) dim=100;;
*) echo "bad db"; exit 1;;
esac
echo $dim
}
# replace the PQHD placeholder (HD = "half dim") with the PQ size matching each
# coarse quantizer variant; note that bash variables are global by default, so
# the coarse* variables set here are visible to the caller
function replace_coarse_PQHD () {
local coarse="$1"
local dim=$2
coarseD=${coarse//PQHD/PQ$((dim/2))}
coarse16=${coarse//PQHD/PQ8}
coarse32=${coarse//PQHD/PQ16}
coarse64=${coarse//PQHD/PQ32}
coarse128=${coarse//PQHD/PQ64}
coarse256=${coarse//PQHD/PQ128}
coarse112=${coarse//PQHD/PQ56}
}
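# Worked example (illustrative): with coarse="IVF262144(IVF512,PQHDx4fs,RFlat)"
# and dim=96, replace_coarse_PQHD sets, among others,
#   coarseD  -> IVF262144(IVF512,PQ48x4fs,RFlat)
#   coarse64 -> IVF262144(IVF512,PQ32x4fs,RFlat)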
if false; then
###############################################
# comparison with SCANN
for db in sift1M deep1M glove music-100
do
opt=""
if [ $db == glove ]; then
opt="--measure inter"
fi
run_on_1machine_1h cmp_with_scann.$db.c \
python -u cmp_with_scann.py --db $db \
--lib faiss $opt --thenscann
done
############################### Preliminary SIFT1M experiment
for db in sift1M ; do
for coarse in IVF1024
do
indexkeys="
HNSW32
$coarse,SQfp16
$coarse,SQ4
$coarse,SQ8
$coarse,PQ32x8
$coarse,PQ64x4
$coarse,PQ64x4fs
$coarse,PQ64x4fs,RFlat
$coarse,PQ64x4fs,Refine(SQfp16)
$coarse,PQ64x4fs,Refine(SQ8)
OPQ64,$coarse,PQ64x4fs
OPQ64,$coarse,PQ64x4fs,RFlat
"
indexkeys="
$coarse,PQ64x4fsr
$coarse,PQ64x4fsr,RFlat
"
# OPQ actually degrades the results on SIFT1M, so let's ignore it
for indexkey in $indexkeys
do
# escape nasty characters
key="autotune.db$db.${indexkey//,/_}"
key="${key//(/_}"
key="${key//)/_}"
run_on_1machine_1h $key.a \
python -u bench_all_ivf.py \
--db $db \
--indexkey "$indexkey" \
--maxtrain 0 \
--indexfile $indexdir/$key.faissindex \
--searchthreads 32
done
done
done
############################### 1M experiments
fi
# for db in sift1M deep1M music-100 glove; do
for db in glove music-100; do
dim=$( get_db_dim $db )
for coarse in IVF1024 IVF4096_HNSW32
do
replace_coarse_PQHD "$coarse" $dim
indexkeys="
$coarseD,PQ$((dim/2))x4fs
$coarseD,PQ$((dim/2))x4fsr
OPQ8_64,$coarse64,PQ8
PCAR16,$coarse16,SQ4
OPQ16_64,$coarse64,PQ16x4fs
OPQ16_64,$coarse64,PQ16x4fsr
OPQ16_64,$coarse64,PQ16
PCAR16,$coarse16,SQ8
PCAR32,$coarse32,SQ4
OPQ32_64,$coarse64,PQ32x4fs
OPQ32_64,$coarse64,PQ32x4fsr
OPQ32_128,$coarse128,PQ32
PCAR32,$coarse32,SQ8
PCAR64,$coarse64,SQ4
PCAR16,$coarse16,SQfp16
OPQ64_128,$coarse128,PQ64x4fs
OPQ64_128,$coarse128,PQ64x4fsr
OPQ64_128,$coarse128,PQ64
PCAR64,$coarse64,SQ8
PCAR32,$coarse32,SQfp16
PCAR128,$coarse128,SQ4
OPQ128_256,$coarse256,PQ128x4fs
OPQ128_256,$coarse256,PQ128x4fsr
OPQ16_64,$coarse64,PQ16x4fs,Refine(OPQ56_112,PQ56)
OPQ16_64,$coarse64,PQ16x4fs,Refine(PCAR72,SQ6)
OPQ32_64,$coarse64,PQ16x4fs,Refine(PCAR64,SQ6)
OPQ32_64,$coarse64,PQ32x4fs,Refine(OPQ48_96,PQ48)
OPQ64_128,$coarse,PQ64x12
OPQ64_128,$coarse,PQ64x4fs,RFlat
OPQ64_128,$coarse,PQ64x4fs,Refine(SQfp16)
OPQ64_128,$coarse,PQ64x4fs,Refine(SQ8)
OPQ64_128,$coarse,PQ64x4fs,Refine(SQ6)
OPQ64_128,$coarse,PQ64x4fs,Refine(SQ4)
OPQ32_64,$coarse,PQ32x4fs,Refine(SQfp16)
OPQ32_64,$coarse,PQ32x4fs,Refine(SQ8)
OPQ32_64,$coarse,PQ32x4fs,Refine(SQ6)
OPQ32_64,$coarse,PQ32x4fs,Refine(SQ4)
"
indexkeys="
$coarseD,PQ$((dim/2))x4fs
$coarseD,PQ$((dim/2))x4fsr
$coarseD,PQ$((dim/2))x4fsr,RFlat
$coarseD,PQ$((dim/2))x4fsr,Refine(SQfp16)
$coarseD,PQ$((dim/2))x4fsr,Refine(SQ8)
$coarseD,PQ$((dim/4))x4fs
$coarseD,PQ$((dim/4))x4fsr
$coarseD,PQ$((dim/4))x4fsr,RFlat
$coarseD,PQ$((dim/4))x4fsr,Refine(SQfp16)
$coarseD,PQ$((dim/4))x4fsr,Refine(SQ8)
$coarseD,PQ$((dim/2))
$coarseD,PQ$((dim/4))
HNSW32,Flat
"
indexkeys="HNSW32,Flat"
for indexkey in $indexkeys
do
key=autotune.db$db.${indexkey//,/_}
key="${key//(/_}"
key="${key//)/_}"
run_on_1machine_3h $key.q \
python -u bench_all_ivf.py \
--db $db \
--indexkey "$indexkey" \
--maxtrain 0 \
--indexfile "$indexdir/$key.faissindex" \
$( add_precomputed_quantizer $db $coarse ) \
--searchthreads 32 \
--min_test_duration 3
done
done
done
if false; then
############################################
# precompute centroids on GPU for large vocabularies
for db in deep1M bigann1M; do
for ncent in 262144 65536 1048576 4194304; do
key=clustering.db$db.IVF$ncent
run_on_4gpu_3h $key.e \
python -u bench_all_ivf.py \
--db $db \
--indexkey IVF$ncent,SQ8 \
--maxtrain 100000000 \
--indexfile $centdir/$key.faissindex \
--searchthreads 32 \
--min_test_duration 3 \
--add_bs 1000000 \
--train_on_gpu
done
done
###############################
## coarse quantizer experiments on the centroids of deep1B
for k in 4 8 16 64 256; do
for ncent in 65536 262144 1048576 4194304; do
db=deep_centroids_$ncent
# compute the square root of ncent: increase ls until 4^ls >= ncent,
# so sncent = 2^ls is sqrt(ncent) rounded up to a power of two
for(( ls=0; ncent > (1 << (2 * ls)); ls++)); do
echo -n
done
sncent=$(( 1 << ls ))
indexkeys="
IVF$((sncent/2)),PQ48x4fs,RFlat
IVF$((sncent*2)),PQ48x4fs,RFlat
HNSW32
PQ48x4fs
PQ48x4fs,RFlat
IVF$sncent,PQ48x4fs,RFlat
"
for indexkey in $indexkeys; do
key="cent_index.db$db.k$k.$indexkey"
run_on_1machine_1h "$key.b" \
python -u bench_all_ivf.py \
--db $db \
--indexkey "$indexkey" \
--maxtrain 0 \
--inter \
--searchthreads 32 \
--k $k
done
done
done
############################### 10M experiments
for db in deep10M bigann10M; do
coarses="
IVF65536(IVF256,PQHDx4fs,RFlat)
IVF16384_HNSW32
IVF65536_HNSW32
IVF262144_HNSW32
IVF262144(IVF512,PQHDx4fs,RFlat)
"
dim=$( get_db_dim $db )
for coarse in $coarses
do
replace_coarse_PQHD "$coarse" $dim
indexkeys="
$coarseD,PQ$((dim/2))x4fs
OPQ8_64,$coarse64,PQ8
PCAR16,$coarse16,SQ4
OPQ16_64,$coarse64,PQ16x4fs
OPQ16_64,$coarse64,PQ16x4fsr
OPQ16_64,$coarse64,PQ16
PCAR16,$coarse16,SQ8
PCAR32,$coarse32,SQ4
OPQ32_64,$coarse64,PQ32x4fs
OPQ32_64,$coarse64,PQ32x4fsr
OPQ32_128,$coarse128,PQ32
PCAR32,$coarse32,SQ8
PCAR64,$coarse64,SQ4
PCAR16,$coarse16,SQfp16
OPQ64_128,$coarse128,PQ64x4fs
OPQ64_128,$coarse128,PQ64x4fsr
OPQ64_128,$coarse128,PQ64
PCAR64,$coarse64,SQ8
PCAR32,$coarse32,SQfp16
PCAR128,$coarse128,SQ4
OPQ128_256,$coarse256,PQ128x4fs
OPQ128_256,$coarse256,PQ128x4fsr
OPQ56_112,$coarse112,PQ7+56
OPQ16_64,$coarse64,PQ16x4fs,Refine(OPQ56_112,PQ56)
OPQ16_64,$coarse64,PQ16x4fs,Refine(PCAR72,SQ6)
OPQ32_64,$coarse64,PQ16x4fs,Refine(PCAR64,SQ6)
OPQ32_64,$coarse64,PQ32x4fs,Refine(OPQ48_96,PQ48)
"
indexkeys="
OPQ16_64,$coarse64,PQ16x4fsr
OPQ32_64,$coarse64,PQ32x4fsr
OPQ64_128,$coarse128,PQ64x4fsr
OPQ128_256,$coarse256,PQ128x4fsr
"
for indexkey in $indexkeys
do
key=autotune.db$db.${indexkey//,/_}
key="${key//(/_}"
key="${key//)/_}"
run_on_1machine_3h $key.l \
python -u bench_all_ivf.py \
--db $db \
--indexkey "$indexkey" \
--maxtrain 0 \
--indexfile "$indexdir/$key.faissindex" \
$( add_precomputed_quantizer $db $coarse ) \
--searchthreads 32 \
--min_test_duration 3 \
--autotune_max nprobe:2000
done
done
done
############################### 100M experiments
for db in deep100M bigann100M; do
coarses="
IVF65536_HNSW32
IVF262144_HNSW32
IVF262144(IVF512,PQHDx4fs,RFlat)
IVF1048576_HNSW32
IVF1048576(IVF1024,PQHDx4fs,RFlat)
"
dim=$( get_db_dim $db )
for coarse in $coarses
do
replace_coarse_PQHD "$coarse" $dim
indexkeys="
OPQ8_64,$coarse64,PQ8
OPQ16_64,$coarse64,PQ16x4fs
PCAR32,$coarse32,SQ4
OPQ16_64,$coarse64,PQ16
OPQ32_64,$coarse64,PQ32x4fs
OPQ32_128,$coarse128,PQ32
PCAR64,$coarse64,SQ4
PCAR32,$coarse32,SQ8
OPQ64_128,$coarse128,PQ64x4fs
PCAR128,$coarse128,SQ4
OPQ64_128,$coarse128,PQ64
PCAR32,$coarse32,SQfp16
PCAR64,$coarse64,SQ8
OPQ128_256,$coarse256,PQ128x4fs
OPQ56_112,$coarse112,PQ7+56
OPQ16_64,$coarse64,PQ16x4fs,Refine(OPQ56_112,PQ56)
$coarseD,PQ$((dim/2))x4fs
"
indexkeys="
OPQ128_256,$coarse256,PQ128x4fsr
OPQ64_128,$coarse128,PQ64x4fsr
OPQ32_64,$coarse64,PQ32x4fsr
OPQ16_64,$coarse64,PQ16x4fsr
OPQ16_64,$coarse64,PQ16x4fsr,Refine(OPQ56_112,PQ56)
"
for indexkey in $indexkeys
do
key=autotune.db$db.${indexkey//,/_}
key="${key//(/_}"
key="${key//)/_}"
run_on_1machine $key.e \
python -u bench_all_ivf.py \
--db $db \
--indexkey "$indexkey" \
--maxtrain 0 \
--indexfile $indexdir/$key.faissindex \
--searchthreads 32 \
--min_test_duration 3 \
$( add_precomputed_quantizer $db $coarse ) \
--add_bs 1000000 \
--autotune_max nprobe:2000
done
done
done
#################################
# 1B-scale experiment
for db in deep1B bigann1B; do
coarses="
IVF1048576_HNSW32
IVF4194304_HNSW32
IVF4194304(IVF1024,PQHDx4fs,RFlat)
"
dim=$( get_db_dim $db )
for coarse in $coarses; do
replace_coarse_PQHD "$coarse" $dim
indexkeys="
OPQ8_64,$coarse64,PQ8
OPQ16_64,$coarse64,PQ16x4fsr
OPQ16_64,$coarse64,PQ16
OPQ32_64,$coarse64,PQ32x4fsr
OPQ32_128,$coarse128,PQ32
OPQ64_128,$coarse128,PQ64x4fsr
OPQ64_128,$coarse128,PQ64
OPQ128_256,$coarse256,PQ128x4fsr
OPQ56_112,$coarse112,PQ7+56
OPQ16_64,$coarse64,PQ16x4fs,Refine(OPQ56_112,PQ56)
$coarseD,PQ$((dim/2))x4fs
"
for indexkey in $indexkeys
do
key=autotune.db$db.${indexkey//,/_}
key="${key//(/_}"
key="${key//)/_}"
run_on_1machine $key.d \
python -u bench_all_ivf.py \
--db $db \
--indexkey "$indexkey" \
--maxtrain 0 \
--indexfile $indexdir/$key.faissindex \
--searchthreads 32 \
--min_test_duration 3 \
$( add_precomputed_quantizer $db $coarse ) \
--add_bs 1000000 \
--autotune_max nprobe:3000
done
done
done
fi

View File

@@ -0,0 +1,109 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import argparse
import time
import faiss
import numpy as np
from faiss.contrib.datasets import SyntheticDataset
from faiss.contrib.big_batch_search import big_batch_search
parser = argparse.ArgumentParser()
def aa(*args, **kwargs):
group.add_argument(*args, **kwargs)
group = parser.add_argument_group('dataset options')
aa('--dim', type=int, default=64)
aa('--size', default="S")
group = parser.add_argument_group('index options')
aa('--nlist', type=int, default=100)
aa('--factory_string', default="", help="overrides nlist")
aa('--k', type=int, default=10)
aa('--nprobe', type=int, default=5)
aa('--nt', type=int, default=-1, help="nb search threads")
aa('--method', default="pairwise_distances", help="")
args = parser.parse_args()
print("args:", args)
if args.size == "S":
ds = SyntheticDataset(32, 2000, 4000, 1000)
elif args.size == "M":
ds = SyntheticDataset(32, 20000, 40000, 10000)
elif args.size == "L":
ds = SyntheticDataset(32, 200000, 400000, 100000)
else:
raise RuntimeError(f"dataset size {args.size} not supported")
nlist = args.nlist
nprobe = args.nprobe
k = args.k
def tic(name):
global tictoc
tictoc = (name, time.time())
print(name, end="\r", flush=True)
def toc():
global tictoc
name, t0 = tictoc
dt = time.time() - t0
print(f"{name}: {dt:.3f} s")
return dt
print(f"dataset {ds}, {nlist=:} {nprobe=:} {k=:}")
if args.factory_string == "":
factory_string = f"IVF{nlist},Flat"
else:
factory_string = args.factory_string
print(f"instantiate {factory_string}")
index = faiss.index_factory(ds.d, factory_string)
if args.factory_string != "":
nlist = index.nlist
print("nlist", nlist)
tic("train")
index.train(ds.get_train())
toc()
tic("add")
index.add(ds.get_database())
toc()
if args.nt != -1:
print("setting nb of threads to", args.nt)
faiss.omp_set_num_threads(args.nt)
tic("reference search")
index.nprobe = nprobe
Dref, Iref = index.search(ds.get_queries(), k)
t_ref = toc()
tic("block search")
Dnew, Inew = big_batch_search(
index, ds.get_queries(),
k, method=args.method, verbose=10
)
t_tot = toc()
assert (Inew != Iref).sum() / Iref.size < 1e-4
np.testing.assert_almost_equal(Dnew, Dref, decimal=4)
print(f"total block search time {t_tot:.3f} s, speedup {t_ref / t_tot:.3f}x")

View File

File diff suppressed because it is too large

View File

@@ -0,0 +1,154 @@
#! /usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from __future__ import print_function
import numpy as np
import faiss
import time
import os
import argparse
parser = argparse.ArgumentParser()
def aa(*args, **kwargs):
group.add_argument(*args, **kwargs)
group = parser.add_argument_group('dataset options')
aa('--dim', type=int, default=64)
aa('--nb', type=int, default=int(1e6))
aa('--subset_len', type=int, default=int(1e5))
aa('--key', default='IVF1000,Flat')
aa('--nprobe', type=int, default=640)
aa('--no_intcallback', default=False, action='store_true')
aa('--twostage', default=False, action='store_true')
aa('--nt', type=int, default=-1)
args = parser.parse_args()
print("args:", args)
d = args.dim # dimension
nb = args.nb # database size
nq = 1000 # nb of queries
nt = 100000
subset_len = args.subset_len
np.random.seed(1234) # make reproducible
xb = np.random.random((nb, d)).astype('float32')
xq = np.random.random((nq, d)).astype('float32')
xt = np.random.random((nt, d)).astype('float32')
k = 100
if args.no_intcallback:
faiss.InterruptCallback.clear_instance()
if args.nt != -1:
faiss.omp_set_num_threads(args.nt)
nprobe = args.nprobe
key = args.key
#key = 'IVF1000,Flat'
# key = 'IVF1000,PQ64'
# key = 'IVF100_HNSW32,PQ64'
# faiss.omp_set_num_threads(1)
pf = 'dim%d_' % d
if d == 64:
pf = ''
basename = '/tmp/base%s%s.index' % (pf, key)
if os.path.exists(basename):
print('load', basename)
index_1 = faiss.read_index(basename)
else:
print('train + write', basename)
index_1 = faiss.index_factory(d, key)
index_1.train(xt)
faiss.write_index(index_1, basename)
print('add')
index_1.add(xb)
print('set nprobe=', nprobe)
faiss.ParameterSpace().set_index_parameter(index_1, 'nprobe', nprobe)
class ResultHeap:
""" Combine query results from a sliced dataset """
def __init__(self, nq, k):
" nq: number of query vectors, k: number of results per query "
self.I = np.zeros((nq, k), dtype='int64')
self.D = np.zeros((nq, k), dtype='float32')
self.nq, self.k = nq, k
heaps = faiss.float_maxheap_array_t()
heaps.k = k
heaps.nh = nq
heaps.val = faiss.swig_ptr(self.D)
heaps.ids = faiss.swig_ptr(self.I)
heaps.heapify()
self.heaps = heaps
def add_batch_result(self, D, I, i0):
assert D.shape == (self.nq, self.k)
assert I.shape == (self.nq, self.k)
I += i0
self.heaps.addn_with_ids(
self.k, faiss.swig_ptr(D),
faiss.swig_ptr(I), self.k)
def finalize(self):
self.heaps.reorder()
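# Minimal usage sketch for ResultHeap (illustrative; D0/I0 and D1/I1 are
# hypothetical search results from two database slices, the first of size n0):
#   rh = ResultHeap(nq, k)
#   rh.add_batch_result(D0, I0, 0)
#   rh.add_batch_result(D1, I1, n0)
#   rh.finalize()  # rh.D, rh.I now hold the merged global top-k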
stats = faiss.cvar.indexIVF_stats
stats.reset()
print('index size', index_1.ntotal,
'imbalance', index_1.invlists.imbalance_factor())
start = time.time()
Dref, Iref = index_1.search(xq, k)
print('time of searching: %.3f s (quantization %.3f + search %.3f ms)' % (
    time.time() - start, stats.quantization_time, stats.search_time))
indexes = {}
if args.twostage:
for i in range(0, nb, subset_len):
index = faiss.read_index(basename)
faiss.ParameterSpace().set_index_parameter(index, 'nprobe', nprobe)
print("add %d:%d" %(i, i+subset_len))
index.add(xb[i:i + subset_len])
indexes[i] = index
rh = ResultHeap(nq, k)
sum_time = tq = ts = 0
for i in range(0, nb, subset_len):
if not args.twostage:
index = faiss.read_index(basename)
faiss.ParameterSpace().set_index_parameter(index, 'nprobe', nprobe)
print("add %d:%d" %(i, i+subset_len))
index.add(xb[i:i + subset_len])
else:
index = indexes[i]
stats.reset()
start = time.time()
Di, Ii = index.search(xq, k)
sum_time = sum_time + time.time() - start
tq += stats.quantization_time
ts += stats.search_time
rh.add_batch_result(Di, Ii, i)
print('time of searching separately: %.3f s (quantization %.3f + search %.3f ms)' %
      (sum_time, tq, ts))
rh.finalize()
print('diffs: %d / %d' % ((Iref != rh.I).sum(), Iref.size))

View File

View File

File diff suppressed because it is too large

View File

@@ -0,0 +1,272 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import hashlib
import io
import json
import logging
import os
import pickle
from dataclasses import dataclass
from typing import Any, Dict, List, Optional
from zipfile import ZipFile
import faiss # @manual=//faiss/python:pyfaiss
import numpy as np
import submitit
from faiss.contrib.datasets import ( # @manual=//faiss/contrib:faiss_contrib
dataset_from_name,
)
logger = logging.getLogger(__name__)
# merge RCQ coarse quantizer and ITQ encoder to one Faiss index
def merge_rcq_itq(
# pyre-ignore[11]: `faiss.ResidualCoarseQuantizer` is not defined as a type
rcq_coarse_quantizer: faiss.ResidualCoarseQuantizer,
itq_encoder: faiss.IndexPreTransform,
# pyre-ignore[11]: `faiss.IndexIVFSpectralHash` is not defined as a type.
) -> faiss.IndexIVFSpectralHash:
# pyre-ignore[16]: `faiss` has no attribute `IndexIVFSpectralHash`.
index = faiss.IndexIVFSpectralHash(
rcq_coarse_quantizer,
rcq_coarse_quantizer.d,
rcq_coarse_quantizer.ntotal,
itq_encoder.sa_code_size() * 8,
1000000, # larger than the magnitude of the vectors
)
index.replace_vt(itq_encoder)
return index
@dataclass
class BenchmarkIO:
path: str # local path
def __init__(self, path: str):
self.path = path
self.cached_ds: Dict[Any, Any] = {}
def clone(self):
return BenchmarkIO(path=self.path)
def get_local_filepath(self, filename):
if len(filename) > 184:
fn, ext = os.path.splitext(filename)
filename = (
fn[:184] + hashlib.sha256(filename.encode()).hexdigest() + ext
)
return os.path.join(self.path, filename)
def get_remote_filepath(self, filename) -> Optional[str]:
return None
def download_file_from_blobstore(
self,
filename: str,
bucket: Optional[str] = None,
path: Optional[str] = None,
):
return self.get_local_filepath(filename)
def upload_file_to_blobstore(
self,
filename: str,
bucket: Optional[str] = None,
path: Optional[str] = None,
overwrite: bool = False,
):
pass
def file_exist(self, filename: str):
fn = self.get_local_filepath(filename)
exists = os.path.exists(fn)
logger.info(f"{filename} {exists=}")
return exists
def read_file(self, filename: str, keys: List[str]):
fn = self.download_file_from_blobstore(filename)
logger.info(f"Loading file {fn}")
results = []
with ZipFile(fn, "r") as zip_file:
for key in keys:
with zip_file.open(key, "r") as f:
if key in ["D", "I", "R", "lims"]:
results.append(np.load(f))
elif key in ["P"]:
t = io.TextIOWrapper(f)
results.append(json.load(t))
else:
raise AssertionError()
return results
def write_file(
self,
filename: str,
keys: List[str],
values: List[Any],
overwrite: bool = False,
):
fn = self.get_local_filepath(filename)
with ZipFile(fn, "w") as zip_file:
for key, value in zip(keys, values, strict=True):
with zip_file.open(key, "w", force_zip64=True) as f:
if key in ["D", "I", "R", "lims"]:
np.save(f, value)
elif key in ["P"]:
t = io.TextIOWrapper(f, write_through=True)
json.dump(value, t)
else:
raise AssertionError()
self.upload_file_to_blobstore(filename, overwrite=overwrite)
def get_dataset(self, dataset):
if dataset not in self.cached_ds:
if (
dataset.namespace is not None
and dataset.namespace[:4] == "std_"
):
if dataset.tablename not in self.cached_ds:
self.cached_ds[dataset.tablename] = dataset_from_name(
dataset.tablename,
)
p = dataset.namespace[4]
if p == "t":
self.cached_ds[dataset] = self.cached_ds[
dataset.tablename
].get_train(dataset.num_vectors)
elif p == "d":
self.cached_ds[dataset] = self.cached_ds[
dataset.tablename
].get_database()
elif p == "q":
self.cached_ds[dataset] = self.cached_ds[
dataset.tablename
].get_queries()
else:
raise ValueError
elif dataset.namespace == "syn":
d, seed = dataset.tablename.split("_")
d = int(d)
seed = int(seed)
n = dataset.num_vectors
# based on faiss.contrib.datasets.SyntheticDataset
d1 = 10
rs = np.random.RandomState(seed)
x = rs.normal(size=(n, d1))
x = np.dot(x, rs.rand(d1, d))
x = x * (rs.rand(d) * 4 + 0.1)
x = np.sin(x)
x = x.astype(np.float32)
self.cached_ds[dataset] = x
else:
self.cached_ds[dataset] = self.read_nparray(
os.path.join(self.path, dataset.tablename),
mmap_mode="r",
)[: dataset.num_vectors].copy()
return self.cached_ds[dataset]
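    # e.g. (illustrative): DatasetDescriptor(namespace="std_q", tablename="bigann1M")
    # resolves to dataset_from_name("bigann1M").get_queries(), while
    # namespace="syn" with tablename="128_1234" generates synthetic
    # 128-dimensional vectors from seed 1234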
def read_nparray(
self,
filename: str,
mmap_mode: Optional[str] = None,
):
fn = self.download_file_from_blobstore(filename)
logger.info(f"Loading nparray from {fn}")
nparray = np.load(fn, mmap_mode=mmap_mode)
logger.info(f"Loaded nparray {nparray.shape} from {fn}")
return nparray
def write_nparray(
self,
nparray: np.ndarray,
filename: str,
):
fn = self.get_local_filepath(filename)
logger.info(f"Saving nparray {nparray.shape} to {fn}")
np.save(fn, nparray)
self.upload_file_to_blobstore(filename)
def read_json(
self,
filename: str,
):
fn = self.download_file_from_blobstore(filename)
logger.info(f"Loading json {fn}")
with open(fn, "r") as fp:
json_dict = json.load(fp)
logger.info(f"Loaded json {json_dict} from {fn}")
return json_dict
def write_json(
self,
json_dict: dict[str, Any],
filename: str,
overwrite: bool = False,
):
fn = self.get_local_filepath(filename)
logger.info(f"Saving json {json_dict} to {fn}")
with open(fn, "w") as fp:
json.dump(json_dict, fp)
self.upload_file_to_blobstore(filename, overwrite=overwrite)
def read_index(
self,
filename: str,
bucket: Optional[str] = None,
path: Optional[str] = None,
):
fn = self.download_file_from_blobstore(filename, bucket, path)
logger.info(f"Loading index {fn}")
ext = os.path.splitext(fn)[1]
if ext in [".faiss", ".codec", ".index"]:
index = faiss.read_index(fn)
elif ext == ".pkl":
with open(fn, "rb") as model_file:
model = pickle.load(model_file)
rcq_coarse_quantizer, itq_encoder = model["model"]
index = merge_rcq_itq(rcq_coarse_quantizer, itq_encoder)
logger.info(f"Loaded index from {fn}")
return index
def write_index(
self,
index: faiss.Index,
filename: str,
):
fn = self.get_local_filepath(filename)
logger.info(f"Saving index to {fn}")
faiss.write_index(index, fn)
self.upload_file_to_blobstore(filename)
assert os.path.exists(fn)
return os.path.getsize(fn)
def launch_jobs(self, func, params, local=True):
if local:
results = [func(p) for p in params]
return results
logger.info(f"launching {len(params)} jobs")
executor = submitit.AutoExecutor(folder="/checkpoint/gsz/jobs")
executor.update_parameters(
nodes=1,
gpus_per_node=8,
cpus_per_task=80,
# mem_gb=640,
tasks_per_node=1,
name="faiss_benchmark",
slurm_array_parallelism=512,
slurm_partition="scavenge",
slurm_time=4 * 60,
slurm_constraint="bldg1",
)
jobs = executor.map_array(func, params)
logger.info(f"launched {len(jobs)} jobs")
for job, param in zip(jobs, params):
logger.info(f"{job.job_id=} {param[0]=}")
results = [job.result() for job in jobs]
print(f"received {len(results)} results")
return results

View File

@@ -0,0 +1,379 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import logging
import os
from dataclasses import dataclass
from typing import Any, Dict, List, Optional
import faiss # @manual=//faiss/python:pyfaiss
from .benchmark_io import BenchmarkIO
from .utils import timer
logger = logging.getLogger(__name__)
# Important: filenames end with "." and carry no extension (npy, codec, index);
# when writing files you must append the extension yourself, e.g. filename + "npy"
@dataclass
class IndexDescriptorClassic:
bucket: Optional[str] = None
# either path or factory should be set,
# but not both at the same time.
path: Optional[str] = None
factory: Optional[str] = None
codec_alias: Optional[str] = None
construction_params: Optional[List[Dict[str, int]]] = None
search_params: Optional[Dict[str, int]] = None
# range metric definitions
# key: name
# value: one of the following:
#
# radius
# [0..radius) -> 1
# [radius..inf) -> 0
#
# [[radius1, score1], ...]
# [0..radius1) -> score1
# [radius1..radius2) -> score2
#
# [[radius1_from, radius1_to, score1], ...]
# [radius1_from, radius1_to) -> score1,
# [radius2_from, radius2_to) -> score2
range_metrics: Optional[Dict[str, Any]] = None
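    # e.g. (hypothetical): range_metrics={"weighted": [[0.5, 1.0], [1.0, 0.5]]}
    # scores distances in [0, 0.5) as 1.0 and distances in [0.5, 1.0) as 0.5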
radius: Optional[float] = None
training_size: Optional[int] = None
def __hash__(self):
return hash(str(self))
@dataclass
class DatasetDescriptor:
# namespace possible values:
# 1. a hive namespace
# 2. 'std_t', 'std_d', 'std_q' for the standard datasets
# via faiss.contrib.datasets.dataset_from_name()
# t - training, d - database, q - queries
# e.g. "std_t"
# 3. 'syn' for synthetic data
# 4. None for local files
namespace: Optional[str] = None
# tablename possible values, corresponding to the
# namespace value above:
# 1. a hive table name
# 2. name of the standard dataset as recognized
# by faiss.contrib.datasets.dataset_from_name()
# e.g. "bigann1M"
# 3. d_seed, e.g. 128_1234 for 128-dimensional vectors
# with seed 1234
# 4. a local file name (relative to benchmark_io.path)
tablename: Optional[str] = None
# partition names and values for hive
# e.g. ["ds=2021-09-01"]
partitions: Optional[List[str]] = None
# number of vectors to load from the dataset
num_vectors: Optional[int] = None
embedding_column: Optional[str] = None
# only when the embedding column is a map
embedding_column_key: Optional[Any] = None
embedding_id_column: Optional[str] = None
# filters on the dataset where each filter is a
# string rep of a filter expression
filters: Optional[List[str]] = None
# unused in open-source
splits_distribution: Optional[List[List[bytes]]] = None
# unused in open-source
splits: Optional[List[bytes]] = None
# unused in open-source
serialized_df: Optional[str] = None
sampling_rate: Optional[float] = None
# sampling column for xdb
sampling_column: Optional[str] = None
# blob store
bucket: Optional[str] = None
path: Optional[str] = None
# desc_name
desc_name: Optional[str] = None
normalize_L2: bool = False
def __hash__(self):
return hash(self.get_filename())
def get_filename(
self,
prefix: Optional[str] = None,
) -> str:
if self.desc_name is not None:
return self.desc_name
filename = ""
if prefix is not None:
filename += prefix + "_"
if self.namespace is not None:
filename += self.namespace + "_"
assert self.tablename is not None
filename += self.tablename
if self.partitions is not None:
filename += "_" + "_".join(
self.partitions
).replace("=", "_").replace("/", "_")
if self.num_vectors is not None:
filename += f"_{self.num_vectors}"
filename += "."
self.desc_name = filename
return self.desc_name
def get_kmeans_filename(self, k):
return f"{self.get_filename()}kmeans_{k}."
def k_means(self, io, k, dry_run):
logger.info(f"k_means {k} {self}")
kmeans_vectors = DatasetDescriptor(
tablename=f"{self.get_filename()}kmeans_{k}"
)
kmeans_filename = kmeans_vectors.get_filename() + "npy"
meta_filename = kmeans_vectors.get_filename() + "json"
if not io.file_exist(kmeans_filename) or not io.file_exist(
meta_filename
):
if dry_run:
return None, None, kmeans_filename
x = io.get_dataset(self)
kmeans = faiss.Kmeans(d=x.shape[1], k=k, gpu=True)
_, t, _ = timer("k_means", lambda: kmeans.train(x))
io.write_nparray(kmeans.centroids, kmeans_filename)
io.write_json({"k_means_time": t}, meta_filename)
else:
t = io.read_json(meta_filename)["k_means_time"]
return kmeans_vectors, t, None
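    # Illustrative call (hypothetical descriptor): for a training-set descriptor,
    # k_means(io, 1024, dry_run=False) trains 1024 centroids on GPU, writes
    # "<filename>kmeans_1024.npy" plus a json with the training time, and
    # returns (centroids_descriptor, k_means_time, None)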
@dataclass
class IndexBaseDescriptor:
d: int
metric: str
desc_name: Optional[str] = None
flat_desc_name: Optional[str] = None
bucket: Optional[str] = None
path: Optional[str] = None
num_threads: int = 1
def get_name(self) -> str:
raise NotImplementedError()
def get_path(self, benchmark_io: BenchmarkIO) -> Optional[str]:
if self.path is not None:
return self.path
self.path = benchmark_io.get_remote_filepath(self.desc_name)
return self.path
@staticmethod
def param_dict_list_to_name(param_dict_list):
if not param_dict_list:
return ""
l = 0
n = ""
for param_dict in param_dict_list:
n += IndexBaseDescriptor.param_dict_to_name(param_dict, f"cp{l}")
l += 1
return n
@staticmethod
def param_dict_to_name(param_dict, prefix="sp"):
if not param_dict:
return ""
n = prefix
for name, val in param_dict.items():
if name == "snap":
continue
if name == "lsq_gpu" and val == 0:
continue
if name == "use_beam_LUT" and val == 0:
continue
n += f"_{name}_{val}"
if n == prefix:
return ""
n += "."
return n
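    # e.g. param_dict_to_name({"nprobe": 64}) -> "sp_nprobe_64." and
    # param_dict_list_to_name([{"efConstruction": 128}]) -> "cp0_efConstruction_128."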
@dataclass
class CodecDescriptor(IndexBaseDescriptor):
# either path or factory should be set,
# but not both at the same time.
factory: Optional[str] = None
construction_params: Optional[List[Dict[str, int]]] = None
training_vectors: Optional[DatasetDescriptor] = None
FILENAME_PREFIX: str = "xt"
def __post_init__(self):
self.get_name()
def is_trained(self):
return self.factory is None and self.path is not None
def is_valid(self):
return self.factory is not None or self.path is not None
def get_name(self) -> str:
if self.desc_name is not None:
return self.desc_name
if self.factory is not None:
self.desc_name = self.name_from_factory()
return self.desc_name
if self.path is not None:
self.desc_name = self.name_from_path()
return self.desc_name
raise ValueError("name, factory or path must be set")
def flat_name(self) -> str:
if self.flat_desc_name is not None:
return self.flat_desc_name
self.flat_desc_name = f"Flat.d_{self.d}.{self.metric.upper()}."
return self.flat_desc_name
    def get_codec_path(self, benchmark_io) -> str:
        # renamed from `path`, which shadowed the inherited `path` dataclass field
        if self.path is not None:
            return self.path
        return benchmark_io.get_remote_filepath(self.get_name())
def name_from_factory(self) -> str:
assert self.factory is not None
name = f"{self.factory.replace(',', '_')}."
assert self.d is not None
assert self.metric is not None
name += f"d_{self.d}.{self.metric.upper()}."
if self.factory != "Flat":
assert self.training_vectors is not None
name += self.training_vectors.get_filename(CodecDescriptor.FILENAME_PREFIX)
name += IndexBaseDescriptor.param_dict_list_to_name(self.construction_params)
return name
def name_from_path(self):
assert self.path is not None
filename = os.path.basename(self.path)
ext = filename.split(".")[-1]
if filename.endswith(ext):
name = filename[:-len(ext)]
        else:  # unreachable by construction; raising a ValueError would be more appropriate
name = filename
return name
def alias(self, benchmark_io: BenchmarkIO):
if hasattr(benchmark_io, "bucket"):
return CodecDescriptor(desc_name=self.get_name(), bucket=benchmark_io.bucket, path=self.get_path(benchmark_io), d=self.d, metric=self.metric)
return CodecDescriptor(desc_name=self.get_name(), d=self.d, metric=self.metric)
@dataclass
class IndexDescriptor(IndexBaseDescriptor):
codec_desc: Optional[CodecDescriptor] = None
database_desc: Optional[DatasetDescriptor] = None
FILENAME_PREFIX: str = "xb"
def __hash__(self):
return hash(str(self))
def __post_init__(self):
self.get_name()
def is_built(self):
return self.codec_desc is None and self.database_desc is None
def get_name(self) -> str:
if self.desc_name is None:
self.desc_name = self.codec_desc.get_name() + self.database_desc.get_filename(prefix=IndexDescriptor.FILENAME_PREFIX)
return self.desc_name
def flat_name(self):
if self.flat_desc_name is not None:
return self.flat_desc_name
self.flat_desc_name = self.codec_desc.flat_name() + self.database_desc.get_filename(prefix=IndexDescriptor.FILENAME_PREFIX)
return self.flat_desc_name
    # alias is used to refer to the index after it has been uploaded to the blob store and is referenced again
def alias(self, benchmark_io: BenchmarkIO):
if hasattr(benchmark_io, "bucket"):
return IndexDescriptor(desc_name=self.get_name(), bucket=benchmark_io.bucket, path=self.get_path(benchmark_io), d=self.d, metric=self.metric)
return IndexDescriptor(desc_name=self.get_name(), d=self.d, metric=self.metric)
@dataclass
class KnnDescriptor(IndexBaseDescriptor):
index_desc: Optional[IndexDescriptor] = None
gt_index_desc: Optional[IndexDescriptor] = None
query_dataset: Optional[DatasetDescriptor] = None
search_params: Optional[Dict[str, int]] = None
reconstruct: bool = False
FILENAME_PREFIX: str = "q"
# range metric definitions
# key: name
# value: one of the following:
#
# radius
# [0..radius) -> 1
# [radius..inf) -> 0
#
# [[radius1, score1], ...]
# [0..radius1) -> score1
# [radius1..radius2) -> score2
#
# [[radius1_from, radius1_to, score1], ...]
# [radius1_from, radius1_to) -> score1,
# [radius2_from, radius2_to) -> score2
range_metrics: Optional[Dict[str, Any]] = None
radius: Optional[float] = None
k: int = 1
range_ref_index_desc: Optional[str] = None
def __hash__(self):
return hash(str(self))
def get_name(self):
if self.desc_name is not None:
return self.desc_name
name = self.index_desc.get_name()
name += IndexBaseDescriptor.param_dict_to_name(self.search_params)
name += self.query_dataset.get_filename(KnnDescriptor.FILENAME_PREFIX)
name += f"k_{self.k}."
name += f"t_{self.num_threads}."
if self.reconstruct:
name += "rec."
else:
name += "knn."
self.desc_name = name
return name
def flat_name(self):
if self.flat_desc_name is not None:
return self.flat_desc_name
name = self.index_desc.flat_name()
name += self.query_dataset.get_filename(KnnDescriptor.FILENAME_PREFIX)
name += f"k_{self.k}."
name += f"t_{self.num_threads}."
if self.reconstruct:
name += "rec."
else:
name += "knn."
self.flat_desc_name = name
return name

View File

File diff suppressed because it is too large

View File

@@ -0,0 +1,335 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import logging
from dataclasses import dataclass
from typing import Dict, List, Tuple
import faiss # @manual=//faiss/python:pyfaiss
# from faiss.contrib.evaluation import ( # @manual=//faiss/contrib:faiss_contrib
# OperatingPoints,
# )
from .benchmark import Benchmark
from .descriptors import DatasetDescriptor, IndexDescriptorClassic
from .utils import dict_merge, filter_results, ParetoMetric, ParetoMode
logger = logging.getLogger(__name__)
@dataclass
class Optimizer:
distance_metric: str = "L2"
num_threads: int = 32
run_local: bool = True
def __post_init__(self):
self.cached_benchmark = None
if self.distance_metric == "IP":
self.distance_metric_type = faiss.METRIC_INNER_PRODUCT
elif self.distance_metric == "L2":
self.distance_metric_type = faiss.METRIC_L2
else:
raise ValueError
def set_io(self, benchmark_io):
self.io = benchmark_io
self.io.distance_metric = self.distance_metric
self.io.distance_metric_type = self.distance_metric_type
def benchmark_and_filter_candidates(
self,
index_descs,
training_vectors,
database_vectors,
query_vectors,
result_file,
include_flat,
min_accuracy,
pareto_metric,
):
benchmark = Benchmark(
num_threads=self.num_threads,
training_vectors=training_vectors,
database_vectors=database_vectors,
query_vectors=query_vectors,
index_descs=index_descs,
k=10,
distance_metric=self.distance_metric,
)
benchmark.set_io(self.io)
results = benchmark.benchmark(
result_file=result_file, local=self.run_local, train=True, knn=True
)
assert results
filtered = filter_results(
results=results,
evaluation="knn",
accuracy_metric="knn_intersection",
min_accuracy=min_accuracy,
name_filter=None
if include_flat
else (lambda n: not n.startswith("Flat")),
pareto_mode=ParetoMode.GLOBAL,
pareto_metric=pareto_metric,
)
assert filtered
index_descs = [
IndexDescriptorClassic(
factory=v["factory"],
construction_params=v["construction_params"],
search_params=v["search_params"],
)
for _, _, _, _, v in filtered
]
return index_descs, filtered
def optimize_quantizer(
self,
training_vectors: DatasetDescriptor,
query_vectors: DatasetDescriptor,
nlists: List[int],
min_accuracy: float,
):
quantizer_descs = {}
for nlist in nlists:
# cluster
centroids, _, _ = training_vectors.k_means(
self.io,
nlist,
dry_run=False,
)
descs = [IndexDescriptorClassic(factory="Flat"),] + [
IndexDescriptorClassic(
factory="HNSW32",
construction_params=[{"efConstruction": 2**i}],
)
for i in range(6, 11)
]
descs, _ = self.benchmark_and_filter_candidates(
descs,
training_vectors=centroids,
database_vectors=centroids,
query_vectors=query_vectors,
result_file=f"result_{centroids.get_filename()}json",
include_flat=True,
min_accuracy=min_accuracy,
pareto_metric=ParetoMetric.TIME,
)
quantizer_descs[nlist] = descs
return quantizer_descs
def optimize_ivf(
self,
result_file: str,
training_vectors: DatasetDescriptor,
database_vectors: DatasetDescriptor,
query_vectors: DatasetDescriptor,
quantizers: Dict[int, List[IndexDescriptorClassic]],
codecs: List[Tuple[str, str]],
min_accuracy: float,
):
ivf_descs = []
for nlist, quantizer_descs in quantizers.items():
# build IVF index
for quantizer_desc in quantizer_descs:
for pretransform, fine_ivf in codecs:
if pretransform is None:
pretransform = ""
else:
pretransform = pretransform + ","
if quantizer_desc.construction_params is None:
construction_params = [
None,
quantizer_desc.search_params,
]
else:
construction_params = [
None
] + quantizer_desc.construction_params
if quantizer_desc.search_params is not None:
dict_merge(
construction_params[1],
quantizer_desc.search_params,
)
ivf_descs.append(
IndexDescriptorClassic(
factory=f"{pretransform}IVF{nlist}({quantizer_desc.factory}),{fine_ivf}",
construction_params=construction_params,
)
)
return self.benchmark_and_filter_candidates(
ivf_descs,
training_vectors,
database_vectors,
query_vectors,
result_file,
include_flat=False,
min_accuracy=min_accuracy,
pareto_metric=ParetoMetric.TIME_SPACE,
)
# train an IVFFlat index
# find the nprobe required for the given accuracy
def ivf_flat_nprobe_required_for_accuracy(
self,
result_file: str,
training_vectors: DatasetDescriptor,
database_vectors: DatasetDescriptor,
query_vectors: DatasetDescriptor,
nlist,
accuracy,
):
_, results = self.benchmark_and_filter_candidates(
index_descs=[
IndexDescriptorClassic(factory=f"IVF{nlist}(Flat),Flat"),
],
training_vectors=training_vectors,
database_vectors=database_vectors,
query_vectors=query_vectors,
result_file=result_file,
include_flat=False,
min_accuracy=accuracy,
pareto_metric=ParetoMetric.TIME,
)
nprobe = nlist // 2
for _, _, _, k, v in results:
if (
".knn" in k
and "nprobe" in v["search_params"]
and v["knn_intersection"] >= accuracy
):
nprobe = min(nprobe, v["search_params"]["nprobe"])
return nprobe
# train candidate IVF codecs
# benchmark them at the same nprobe
# keep only the space _and_ time Pareto optimal
def optimize_codec(
self,
result_file: str,
d: int,
training_vectors: DatasetDescriptor,
database_vectors: DatasetDescriptor,
query_vectors: DatasetDescriptor,
nlist: int,
nprobe: int,
min_accuracy: float,
):
codecs = (
[
(None, "Flat"),
(None, "SQfp16"),
(None, "SQbf16"),
(None, "SQ8"),
(None, "SQ8_direct_signed"),
] + [
(f"OPQ{M}_{M * dim}", f"PQ{M}x{b}")
for M in [8, 12, 16, 32, 48, 64, 96, 128, 192, 256]
if d % M == 0
for dim in range(2, 18, 2)
if M * dim <= d
for b in range(4, 14, 2)
if M * b < d * 8 # smaller than SQ8
] + [
(None, f"PQ{M}x{b}")
for M in [8, 12, 16, 32, 48, 64, 96, 128, 192, 256]
if d % M == 0
for b in range(8, 14, 2)
if M * b < d * 8 # smaller than SQ8
]
)
factory = {}
for opq, pq in codecs:
factory[
f"IVF{nlist},{pq}" if opq is None else f"{opq},IVF{nlist},{pq}"
] = (
opq,
pq,
)
_, filtered = self.benchmark_and_filter_candidates(
index_descs=[
IndexDescriptorClassic(
factory=f"IVF{nlist},{pq}"
if opq is None
else f"{opq},IVF{nlist},{pq}",
search_params={
"nprobe": nprobe,
},
)
for opq, pq in codecs
],
training_vectors=training_vectors,
database_vectors=database_vectors,
query_vectors=query_vectors,
result_file=result_file,
include_flat=False,
min_accuracy=min_accuracy,
pareto_metric=ParetoMetric.TIME_SPACE,
)
results = [
factory[r] for r in set(v["factory"] for _, _, _, k, v in filtered)
]
return results
def optimize(
self,
d: int,
training_vectors: DatasetDescriptor,
database_vectors_list: List[DatasetDescriptor],
query_vectors: DatasetDescriptor,
min_accuracy: float,
):
# train an IVFFlat index
# find the nprobe required for near perfect accuracy
nlist = 4096
nprobe_at_95 = self.ivf_flat_nprobe_required_for_accuracy(
result_file=f"result_ivf{nlist}_flat.json",
training_vectors=training_vectors,
database_vectors=database_vectors_list[0],
query_vectors=query_vectors,
nlist=nlist,
accuracy=0.95,
)
# train candidate IVF codecs
# benchmark them at the same nprobe
# keep only the space and time Pareto optima
codecs = self.optimize_codec(
result_file=f"result_ivf{nlist}_codec.json",
d=d,
training_vectors=training_vectors,
database_vectors=database_vectors_list[0],
query_vectors=query_vectors,
nlist=nlist,
nprobe=nprobe_at_95,
min_accuracy=min_accuracy,
)
# optimize coarse quantizers
quantizers = self.optimize_quantizer(
training_vectors=training_vectors,
query_vectors=query_vectors,
nlists=[4096, 8192, 16384, 32768],
min_accuracy=0.7,
)
# combine them with the codecs
# test them at different scales
for database_vectors in database_vectors_list:
self.optimize_ivf(
result_file=f"result_{database_vectors.get_filename()}json",
training_vectors=training_vectors,
database_vectors=database_vectors,
query_vectors=query_vectors,
quantizers=quantizers,
codecs=codecs,
min_accuracy=min_accuracy,
)
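# Usage sketch (hypothetical descriptors and path; run_local=True assumed):
#   opt = Optimizer(distance_metric="L2", num_threads=32, run_local=True)
#   opt.set_io(BenchmarkIO(path="/tmp/bench"))
#   opt.optimize(d=96, training_vectors=xt_desc,
#                database_vectors_list=[xb_desc], query_vectors=xq_desc,
#                min_accuracy=0.9)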

View File

@@ -0,0 +1,248 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import functools
import logging
from enum import Enum
from multiprocessing.pool import ThreadPool
from time import perf_counter
import faiss # @manual=//faiss/python:pyfaiss
import numpy as np
from faiss.contrib.evaluation import ( # @manual=//faiss/contrib:faiss_contrib
OperatingPoints,
)
logger = logging.getLogger(__name__)
def timer(name, func, once=False) -> tuple:  # returns (result, time_per_run, repeat)
logger.info(f"Measuring {name}")
t1 = perf_counter()
res = func()
t2 = perf_counter()
t = t2 - t1
repeat = 1
if not once and t < 1.0:
repeat = int(2.0 // t)
logger.info(
f"Time for {name}: {t:.3f} seconds, repeating {repeat} times"
)
t1 = perf_counter()
for _ in range(repeat):
res = func()
t2 = perf_counter()
t = (t2 - t1) / repeat
logger.info(f"Time for {name}: {t:.3f} seconds")
return res, t, repeat
def refine_distances_knn(
xq: np.ndarray,
xb: np.ndarray,
I: np.ndarray,
metric,
):
"""Recompute distances between xq[i] and xb[I[i, :]]"""
nq, k = I.shape
xq = np.ascontiguousarray(xq, dtype="float32")
nq2, d = xq.shape
xb = np.ascontiguousarray(xb, dtype="float32")
nb, d2 = xb.shape
I = np.ascontiguousarray(I, dtype="int64")
assert nq2 == nq
assert d2 == d
D = np.empty(I.shape, dtype="float32")
D[:] = np.inf
if metric == faiss.METRIC_L2:
faiss.fvec_L2sqr_by_idx(
faiss.swig_ptr(D),
faiss.swig_ptr(xq),
faiss.swig_ptr(xb),
faiss.swig_ptr(I),
d,
nq,
k,
)
else:
faiss.fvec_inner_products_by_idx(
faiss.swig_ptr(D),
faiss.swig_ptr(xq),
faiss.swig_ptr(xb),
faiss.swig_ptr(I),
d,
nq,
k,
)
return D
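# e.g. (illustrative): after an approximate search that returned ids I,
#   D_exact = refine_distances_knn(xq, xb, I, faiss.METRIC_L2)
# recomputes exact distances for the retrieved ids only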
def refine_distances_range(
lims: np.ndarray,
D: np.ndarray,
I: np.ndarray,
xq: np.ndarray,
xb: np.ndarray,
metric,
):
with ThreadPool(32) as pool:
R = pool.map(
lambda i: (
np.sum(np.square(xq[i] - xb[I[lims[i] : lims[i + 1]]]), axis=1)
if metric == faiss.METRIC_L2
else np.tensordot(
xq[i], xb[I[lims[i] : lims[i + 1]]], axes=(0, 1)
)
)
if lims[i + 1] > lims[i]
else [],
range(len(lims) - 1),
)
return np.hstack(R)
def distance_ratio_measure(I, R, D_GT, metric):
sum_of_R = np.sum(np.where(I >= 0, R, 0))
sum_of_D_GT = np.sum(np.where(I >= 0, D_GT, 0))
if metric == faiss.METRIC_INNER_PRODUCT:
return (sum_of_R / sum_of_D_GT).item()
elif metric == faiss.METRIC_L2:
return (sum_of_D_GT / sum_of_R).item()
else:
raise RuntimeError(f"unknown metric {metric}")
@functools.cache
def get_cpu_info():
return [l for l in open("/proc/cpuinfo", "r") if "model name" in l][0][
13:
].strip()
def dict_merge(target, source):
for k, v in source.items():
if isinstance(v, dict) and k in target:
dict_merge(target[k], v)
else:
target[k] = v
class Cost:
def __init__(self, values):
self.values = values
def __le__(self, other):
return all(
v1 <= v2 for v1, v2 in zip(self.values, other.values, strict=True)
)
def __lt__(self, other):
return all(
v1 < v2 for v1, v2 in zip(self.values, other.values, strict=True)
)
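# Cost defines a componentwise partial order, which is what OperatingPoints
# needs for multi-objective Pareto filtering with Cost([time, space]); e.g.
#   Cost([1.0, 2.0]) < Cost([2.0, 3.0])  # True: better on both axes
#   Cost([1.0, 3.0]) < Cost([2.0, 2.0])  # False: not comparable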
class ParetoMode(Enum):
DISABLE = 1 # no Pareto filtering
INDEX = 2 # index-local optima
GLOBAL = 3 # global optima
class ParetoMetric(Enum):
TIME = 0 # time vs accuracy
SPACE = 1 # space vs accuracy
TIME_SPACE = 2 # (time, space) vs accuracy
def range_search_recall_at_precision(experiment, precision):
return round(
max(
r
for r, p in zip(
experiment["range_search_pr"]["recall"],
experiment["range_search_pr"]["precision"],
)
if p > precision
),
6,
)
def filter_results(
results,
evaluation,
accuracy_metric, # str or func
time_metric=None, # func or None -> use default
space_metric=None, # func or None -> use default
min_accuracy=0,
max_space=0,
max_time=0,
scaling_factor=1.0,
name_filter=None, # func
pareto_mode=ParetoMode.DISABLE,
pareto_metric=ParetoMetric.TIME,
):
if isinstance(accuracy_metric, str):
accuracy_key = accuracy_metric
accuracy_metric = lambda v: v[accuracy_key]
if time_metric is None:
time_metric = lambda v: v["time"] * scaling_factor + (
v["quantizer"]["time"] if "quantizer" in v else 0
)
if space_metric is None:
space_metric = lambda v: results["indices"][v["codec"]]["code_size"]
fe = []
ops = {}
if pareto_mode == ParetoMode.GLOBAL:
op = OperatingPoints()
ops["global"] = op
for k, v in results["experiments"].items():
if f".{evaluation}" in k:
accuracy = accuracy_metric(v)
if min_accuracy > 0 and accuracy < min_accuracy:
continue
space = space_metric(v)
if space is None:
space = 0
if max_space > 0 and space > max_space:
continue
time = time_metric(v)
if max_time > 0 and time > max_time:
continue
idx_name = v["index"] + (
"snap"
if "search_params" in v and v["search_params"]["snap"] == 1
else ""
)
if name_filter is not None and not name_filter(idx_name):
continue
experiment = (accuracy, space, time, k, v)
if pareto_mode == ParetoMode.DISABLE:
fe.append(experiment)
continue
if pareto_mode == ParetoMode.INDEX:
if idx_name not in ops:
ops[idx_name] = OperatingPoints()
op = ops[idx_name]
if pareto_metric == ParetoMetric.TIME:
op.add_operating_point(experiment, accuracy, time)
elif pareto_metric == ParetoMetric.SPACE:
op.add_operating_point(experiment, accuracy, space)
else:
op.add_operating_point(
experiment, accuracy, Cost([time, space])
)
if ops:
for op in ops.values():
for v, _, _ in op.operating_points:
fe.append(v)
fe.sort()
return fe

View File

@@ -0,0 +1,146 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import logging
import argparse
import os
from faiss.benchs.bench_fw.benchmark import Benchmark
from faiss.benchs.bench_fw.benchmark_io import BenchmarkIO
from faiss.benchs.bench_fw.descriptors import DatasetDescriptor, IndexDescriptorClassic
from faiss.benchs.bench_fw.index import IndexFromFactory
logging.basicConfig(level=logging.INFO)
def factory_factory(d):
return [
("SQ4", None, 256 * (2 ** 10), None),
("SQ8", None, 256 * (2 ** 10), None),
("SQfp16", None, 256 * (2 ** 10), None),
("ITQ64,LSH", None, 256 * (2 ** 10), None),
("Pad128,ITQ128,LSH", None, 256 * (2 ** 10), None),
("Pad256,ITQ256,LSH", None, 256 * (2 ** 10), None),
] + [
(f"OPQ32_128,Residual2x14,PQ32x{b}", None, 256 * (2 ** 14), None)
for b in range(8, 16, 2)
] + [
(f"PCAR{2 ** d_out},SQ{b}", None, 256 * (2 ** 10), None)
for d_out in range(6, 11)
if 2 ** d_out <= d
for b in [4, 8]
] + [
(f"OPQ{M}_{M * dim},PQ{M}x{b}", None, 256 * (2 ** b), None)
for M in [8, 12, 16, 32, 64, 128]
for dim in [2, 4, 6, 8, 12, 16]
if M * dim <= d
for b in range(8, 16, 2)
] + [
(f"RQ{cs // b}x{b}", [{"max_beam_size": 32}], 256 * (2 ** b), {"max_beam_size": bs, "use_beam_LUT": bl})
for cs in [64, 128, 256, 512]
for b in [6, 8, 10, 12]
for bs in [1, 2, 4, 8, 16, 32]
for bl in [0, 1]
if cs // b > 1
if cs // b < 65
if cs < d * 8 * 2
] + [
(f"LSQ{cs // b}x{b}", [{"encode_ils_iters": 16}], 256 * (2 ** b), {"encode_ils_iters": eii, "lsq_gpu": lg})
for cs in [64, 128, 256, 512]
for b in [6, 8, 10, 12]
for eii in [2, 4, 8, 16]
for lg in [0, 1]
if cs // b > 1
if cs // b < 65
if cs < d * 8 * 2
] + [
(f"PRQ{sub}x{cs // sub // b}x{b}", [{"max_beam_size": 32}], 256 * (2 ** b), {"max_beam_size": bs, "use_beam_LUT": bl})
for sub in [2, 3, 4, 8, 16, 32]
for cs in [64, 96, 128, 192, 256, 384, 512, 768, 1024, 2048]
for b in [6, 8, 10, 12]
for bs in [1, 2, 4, 8, 16, 32]
for bl in [0, 1]
if cs // sub // b > 1
if cs // sub // b < 65
if cs < d * 8 * 2
if d % sub == 0
] + [
(f"PLSQ{sub}x{cs // sub // b}x{b}", [{"encode_ils_iters": 16}], 256 * (2 ** b), {"encode_ils_iters": eii, "lsq_gpu": lg})
for sub in [2, 3, 4, 8, 16, 32]
for cs in [64, 128, 256, 512, 1024, 2048]
for b in [6, 8, 10, 12]
for eii in [2, 4, 8, 16]
for lg in [0, 1]
if cs // sub // b > 1
if cs // sub // b < 65
if cs < d * 8 * 2
if d % sub == 0
]
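# Each factory_factory entry is a tuple:
#   (factory string, construction_params, training_size, search_params)
# e.g. ("SQ8", None, 256 * (2 ** 10), None) trains an 8-bit scalar quantizer
# on 256K vectors with no extra construction or search parameters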
def run_local(rp):
bio, d, tablename, distance_metric = rp
if tablename == "contriever":
training_vectors=DatasetDescriptor(
tablename="training_set.npy"
)
database_vectors=DatasetDescriptor(
tablename="database1M.npy",
)
query_vectors=DatasetDescriptor(
tablename="queries.npy",
)
else:
training_vectors=DatasetDescriptor(
namespace="std_t", tablename=tablename,
)
database_vectors=DatasetDescriptor(
namespace="std_d", tablename=tablename,
)
query_vectors=DatasetDescriptor(
namespace="std_q", tablename=tablename,
)
benchmark = Benchmark(
num_threads=32,
training_vectors=training_vectors,
database_vectors=database_vectors,
query_vectors=query_vectors,
index_descs=[
IndexDescriptorClassic(
factory=factory,
construction_params=construction_params,
training_size=training_size,
search_params=search_params,
)
for factory, construction_params, training_size, search_params in factory_factory(d)
],
k=1,
distance_metric=distance_metric,
)
benchmark.set_io(bio)
benchmark.benchmark(result_file="result.json", train=True, reconstruct=False, knn=False, range=False)
def run(bio, d, tablename, distance_metric):
bio.launch_jobs(run_local, [(bio, d, tablename, distance_metric)], local=True)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('experiment')
parser.add_argument('path')
args = parser.parse_args()
assert os.path.exists(args.path)
path = os.path.join(args.path, args.experiment)
if not os.path.exists(path):
os.mkdir(path)
bio = BenchmarkIO(
path=path,
)
if args.experiment == "sift1M":
run(bio, 128, "sift1M", "L2")
elif args.experiment == "bigann":
run(bio, 128, "bigann1M", "L2")
elif args.experiment == "deep1b":
run(bio, 96, "deep1M", "L2")
elif args.experiment == "contriever":
run(bio, 768, "contriever", "IP")

View File

@@ -0,0 +1,125 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import argparse
import logging
import os
from faiss.benchs.bench_fw.benchmark import Benchmark
from faiss.benchs.bench_fw.benchmark_io import BenchmarkIO
from faiss.benchs.bench_fw.descriptors import (
DatasetDescriptor,
IndexDescriptorClassic,
)
logging.basicConfig(level=logging.INFO)
def sift1M(bio):
benchmark = Benchmark(
num_threads=32,
training_vectors=DatasetDescriptor(
namespace="std_d", tablename="sift1M"
),
database_vectors=DatasetDescriptor(
namespace="std_d", tablename="sift1M"
),
query_vectors=DatasetDescriptor(
namespace="std_q", tablename="sift1M"
),
index_descs=[
IndexDescriptorClassic(
factory=f"IVF{2 ** nlist},Flat",
)
for nlist in range(8, 15)
],
k=1,
distance_metric="L2",
)
    benchmark.set_io(bio)
benchmark.benchmark(result_file="result.json", local=True, train=True, reconstruct=False, knn=True, range=False)
def bigann(bio):
for scale in [1, 2, 5, 10, 20, 50]:
benchmark = Benchmark(
num_threads=32,
training_vectors=DatasetDescriptor(
namespace="std_t", tablename="bigann1M"
),
database_vectors=DatasetDescriptor(
namespace="std_d", tablename=f"bigann{scale}M"
),
query_vectors=DatasetDescriptor(
namespace="std_q", tablename="bigann1M"
),
index_descs=[
IndexDescriptorClassic(
factory=f"IVF{2 ** nlist},Flat",
) for nlist in range(11, 19)
] + [
IndexDescriptorClassic(
factory=f"IVF{2 ** nlist}_HNSW32,Flat",
construction_params=[None, {"efConstruction": 200, "efSearch": 40}],
) for nlist in range(11, 19)
],
k=1,
distance_metric="L2",
)
benchmark.set_io(bio)
benchmark.benchmark(f"result{scale}.json", local=False, train=True, reconstruct=False, knn=True, range=False)
def ssnpp(bio):
benchmark = Benchmark(
num_threads=32,
training_vectors=DatasetDescriptor(
tablename="ssnpp_training_5M.npy"
),
database_vectors=DatasetDescriptor(
tablename="ssnpp_database_5M.npy"
),
query_vectors=DatasetDescriptor(
tablename="ssnpp_queries_10K.npy"
),
index_descs=[
IndexDescriptorClassic(
factory=f"IVF{2 ** nlist},PQ256x4fs,Refine(SQfp16)",
) for nlist in range(9, 16)
] + [
IndexDescriptorClassic(
factory=f"IVF{2 ** nlist},Flat",
) for nlist in range(9, 16)
] + [
            IndexDescriptorClassic(
                factory="PQ256x4fs,Refine(SQfp16)",
            ),
            IndexDescriptorClassic(
                factory="HNSW32",
            ),
],
k=1,
distance_metric="L2",
)
benchmark.set_io(bio)
benchmark.benchmark("result.json", local=False, train=True, reconstruct=False, knn=True, range=False)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('experiment')
parser.add_argument('path')
args = parser.parse_args()
assert os.path.exists(args.path)
path = os.path.join(args.path, args.experiment)
if not os.path.exists(path):
os.mkdir(path)
bio = BenchmarkIO(
path=path,
)
if args.experiment == "sift1M":
sift1M(bio)
elif args.experiment == "bigann":
bigann(bio)
elif args.experiment == "ssnpp":
ssnpp(bio)

View File

@@ -0,0 +1,532 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"id": "be081589-e1b2-4569-acb7-44203e273899",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"import itertools\n",
"from faiss.contrib.evaluation import OperatingPoints\n",
"from enum import Enum\n",
"from faiss.benchs.bench_fw.benchmark_io import BenchmarkIO as BIO\n",
"from faiss.benchs.bench_fw.utils import filter_results, ParetoMode, ParetoMetric\n",
"from copy import copy\n",
"import numpy as np\n",
"import datetime\n",
"import glob\n",
"import io\n",
"import json\n",
"from zipfile import ZipFile\n",
"import tabulate"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a6492e95-24c7-4425-bf0a-27e10e879ca6",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"import getpass\n",
"username = getpass.getuser()\n",
"root = f\"/home/{username}/simsearch/data/ivf/results/sift1M\"\n",
"results = BIO(root).read_json(\"result.json\")\n",
"results.keys()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0875d269-aef4-426d-83dd-866970f43777",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"results['experiments']"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "f080a6e2-1565-418b-8732-4adeff03a099",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"def plot_metric(experiments, accuracy_title, cost_title, plot_space=False, plot=None):\n",
" if plot is None:\n",
" plot = plt.subplot()\n",
" x = {}\n",
" y = {}\n",
" for accuracy, space, time, k, v in experiments:\n",
" idx_name = v['index'] + (\"snap\" if 'search_params' in v and v['search_params'][\"snap\"] == 1 else \"\")\n",
" if idx_name not in x:\n",
" x[idx_name] = []\n",
" y[idx_name] = []\n",
" x[idx_name].append(accuracy)\n",
" if plot_space:\n",
" y[idx_name].append(space)\n",
" else:\n",
" y[idx_name].append(time)\n",
"\n",
" #plt.figure(figsize=(10,6))\n",
" #plt.title(accuracy_title)\n",
" plot.set_xlabel(accuracy_title)\n",
" plot.set_ylabel(cost_title)\n",
" plot.set_yscale(\"log\")\n",
" marker = itertools.cycle((\"o\", \"v\", \"^\", \"<\", \">\", \"s\", \"p\", \"P\", \"*\", \"h\", \"X\", \"D\")) \n",
" for index in x.keys():\n",
" plot.plot(x[index], y[index], marker=next(marker), label=index, linewidth=0)\n",
" plot.legend(bbox_to_anchor=(1, 1), loc='upper left')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "61007155-5edc-449e-835e-c141a01a2ae5",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# index local optima\n",
"accuracy_metric = \"knn_intersection\"\n",
"fr = filter_results(results, evaluation=\"knn\", accuracy_metric=accuracy_metric, pareto_mode=ParetoMode.INDEX, pareto_metric=ParetoMetric.TIME, scaling_factor=1, min_accuracy=0.95)\n",
"plot_metric(fr, accuracy_title=\"knn intersection\", cost_title=\"time (seconds, 32 cores)\", plot_space=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f9f94dcc-5abe-4cad-9619-f5d1d24fb8c1",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# global optima\n",
"accuracy_metric = \"knn_intersection\"\n",
"fr = filter_results(results, evaluation=\"knn\", accuracy_metric=accuracy_metric, min_accuracy=0.25, name_filter=lambda n: not n.startswith(\"Flat\"), pareto_mode=ParetoMode.GLOBAL, pareto_metric=ParetoMetric.TIME, scaling_factor=1)\n",
"#fr = filter_results(results, evaluation=\"knn\", accuracy_metric=accuracy_metric, min_accuracy=0.90, max_space=64, max_time=0, name_filter=lambda n: not n.startswith(\"Flat\"), pareto_mode=ParetoMode.GLOBAL, pareto_metric=ParetoMetric.TIME, scaling_factor=1)\n",
"plot_metric(fr, accuracy_title=\"knn intersection\", cost_title=\"time (seconds, 32 cores)\", plot_space=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0c10f587-26ef-49ec-83a9-88f6a2a433e8",
"metadata": {},
"outputs": [],
"source": [
"def pretty_params(p):\n",
" p = copy(p)\n",
" if 'snap' in p and p['snap'] == 0:\n",
" del p['snap']\n",
" return p\n",
" \n",
"tabulate.tabulate([(accuracy, space, time, v['factory'], pretty_params(v['construction_params'][1]), pretty_params(v['search_params'])) \n",
" for accuracy, space, time, k, v in fr],\n",
" tablefmt=\"html\",\n",
" headers=[\"accuracy\",\"space\", \"time\", \"factory\", \"quantizer cfg\", \"search cfg\"])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "36e82084-18f6-4546-a717-163eb0224ee8",
"metadata": {},
"outputs": [],
"source": [
"# index local optima @ precision 0.8\n",
"precision = 0.8\n",
"accuracy_metric = lambda exp: range_search_recall_at_precision(exp, precision)\n",
"fr = filter_results(results, evaluation=\"weighted\", accuracy_metric=accuracy_metric, pareto_mode=ParetoMode.INDEX, pareto_metric=ParetoMetric.TIME, scaling_factor=1)\n",
"plot_metric(fr, accuracy_title=f\"range recall @ precision {precision}\", cost_title=\"time (seconds, 16 cores)\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "aff79376-39f7-47c0-8b83-1efe5192bb7e",
"metadata": {},
"outputs": [],
"source": [
"# index local optima @ precision 0.2\n",
"precision = 0.2\n",
"accuracy_metric = lambda exp: range_search_recall_at_precision(exp, precision)\n",
"fr = filter_results(results, evaluation=\"weighted\", accuracy_metric=accuracy_metric, pareto_mode=ParetoMode.INDEX, pareto_metric=ParetoMetric.TIME, scaling_factor=1)\n",
"plot_metric(fr, accuracy_title=f\"range recall @ precision {precision}\", cost_title=\"time (seconds, 16 cores)\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b4834f1f-bbbe-4cae-9aa0-a459b0c842d1",
"metadata": {},
"outputs": [],
"source": [
"# global optima @ precision 0.8\n",
"precision = 0.8\n",
"accuracy_metric = lambda exp: range_search_recall_at_precision(exp, precision)\n",
"fr = filter_results(results, evaluation=\"weighted\", accuracy_metric=accuracy_metric, pareto_mode=ParetoMode.GLOBAL, pareto_metric=ParetoMetric.TIME, scaling_factor=1)\n",
"plot_metric(fr, accuracy_title=f\"range recall @ precision {precision}\", cost_title=\"time (seconds, 16 cores)\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9aead830-6209-4956-b7ea-4a5e0029d616",
"metadata": {},
"outputs": [],
"source": [
"def plot_range_search_pr_curves(experiments):\n",
" x = {}\n",
" y = {}\n",
" show = {\n",
" 'Flat': None,\n",
" }\n",
" for _, _, _, k, v in fr:\n",
" if \".weighted\" in k: # and v['index'] in show:\n",
" x[k] = v['range_search_pr']['recall']\n",
" y[k] = v['range_search_pr']['precision']\n",
" \n",
" plt.title(\"range search recall\")\n",
" plt.xlabel(\"recall\")\n",
" plt.ylabel(\"precision\")\n",
" for index in x.keys():\n",
" plt.plot(x[index], y[index], '.', label=index)\n",
" plt.legend(bbox_to_anchor=(1.0, 1.0), loc='upper left')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "92e45502-7a31-4a15-90df-fa3032d7d350",
"metadata": {},
"outputs": [],
"source": [
"precision = 0.8\n",
"accuracy_metric = lambda exp: range_search_recall_at_precision(exp, precision)\n",
"fr = filter_results(results, evaluation=\"weighted\", accuracy_metric=accuracy_metric, pareto_mode=ParetoMode.GLOBAL, pareto_metric=ParetoMetric.TIME_SPACE, scaling_factor=1)\n",
"plot_range_search_pr_curves(fr)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fdf8148a-0da6-4c5e-8d60-f8f85314574c",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"root = \"/checkpoint/gsz/bench_fw/ivf/bigann\"\n",
"scales = [1, 2, 5, 10, 20, 50]\n",
"fig, plots = plt.subplots(len(scales), sharex=True, figsize=(5,25))\n",
"fig.tight_layout()\n",
"for plot, scale in zip(plots, scales, strict=True):\n",
" results = BIO(root).read_json(f\"result{scale}.json\")\n",
" accuracy_metric = \"knn_intersection\"\n",
" fr = filter_results(results, evaluation=\"knn\", accuracy_metric=accuracy_metric, min_accuracy=0.9, pareto_mode=ParetoMode.GLOBAL, pareto_metric=ParetoMetric.TIME, scaling_factor=1)\n",
" plot_metric(fr, accuracy_title=\"knn intersection\", cost_title=\"time (seconds, 64 cores)\", plot=plot)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e503828c-ee61-45f7-814b-cce6461109bc",
"metadata": {},
"outputs": [],
"source": [
"x = {}\n",
"y = {}\n",
"accuracy=0.9\n",
"root = \"/checkpoint/gsz/bench_fw/ivf/bigann\"\n",
"scales = [1, 2, 5, 10, 20, 50]\n",
"#fig, plots = plt.subplots(len(scales), sharex=True, figsize=(5,25))\n",
"#fig.tight_layout()\n",
"for scale in scales:\n",
" results = BIO(root).read_json(f\"result{scale}.json\")\n",
" scale *= 1_000_000\n",
" accuracy_metric = \"knn_intersection\"\n",
" fr = filter_results(results, evaluation=\"knn\", accuracy_metric=accuracy_metric, min_accuracy=accuracy, pareto_mode=ParetoMode.INDEX, pareto_metric=ParetoMetric.TIME, scaling_factor=1)\n",
" seen = set()\n",
" print(scale)\n",
" for _, _, _, _, exp in fr:\n",
" fact = exp[\"factory\"]\n",
" # \"HNSW\" in fact or \n",
" if fact in seen or fact in [\"Flat\", \"IVF512,Flat\", \"IVF1024,Flat\", \"IVF2048,Flat\"]:\n",
" continue\n",
" seen.add(fact)\n",
" if fact not in x:\n",
" x[fact] = []\n",
" y[fact] = []\n",
" x[fact].append(scale)\n",
" y[fact].append(exp[\"time\"] + exp[\"quantizer\"][\"time\"])\n",
" if (exp[\"knn_intersection\"] > 0.92):\n",
" print(fact)\n",
" print(exp[\"search_params\"])\n",
" print(exp[\"knn_intersection\"])\n",
"\n",
" #plot_metric(fr, accuracy_title=\"knn intersection\", cost_title=\"time (seconds, 64 cores)\", plot=plot)\n",
" \n",
"plt.title(f\"recall @ 1 = {accuracy*100}%\")\n",
"plt.xlabel(\"database size\")\n",
"plt.ylabel(\"time\")\n",
"plt.xscale(\"log\")\n",
"plt.yscale(\"log\")\n",
"\n",
"marker = itertools.cycle((\"o\", \"v\", \"^\", \"<\", \">\", \"s\", \"p\", \"P\", \"*\", \"h\", \"X\", \"D\")) \n",
"for index in x.keys():\n",
" if \"HNSW\" in index:\n",
" plt.plot(x[index], y[index], label=index, linewidth=1, marker=next(marker), linestyle=\"dashed\")\n",
" else:\n",
" plt.plot(x[index], y[index], label=index, linewidth=1, marker=next(marker))\n",
"plt.legend(bbox_to_anchor=(1.0, 1.0), loc='upper left')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "37a99bb2-f998-461b-a345-7cc6e702cb3a",
"metadata": {},
"outputs": [],
"source": [
"# global optima\n",
"accuracy_metric = \"sym_recall\"\n",
"fr = filter_results(results, evaluation=\"rec\", accuracy_metric=accuracy_metric, time_metric=lambda e:e['encode_time'], min_accuracy=0.9, pareto_mode=ParetoMode.GLOBAL, pareto_metric=ParetoMetric.SPACE, scaling_factor=1)\n",
"plot_metric(fr, accuracy_title=\"knn intersection\", cost_title=\"space\", plot_space=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c973ce4e-3566-4f02-bd93-f113e3e0c791",
"metadata": {},
"outputs": [],
"source": [
"def pretty_time(s):\n",
" if s is None:\n",
" return \"None\"\n",
" s = int(s * 1000) / 1000\n",
" m, s = divmod(s, 60)\n",
" h, m = divmod(m, 60)\n",
" d, h = divmod(h, 24)\n",
" r = \"\"\n",
" if d > 0:\n",
" r += f\"{int(d)}d \"\n",
" if h > 0:\n",
" r += f\"{int(h)}h \"\n",
" if m > 0:\n",
" r += f\"{int(m)}m \"\n",
" if s > 0 or len(r) == 0:\n",
" r += f\"{s:.3f}s\"\n",
" return r\n",
"\n",
"def pretty_size(s):\n",
" if s > 1024 * 1024:\n",
" return f\"{s / 1024 / 1024:.1f}\".rstrip('0').rstrip('.') + \"MB\"\n",
" if s > 1024:\n",
" return f\"{s / 1024:.1f}\".rstrip('0').rstrip('.') + \"KB\"\n",
" return f\"{s}\"\n",
"\n",
"def pretty_mse(m):\n",
" if m is None:\n",
" return \"None\"\n",
" else:\n",
" return f\"{m:.6f}\""
]
},
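{
"cell_type": "code",
"execution_count": null,
"id": "7c4d2e0a-pretty-printer-sanity",
"metadata": {},
"outputs": [],
"source": [
"# quick sanity check of the pretty-printers above\n",
"# expected: ('1h 1m 1.500s', '3MB', '0.012346')\n",
"pretty_time(3661.5), pretty_size(3 * 1024 * 1024), pretty_mse(0.0123456)"
]
},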
{
"cell_type": "code",
"execution_count": null,
"id": "1ddcf226-fb97-4a59-9fc3-3ed8f7d5e703",
"metadata": {},
"outputs": [],
"source": [
"data = {}\n",
"root = \"/checkpoint/gsz/bench_fw/bigann\"\n",
"scales = [1, 2, 5, 10, 20, 50]\n",
"for scale in scales:\n",
" results = BIO(root).read_json(f\"result{scale}.json\")\n",
" accuracy_metric = \"knn_intersection\"\n",
" fr = filter_results(results, evaluation=\"knn\", accuracy_metric=accuracy_metric, min_accuracy=0, pareto_mode=ParetoMode.INDEX, pareto_metric=ParetoMetric.TIME, scaling_factor=1)\n",
" d = {}\n",
" data[f\"{scale}M\"] = d\n",
" for _, _, _, _, exp in fr:\n",
" fact = exp[\"factory\"]\n",
" # \"HNSW\" in fact or \n",
" if fact in [\"Flat\", \"IVF512,Flat\", \"IVF1024,Flat\", \"IVF2048,Flat\"]:\n",
" continue\n",
" if fact not in d:\n",
" d[fact] = []\n",
" d[fact].append({\n",
" \"nprobe\": exp[\"search_params\"][\"nprobe\"],\n",
" \"recall\": exp[\"knn_intersection\"],\n",
" \"time\": exp[\"time\"] + exp[\"quantizer\"][\"time\"],\n",
" })\n",
"data\n",
"# with open(\"/checkpoint/gsz/bench_fw/codecs.json\", \"w\") as f:\n",
"# json.dump(data, f)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e54eebb6-0a9f-4a72-84d2-f12c5bd44510",
"metadata": {},
"outputs": [],
"source": [
"ds = \"deep1b\"\n",
"data = []\n",
"jss = []\n",
"root = f\"/checkpoint/gsz/bench_fw/codecs/{ds}\"\n",
"results = BIO(root).read_json(f\"result.json\")\n",
"for k, e in results[\"experiments\"].items():\n",
" if \"rec\" in k and e['factory'] != 'Flat': # and e['sym_recall'] > 0.0: # and \"PRQ\" in e['factory'] and e['sym_recall'] > 0.0:\n",
" code_size = results['indices'][e['codec']]['sa_code_size']\n",
" codec_size = results['indices'][e['codec']]['codec_size']\n",
" training_time = results['indices'][e['codec']]['training_time']\n",
" # training_size = results['indices'][e['codec']]['training_size']\n",
" cpu = e['cpu'] if 'cpu' in e else \"\"\n",
" ps = ', '.join([f\"{k}={v}\" for k,v in e['construction_params'][0].items()]) if e['construction_params'] else \" \"\n",
" eps = ', '.join([f\"{k}={v}\" for k,v in e['reconstruct_params'].items() if k != \"snap\"]) if e['reconstruct_params'] else \" \"\n",
" data.append((code_size, f\"|{e['factory']}|{ps}|{eps}|{code_size}|{pretty_size(codec_size)}|{pretty_time(training_time)}|{training_size}|{pretty_mse(e['mse'])}|{e['sym_recall']}|{e['asym_recall']}|{pretty_time(e['encode_time'])}|{pretty_time(e['decode_time'])}|{cpu}|\"))\n",
" jss.append({\n",
" 'factory': e['factory'],\n",
" 'parameters': e['construction_params'][0] if e['construction_params'] else \"\",\n",
" 'evaluation_params': e['reconstruct_params'],\n",
" 'code_size': code_size,\n",
" 'codec_size': codec_size,\n",
" 'training_time': training_time,\n",
" 'training_size': training_size,\n",
" 'mse': e['mse'],\n",
" 'sym_recall': e['sym_recall'],\n",
" 'asym_recall': e['asym_recall'],\n",
" 'encode_time': e['encode_time'],\n",
" 'decode_time': e['decode_time'],\n",
" 'cpu': cpu,\n",
" })\n",
"\n",
"print(\"|factory key|construction parameters|evaluation parameters|code size|codec size|training time|training size|mean squared error|sym recall @ 1|asym recall @ 1|encode time|decode time|cpu|\")\n",
"print(\"|-|-|-|-|-|-|-|-|-|\")\n",
"data.sort()\n",
"for d in data:\n",
" print(d[1])\n",
"\n",
"with open(f\"/checkpoint/gsz/bench_fw/codecs_{ds}_test.json\", \"w\") as f:\n",
" json.dump(jss, f)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d1216733-9670-407c-b3d2-5f87bce0321c",
"metadata": {},
"outputs": [],
"source": [
"def read_file(filename: str, keys):\n",
" results = []\n",
" with ZipFile(filename, \"r\") as zip_file:\n",
" for key in keys:\n",
" with zip_file.open(key, \"r\") as f:\n",
" if key in [\"D\", \"I\", \"R\", \"lims\"]:\n",
" results.append(np.load(f))\n",
" elif key in [\"P\"]:\n",
" t = io.TextIOWrapper(f)\n",
" results.append(json.load(t))\n",
" else:\n",
" raise AssertionError()\n",
" return results"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "56de051e-22db-4bef-b242-1ddabc9e0bb9",
"metadata": {},
"outputs": [],
"source": [
"ds = \"contriever\"\n",
"data = []\n",
"jss = []\n",
"root = f\"/checkpoint/gsz/bench_fw/codecs/{ds}\"\n",
"for lf in glob.glob(root + '/*rec*.zip'):\n",
" e, = read_file(lf, ['P'])\n",
" if e['factory'] != 'Flat': # and e['sym_recall'] > 0.0: # and \"PRQ\" in e['factory'] and e['sym_recall'] > 0.0:\n",
" code_size = e['codec_meta']['sa_code_size']\n",
" codec_size = e['codec_meta']['codec_size']\n",
" training_time = e['codec_meta']['training_time']\n",
" training_size = None # e['codec_meta']['training_size']\n",
" cpu = e['cpu'] if 'cpu' in e else \"\"\n",
" ps = ', '.join([f\"{k}={v}\" for k,v in e['construction_params'][0].items()]) if e['construction_params'] else \" \"\n",
" eps = ', '.join([f\"{k}={v}\" for k,v in e['reconstruct_params'].items() if k != \"snap\"]) if e['reconstruct_params'] else \" \"\n",
" if eps in ps and eps != \"encode_ils_iters=16\" and eps != \"max_beam_size=32\":\n",
" eps = \" \"\n",
" data.append((code_size, f\"|{e['factory']}|{ps}|{eps}|{code_size}|{pretty_size(codec_size)}|{pretty_time(training_time)}|{pretty_mse(e['mse'])}|{e['sym_recall']}|{e['asym_recall']}|{pretty_time(e['encode_time'])}|{pretty_time(e['decode_time'])}|{cpu}|\"))\n",
" eps = e['reconstruct_params']\n",
" del eps['snap']\n",
" params = copy(e['construction_params'][0]) if e['construction_params'] else {}\n",
" for k, v in e['reconstruct_params'].items():\n",
" params[k] = v\n",
" jss.append({\n",
" 'factory': e['factory'],\n",
" 'params': params,\n",
" 'construction_params': e['construction_params'][0] if e['construction_params'] else {},\n",
" 'evaluation_params': e['reconstruct_params'],\n",
" 'code_size': code_size,\n",
" 'codec_size': codec_size,\n",
" 'training_time': training_time,\n",
" # 'training_size': training_size,\n",
" 'mse': e['mse'],\n",
" 'sym_recall': e['sym_recall'],\n",
" 'asym_recall': e['asym_recall'],\n",
" 'encode_time': e['encode_time'],\n",
" 'decode_time': e['decode_time'],\n",
" 'cpu': cpu,\n",
" })\n",
"\n",
"print(\"|factory key|construction parameters|encode/decode parameters|code size|codec size|training time|mean squared error|sym recall @ 1|asym recall @ 1|encode time|decode time|cpu|\")\n",
"print(\"|-|-|-|-|-|-|-|-|-|\")\n",
"data.sort()\n",
"# for d in data:\n",
"# print(d[1])\n",
"\n",
"print(len(data))\n",
"\n",
"with open(f\"/checkpoint/gsz/bench_fw/codecs_{ds}_5.json\", \"w\") as f:\n",
" json.dump(jss, f)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "faiss_binary (local)",
"language": "python",
"name": "faiss_binary_local"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -0,0 +1,58 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import argparse
import logging
import os
from faiss.benchs.bench_fw.benchmark_io import BenchmarkIO
from faiss.benchs.bench_fw.descriptors import DatasetDescriptor
from faiss.benchs.bench_fw.optimize import Optimizer
logging.basicConfig(level=logging.INFO)
def bigann(bio):
optimizer = Optimizer(
distance_metric="L2",
num_threads=32,
run_local=False,
)
optimizer.set_io(bio)
query_vectors = DatasetDescriptor(namespace="std_q", tablename="bigann1M")
xt = bio.get_dataset(query_vectors)
optimizer.optimize(
d=xt.shape[1],
training_vectors=DatasetDescriptor(
namespace="std_t",
tablename="bigann1M",
num_vectors=2_000_000,
),
database_vectors_list=[
DatasetDescriptor(
namespace="std_d",
tablename="bigann1M",
),
DatasetDescriptor(namespace="std_d", tablename="bigann10M"),
],
query_vectors=query_vectors,
min_accuracy=0.85,
)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("experiment")
parser.add_argument("path")
args = parser.parse_args()
assert os.path.exists(args.path)
path = os.path.join(args.path, args.experiment)
if not os.path.exists(path):
os.mkdir(path)
bio = BenchmarkIO(
path=path,
)
if args.experiment == "bigann":
bigann(bio)
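# Usage sketch (hedged, illustrative paths and script name): results are written
# under <path>/<experiment>, and the std_q/std_t/std_d bigann tables are assumed
# to be resolvable by BenchmarkIO at that path:
#
#   python this_script.py bigann /tmp/bench_fw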

View File

@@ -0,0 +1,85 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import argparse
import logging
import os
from faiss.benchs.bench_fw.benchmark import Benchmark
from faiss.benchs.bench_fw.benchmark_io import BenchmarkIO
from faiss.benchs.bench_fw.descriptors import DatasetDescriptor, IndexDescriptorClassic
logging.basicConfig(level=logging.INFO)
def ssnpp(bio):
benchmark = Benchmark(
num_threads=32,
training_vectors=DatasetDescriptor(
tablename="training.npy",
),
database_vectors=DatasetDescriptor(
tablename="database.npy",
),
query_vectors=DatasetDescriptor(tablename="query.npy"),
index_descs=[
IndexDescriptorClassic(
factory="Flat",
range_metrics={
"weighted": [
[0.05, 0.971],
[0.1, 0.956],
[0.15, 0.923],
[0.2, 0.887],
[0.25, 0.801],
[0.3, 0.729],
[0.35, 0.651],
[0.4, 0.55],
[0.45, 0.459],
[0.5, 0.372],
[0.55, 0.283],
[0.6, 0.189],
[0.65, 0.143],
[0.7, 0.106],
[0.75, 0.116],
[0.8, 0.088],
[0.85, 0.064],
[0.9, 0.05],
[0.95, 0.04],
[1.0, 0.028],
[1.05, 0.02],
[1.1, 0.013],
[1.15, 0.007],
[1.2, 0.004],
[1.3, 0],
]
},
),
IndexDescriptorClassic(
factory="IVF262144(PQ256x4fs),PQ32",
),
],
k=10,
distance_metric="L2",
range_ref_index_desc="Flat",
)
benchmark.set_io(bio)
benchmark.benchmark("result.json", local=False, train=True, reconstruct=False, knn=False, range=True)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('experiment')
parser.add_argument('path')
args = parser.parse_args()
assert os.path.exists(args.path)
path = os.path.join(args.path, args.experiment)
if not os.path.exists(path):
os.mkdir(path)
bio = BenchmarkIO(
path=path,
)
if args.experiment == "ssnpp":
ssnpp(bio)

View File

@@ -0,0 +1,746 @@
#!/usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from __future__ import print_function
import numpy as np
import time
import os
import sys
import faiss
import re
from multiprocessing.pool import ThreadPool
from datasets import ivecs_read
####################################################################
# Parse command line
####################################################################
def usage():
print("""
Usage: bench_gpu_1bn.py dataset indextype [options]
dataset: set of vectors to operate on.
Supported: SIFT1M, SIFT2M, ..., SIFT1000M or Deep1B
indextype: any index type supported by index_factory that runs on GPU.
General options
-ngpu ngpu nb of GPUs to use (default = all)
-tempmem N use N bytes of temporary GPU memory
-nocache do not read or write intermediate files
-float16 use 16-bit floats on the GPU side
Add options
-abs N split adds in blocks of no more than N vectors
-max_add N copy sharded dataset to CPU each max_add additions
(to avoid memory overflows with geometric reallocations)
-altadd Alternative add function, where the index is not stored
on GPU during add. Slightly faster for big datasets on
slow GPUs
Search options
-R R: nb of replicas of the same dataset (the dataset
will be copied across ngpu/R, default R=1)
-noptables do not use precomputed tables in IVFPQ.
-qbs N split queries in blocks of no more than N vectors
-nnn N search N neighbors for each query
-nprobe 4,16,64 try this number of probes
-knngraph instead of the standard setup for the dataset,
compute a k-nn graph with nnn neighbors per element
-oI xx%d.npy output the search result indices to this numpy file,
%d will be replaced with the nprobe
-oD xx%d.npy output the search result distances to this file
""", file=sys.stderr)
sys.exit(1)
# default values
dbname = None
index_key = None
ngpu = faiss.get_num_gpus()
replicas = 1 # nb of replicas of sharded dataset
add_batch_size = 32768
query_batch_size = 16384
nprobes = [1 << l for l in range(9)]
knngraph = False
use_precomputed_tables = True
tempmem = -1 # if -1, use system default
max_add = -1
use_float16 = False
use_cache = True
nnn = 10
altadd = False
I_fname = None
D_fname = None
args = sys.argv[1:]
while args:
a = args.pop(0)
if a == '-h': usage()
elif a == '-ngpu': ngpu = int(args.pop(0))
elif a == '-R': replicas = int(args.pop(0))
elif a == '-noptables': use_precomputed_tables = False
elif a == '-abs': add_batch_size = int(args.pop(0))
elif a == '-qbs': query_batch_size = int(args.pop(0))
elif a == '-nnn': nnn = int(args.pop(0))
elif a == '-tempmem': tempmem = int(args.pop(0))
elif a == '-nocache': use_cache = False
elif a == '-knngraph': knngraph = True
elif a == '-altadd': altadd = True
elif a == '-float16': use_float16 = True
elif a == '-nprobe': nprobes = [int(x) for x in args.pop(0).split(',')]
elif a == '-max_add': max_add = int(args.pop(0))
elif not dbname: dbname = a
elif not index_key: index_key = a
else:
print("argument %s unknown" % a, file=sys.stderr)
sys.exit(1)
cacheroot = '/tmp/bench_gpu_1bn'
if not os.path.isdir(cacheroot):
print("%s does not exist, creating it" % cacheroot)
os.mkdir(cacheroot)
#################################################################
# Small Utility Functions
#################################################################
# we mem-map the biggest files to avoid having them in memory all at
# once
def mmap_fvecs(fname):
x = np.memmap(fname, dtype='int32', mode='r')
d = x[0]
return x.view('float32').reshape(-1, d + 1)[:, 1:]
def mmap_bvecs(fname):
x = np.memmap(fname, dtype='uint8', mode='r')
d = x[:4].view('int32')[0]
return x.reshape(-1, d + 4)[:, 4:]
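# On-disk layout (standard TEXMEX .fvecs/.bvecs format): every vector is stored
# as a 4-byte little-endian int32 giving the dimension d, followed by its d
# components (float32 for .fvecs, uint8 for .bvecs); the slicing above drops
# the size prefix from each row after reshaping.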
def rate_limited_imap(f, l):
"""A threaded imap that does not produce elements faster than they
are consumed"""
pool = ThreadPool(1)
res = None
for i in l:
res_next = pool.apply_async(f, (i, ))
if res:
yield res.get()
res = res_next
yield res.get()
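# Minimal usage sketch (illustrative): while the consumer handles block i, the
# single worker thread is already preparing block i + 1, but never runs further
# ahead than one block:
#
#   for xs in rate_limited_imap(sanitize,
#                               (xb[i:i + 4096] for i in range(0, xb.shape[0], 4096))):
#       index.add(xs)   # hypothetical consumer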
class IdentPreproc:
"""a pre-processor is either a faiss.VectorTransform or an IndentPreproc"""
def __init__(self, d):
self.d_in = self.d_out = d
def apply_py(self, x):
return x
def sanitize(x):
""" convert array to a c-contiguous float array """
return np.ascontiguousarray(x.astype('float32'))
def dataset_iterator(x, preproc, bs):
""" iterate over the lines of x in blocks of size bs"""
nb = x.shape[0]
block_ranges = [(i0, min(nb, i0 + bs))
for i0 in range(0, nb, bs)]
def prepare_block(i01):
i0, i1 = i01
xb = sanitize(x[i0:i1])
return i0, preproc.apply_py(xb)
return rate_limited_imap(prepare_block, block_ranges)
def eval_intersection_measure(gt_I, I):
""" measure intersection measure (used for knngraph)"""
inter = 0
rank = I.shape[1]
assert gt_I.shape[1] >= rank
for q in range(nq_gt):
inter += faiss.ranklist_intersection_size(
rank, faiss.swig_ptr(gt_I[q, :]),
rank, faiss.swig_ptr(I[q, :].astype('int64')))
return inter / float(rank * nq_gt)
#################################################################
# Prepare dataset
#################################################################
print("Preparing dataset", dbname)
if dbname.startswith('SIFT'):
# SIFT1M to SIFT1000M
dbsize = int(dbname[4:-1])
xb = mmap_bvecs('bigann/bigann_base.bvecs')
xq = mmap_bvecs('bigann/bigann_query.bvecs')
xt = mmap_bvecs('bigann/bigann_learn.bvecs')
# trim xb to correct size
xb = xb[:dbsize * 1000 * 1000]
gt_I = ivecs_read('bigann/gnd/idx_%dM.ivecs' % dbsize)
elif dbname == 'Deep1B':
xb = mmap_fvecs('deep1b/base.fvecs')
xq = mmap_fvecs('deep1b/deep1B_queries.fvecs')
xt = mmap_fvecs('deep1b/learn.fvecs')
    # deep1B's train set is outrageously big
xt = xt[:10 * 1000 * 1000]
gt_I = ivecs_read('deep1b/deep1B_groundtruth.ivecs')
else:
print('unknown dataset', dbname, file=sys.stderr)
sys.exit(1)
if knngraph:
# convert to knn-graph dataset
xq = xb
xt = xb
# we compute the ground-truth on this number of queries for validation
nq_gt = 10000
gt_sl = 100
# ground truth will be computed below
gt_I = None
print("sizes: B %s Q %s T %s gt %s" % (
xb.shape, xq.shape, xt.shape,
gt_I.shape if gt_I is not None else None))
#################################################################
# Parse index_key and set cache files
#
# The index_key is a valid factory key that would work, but we
# decompose the training to do it faster
#################################################################
pat = re.compile('(OPQ[0-9]+(_[0-9]+)?,|PCAR[0-9]+,)?' +
'(IVF[0-9]+),' +
'(PQ[0-9]+|Flat)')
matchobject = pat.match(index_key)
assert matchobject, 'could not parse ' + index_key
mog = matchobject.groups()
preproc_str = mog[0]
ivf_str = mog[2]
pqflat_str = mog[3]
ncent = int(ivf_str[3:])
prefix = ''
if knngraph:
gt_cachefile = '%s/BK_gt_%s.npy' % (cacheroot, dbname)
prefix = 'BK_'
# files must be kept distinct because the training set is not the
# same for the knngraph
if preproc_str:
preproc_cachefile = '%s/%spreproc_%s_%s.vectrans' % (
cacheroot, prefix, dbname, preproc_str[:-1])
else:
preproc_cachefile = None
preproc_str = ''
cent_cachefile = '%s/%scent_%s_%s%s.npy' % (
cacheroot, prefix, dbname, preproc_str, ivf_str)
index_cachefile = '%s/%s%s_%s%s,%s.index' % (
cacheroot, prefix, dbname, preproc_str, ivf_str, pqflat_str)
if not use_cache:
preproc_cachefile = None
cent_cachefile = None
index_cachefile = None
print("cachefiles:")
print(preproc_cachefile)
print(cent_cachefile)
print(index_cachefile)
#################################################################
# Wake up GPUs
#################################################################
print("preparing resources for %d GPUs" % ngpu)
gpu_resources = []
for i in range(ngpu):
res = faiss.StandardGpuResources()
if tempmem >= 0:
res.setTempMemory(tempmem)
gpu_resources.append(res)
def make_vres_vdev(i0=0, i1=-1):
" return vectors of device ids and resources useful for gpu_multiple"
vres = faiss.GpuResourcesVector()
vdev = faiss.IntVector()
if i1 == -1:
i1 = ngpu
for i in range(i0, i1):
vdev.push_back(i)
vres.push_back(gpu_resources[i])
return vres, vdev
#################################################################
# Prepare ground truth (for the knngraph)
#################################################################
def compute_GT():
print("compute GT")
t0 = time.time()
gt_I = np.zeros((nq_gt, gt_sl), dtype='int64')
gt_D = np.zeros((nq_gt, gt_sl), dtype='float32')
heaps = faiss.float_maxheap_array_t()
heaps.k = gt_sl
heaps.nh = nq_gt
heaps.val = faiss.swig_ptr(gt_D)
heaps.ids = faiss.swig_ptr(gt_I)
heaps.heapify()
bs = 10 ** 5
n, d = xb.shape
xqs = sanitize(xq[:nq_gt])
db_gt = faiss.IndexFlatL2(d)
vres, vdev = make_vres_vdev()
db_gt_gpu = faiss.index_cpu_to_gpu_multiple(
vres, vdev, db_gt)
# compute ground-truth by blocks of bs, and add to heaps
for i0, xsl in dataset_iterator(xb, IdentPreproc(d), bs):
db_gt_gpu.add(xsl)
D, I = db_gt_gpu.search(xqs, gt_sl)
I += i0
heaps.addn_with_ids(
gt_sl, faiss.swig_ptr(D), faiss.swig_ptr(I), gt_sl)
db_gt_gpu.reset()
print("\r %d/%d, %.3f s" % (i0, n, time.time() - t0), end=' ')
print()
heaps.reorder()
print("GT time: %.3f s" % (time.time() - t0))
return gt_I
if knngraph:
if gt_cachefile and os.path.exists(gt_cachefile):
print("load GT", gt_cachefile)
gt_I = np.load(gt_cachefile)
else:
gt_I = compute_GT()
if gt_cachefile:
print("store GT", gt_cachefile)
np.save(gt_cachefile, gt_I)
#################################################################
# Prepare the vector transformation object (pure CPU)
#################################################################
def train_preprocessor():
print("train preproc", preproc_str)
d = xt.shape[1]
t0 = time.time()
if preproc_str.startswith('OPQ'):
fi = preproc_str[3:-1].split('_')
m = int(fi[0])
dout = int(fi[1]) if len(fi) == 2 else d
preproc = faiss.OPQMatrix(d, m, dout)
elif preproc_str.startswith('PCAR'):
dout = int(preproc_str[4:-1])
preproc = faiss.PCAMatrix(d, dout, 0, True)
else:
assert False
preproc.train(sanitize(xt[:1000000]))
print("preproc train done in %.3f s" % (time.time() - t0))
return preproc
def get_preprocessor():
if preproc_str:
if not preproc_cachefile or not os.path.exists(preproc_cachefile):
preproc = train_preprocessor()
if preproc_cachefile:
print("store", preproc_cachefile)
faiss.write_VectorTransform(preproc, preproc_cachefile)
else:
print("load", preproc_cachefile)
preproc = faiss.read_VectorTransform(preproc_cachefile)
else:
d = xb.shape[1]
preproc = IdentPreproc(d)
return preproc
#################################################################
# Prepare the coarse quantizer
#################################################################
def train_coarse_quantizer(x, k, preproc):
d = preproc.d_out
clus = faiss.Clustering(d, k)
clus.verbose = True
# clus.niter = 2
clus.max_points_per_centroid = 10000000
print("apply preproc on shape", x.shape, 'k=', k)
t0 = time.time()
x = preproc.apply_py(sanitize(x))
print(" preproc %.3f s output shape %s" % (
time.time() - t0, x.shape))
vres, vdev = make_vres_vdev()
index = faiss.index_cpu_to_gpu_multiple(
vres, vdev, faiss.IndexFlatL2(d))
clus.train(x, index)
centroids = faiss.vector_float_to_array(clus.centroids)
return centroids.reshape(k, d)
def prepare_coarse_quantizer(preproc):
if cent_cachefile and os.path.exists(cent_cachefile):
print("load centroids", cent_cachefile)
centroids = np.load(cent_cachefile)
else:
nt = max(1000000, 256 * ncent)
print("train coarse quantizer...")
t0 = time.time()
centroids = train_coarse_quantizer(xt[:nt], ncent, preproc)
print("Coarse train time: %.3f s" % (time.time() - t0))
if cent_cachefile:
print("store centroids", cent_cachefile)
np.save(cent_cachefile, centroids)
coarse_quantizer = faiss.IndexFlatL2(preproc.d_out)
coarse_quantizer.add(centroids)
return coarse_quantizer
#################################################################
# Make index and add elements to it
#################################################################
def prepare_trained_index(preproc):
coarse_quantizer = prepare_coarse_quantizer(preproc)
d = preproc.d_out
if pqflat_str == 'Flat':
print("making an IVFFlat index")
idx_model = faiss.IndexIVFFlat(coarse_quantizer, d, ncent,
faiss.METRIC_L2)
else:
m = int(pqflat_str[2:])
assert m < 56 or use_float16, "PQ%d will work only with -float16" % m
print("making an IVFPQ index, m = ", m)
idx_model = faiss.IndexIVFPQ(coarse_quantizer, d, ncent, m, 8)
coarse_quantizer.this.disown()
idx_model.own_fields = True
# finish training on CPU
t0 = time.time()
print("Training vector codes")
x = preproc.apply_py(sanitize(xt[:1000000]))
idx_model.train(x)
print(" done %.3f s" % (time.time() - t0))
return idx_model
def compute_populated_index(preproc):
"""Add elements to a sharded index. Return the index and if available
a sharded gpu_index that contains the same data. """
indexall = prepare_trained_index(preproc)
co = faiss.GpuMultipleClonerOptions()
co.useFloat16 = use_float16
co.useFloat16CoarseQuantizer = False
co.usePrecomputed = use_precomputed_tables
co.indicesOptions = faiss.INDICES_CPU
co.verbose = True
co.reserveVecs = max_add if max_add > 0 else xb.shape[0]
co.shard = True
assert co.shard_type in (0, 1, 2)
vres, vdev = make_vres_vdev()
gpu_index = faiss.index_cpu_to_gpu_multiple(
vres, vdev, indexall, co)
print("add...")
t0 = time.time()
nb = xb.shape[0]
for i0, xs in dataset_iterator(xb, preproc, add_batch_size):
i1 = i0 + xs.shape[0]
gpu_index.add_with_ids(xs, np.arange(i0, i1))
if max_add > 0 and gpu_index.ntotal > max_add:
print("Flush indexes to CPU")
for i in range(ngpu):
index_src_gpu = faiss.downcast_index(gpu_index.at(i))
index_src = faiss.index_gpu_to_cpu(index_src_gpu)
print(" index %d size %d" % (i, index_src.ntotal))
index_src.copy_subset_to(indexall, 0, 0, nb)
index_src_gpu.reset()
index_src_gpu.reserveMemory(max_add)
gpu_index.sync_with_shard_indexes()
print('\r%d/%d (%.3f s) ' % (
i0, nb, time.time() - t0), end=' ')
sys.stdout.flush()
print("Add time: %.3f s" % (time.time() - t0))
print("Aggregate indexes to CPU")
t0 = time.time()
if hasattr(gpu_index, 'at'):
# it is a sharded index
for i in range(ngpu):
index_src = faiss.index_gpu_to_cpu(gpu_index.at(i))
print(" index %d size %d" % (i, index_src.ntotal))
index_src.copy_subset_to(indexall, 0, 0, nb)
else:
# simple index
index_src = faiss.index_gpu_to_cpu(gpu_index)
index_src.copy_subset_to(indexall, 0, 0, nb)
print(" done in %.3f s" % (time.time() - t0))
if max_add > 0:
# it does not contain all the vectors
gpu_index = None
return gpu_index, indexall
def compute_populated_index_2(preproc):
indexall = prepare_trained_index(preproc)
# set up a 3-stage pipeline that does:
# - stage 1: load + preproc
# - stage 2: assign on GPU
# - stage 3: add to index
stage1 = dataset_iterator(xb, preproc, add_batch_size)
vres, vdev = make_vres_vdev()
coarse_quantizer_gpu = faiss.index_cpu_to_gpu_multiple(
vres, vdev, indexall.quantizer)
def quantize(args):
(i0, xs) = args
_, assign = coarse_quantizer_gpu.search(xs, 1)
return i0, xs, assign.ravel()
stage2 = rate_limited_imap(quantize, stage1)
print("add...")
t0 = time.time()
nb = xb.shape[0]
for i0, xs, assign in stage2:
i1 = i0 + xs.shape[0]
if indexall.__class__ == faiss.IndexIVFPQ:
indexall.add_core_o(i1 - i0, faiss.swig_ptr(xs),
None, None, faiss.swig_ptr(assign))
elif indexall.__class__ == faiss.IndexIVFFlat:
indexall.add_core(i1 - i0, faiss.swig_ptr(xs), None,
faiss.swig_ptr(assign))
else:
assert False
print('\r%d/%d (%.3f s) ' % (
i0, nb, time.time() - t0), end=' ')
sys.stdout.flush()
print("Add time: %.3f s" % (time.time() - t0))
return None, indexall
def get_populated_index(preproc):
if not index_cachefile or not os.path.exists(index_cachefile):
if not altadd:
gpu_index, indexall = compute_populated_index(preproc)
else:
gpu_index, indexall = compute_populated_index_2(preproc)
if index_cachefile:
print("store", index_cachefile)
faiss.write_index(indexall, index_cachefile)
else:
print("load", index_cachefile)
indexall = faiss.read_index(index_cachefile)
gpu_index = None
co = faiss.GpuMultipleClonerOptions()
co.useFloat16 = use_float16
co.useFloat16CoarseQuantizer = False
co.usePrecomputed = use_precomputed_tables
co.indicesOptions = 0
co.verbose = True
co.shard = True # the replicas will be made "manually"
t0 = time.time()
print("CPU index contains %d vectors, move to GPU" % indexall.ntotal)
if replicas == 1:
if not gpu_index:
print("copying loaded index to GPUs")
vres, vdev = make_vres_vdev()
index = faiss.index_cpu_to_gpu_multiple(
vres, vdev, indexall, co)
else:
index = gpu_index
else:
del gpu_index # We override the GPU index
print("Copy CPU index to %d sharded GPU indexes" % replicas)
index = faiss.IndexReplicas()
for i in range(replicas):
            # integer division keeps the GPU range bounds as ints (py3-safe)
            gpu0 = ngpu * i // replicas
            gpu1 = ngpu * (i + 1) // replicas
vres, vdev = make_vres_vdev(gpu0, gpu1)
print(" dispatch to GPUs %d:%d" % (gpu0, gpu1))
index1 = faiss.index_cpu_to_gpu_multiple(
vres, vdev, indexall, co)
index1.this.disown()
index.addIndex(index1)
index.own_fields = True
del indexall
print("move to GPU done in %.3f s" % (time.time() - t0))
return index
#################################################################
# Perform search
#################################################################
def eval_dataset(index, preproc):
ps = faiss.GpuParameterSpace()
ps.initialize(index)
nq_gt = gt_I.shape[0]
print("search...")
sl = query_batch_size
nq = xq.shape[0]
for nprobe in nprobes:
ps.set_index_parameter(index, 'nprobe', nprobe)
t0 = time.time()
if sl == 0:
D, I = index.search(preproc.apply_py(sanitize(xq)), nnn)
else:
I = np.empty((nq, nnn), dtype='int32')
D = np.empty((nq, nnn), dtype='float32')
inter_res = ''
for i0, xs in dataset_iterator(xq, preproc, sl):
print('\r%d/%d (%.3f s%s) ' % (
i0, nq, time.time() - t0, inter_res), end=' ')
sys.stdout.flush()
i1 = i0 + xs.shape[0]
Di, Ii = index.search(xs, nnn)
I[i0:i1] = Ii
D[i0:i1] = Di
if knngraph and not inter_res and i1 >= nq_gt:
ires = eval_intersection_measure(
gt_I[:, :nnn], I[:nq_gt])
inter_res = ', %.4f' % ires
t1 = time.time()
if knngraph:
ires = eval_intersection_measure(gt_I[:, :nnn], I[:nq_gt])
print(" probe=%-3d: %.3f s rank-%d intersection results: %.4f" % (
nprobe, t1 - t0, nnn, ires))
else:
print(" probe=%-3d: %.3f s" % (nprobe, t1 - t0), end=' ')
gtc = gt_I[:, :1]
nq = xq.shape[0]
for rank in 1, 10, 100:
if rank > nnn: continue
nok = (I[:, :rank] == gtc).sum()
print("1-R@%d: %.4f" % (rank, nok / float(nq)), end=' ')
print()
if I_fname:
            I_fname_i = I_fname % nprobe
            print("storing", I_fname_i)
            np.save(I_fname_i, I)
        if D_fname:
            D_fname_i = D_fname % nprobe
            print("storing", D_fname_i)
            np.save(D_fname_i, D)
#################################################################
# Driver
#################################################################
preproc = get_preprocessor()
index = get_populated_index(preproc)
eval_dataset(index, preproc)
# make sure index is deleted before the resources
del index

View File

@@ -0,0 +1,92 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import os
import time
import numpy as np
import faiss
from datasets import load_sift1M, evaluate
print("load data")
xb, xq, xt, gt = load_sift1M()
nq, d = xq.shape
# we need only a StandardGpuResources per GPU
res = faiss.StandardGpuResources()
#################################################################
# Exact search experiment
#################################################################
print("============ Exact search")
flat_config = faiss.GpuIndexFlatConfig()
flat_config.device = 0
index = faiss.GpuIndexFlatL2(res, d, flat_config)
print("add vectors to index")
index.add(xb)
print("warmup")
index.search(xq, 123)
print("benchmark")
for lk in range(11):
k = 1 << lk
t, r = evaluate(index, xq, gt, k)
# the recall should be 1 at all times
print("k=%d %.3f ms, R@1 %.4f" % (k, t, r[1]))
#################################################################
# Approximate search experiment
#################################################################
print("============ Approximate search")
index = faiss.index_factory(d, "IVF4096,PQ64")
# faster, uses more memory
# index = faiss.index_factory(d, "IVF16384,Flat")
co = faiss.GpuClonerOptions()
# here we are using a 64-byte PQ, so we must set the lookup tables to
# 16 bit float (this is due to the limited temporary memory).
co.useFloat16 = True
index = faiss.index_cpu_to_gpu(res, 0, index, co)
print("train")
index.train(xt)
print("add vectors to index")
index.add(xb)
print("warmup")
index.search(xq, 123)
print("benchmark")
for lnprobe in range(10):
nprobe = 1 << lnprobe
    # read the attribute first: a mistyped name raises immediately, whereas
    # assigning to a missing SWIG attribute would silently create a new one
    index.nprobe
    index.nprobe = nprobe
t, r = evaluate(index, xq, gt, 100)
print("nprobe=%4d %.3f ms recalls= %.4f %.4f %.4f" % (nprobe, t, r[1], r[10], r[100]))

View File

@@ -0,0 +1,314 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#include <cstdio>
#include <vector>
#include <cinttypes>
#include <faiss/impl/FaissAssert.h>
#include <faiss/utils/hamming.h>
#include <faiss/utils/random.h>
#include <faiss/utils/utils.h>
using namespace faiss;
// These implementations are currently slower than HammingComputerDefault so
// they are not in the main faiss anymore.
struct HammingComputerM8 {
const uint64_t* a;
int n;
HammingComputerM8() = default;
HammingComputerM8(const uint8_t* a8, int code_size) {
set(a8, code_size);
}
void set(const uint8_t* a8, int code_size) {
assert(code_size % 8 == 0);
a = (uint64_t*)a8;
n = code_size / 8;
}
int hamming(const uint8_t* b8) const {
const uint64_t* b = (uint64_t*)b8;
int accu = 0;
for (int i = 0; i < n; i++)
accu += popcount64(a[i] ^ b[i]);
return accu;
}
inline int get_code_size() const {
return n * 8;
}
};
struct HammingComputerM4 {
const uint32_t* a;
int n;
HammingComputerM4() = default;
HammingComputerM4(const uint8_t* a4, int code_size) {
set(a4, code_size);
}
void set(const uint8_t* a4, int code_size) {
assert(code_size % 4 == 0);
a = (uint32_t*)a4;
n = code_size / 4;
}
int hamming(const uint8_t* b8) const {
const uint32_t* b = (uint32_t*)b8;
int accu = 0;
for (int i = 0; i < n; i++)
accu += popcount64(a[i] ^ b[i]);
return accu;
}
inline int get_code_size() const {
return n * 4;
}
};
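// Usage sketch (illustrative buffers): both structs follow the faiss
// HammingComputer protocol -- bind one code, then stream others through it:
//
//   std::vector<uint8_t> a(16), b(16);            // two 128-bit codes
//   HammingComputerM8 hc(a.data(), (int)a.size());
//   int dist = hc.hamming(b.data());              // popcount of a XOR b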
template <class T>
void hamming_cpt_test(
int code_size,
uint8_t* data1,
uint8_t* data2,
int n,
int* rst) {
T computer(data1, code_size);
for (int i = 0; i < n; i++) {
rst[i] = computer.hamming(data2);
data2 += code_size;
}
}
template <int CODE_SIZE_IN_BITS>
void hamming_func_test(
const uint8_t* const x1,
const uint8_t* const x2,
const size_t n1,
const size_t n2,
uint64_t& sumv,
uint64_t& xorv) {
constexpr size_t CODE_SIZE_IN_BYTES = CODE_SIZE_IN_BITS / 8;
double t0 = faiss::getmillisecs();
uint64_t sumx = 0;
uint64_t xorx = 0;
const size_t nruns = 10;
    for (size_t irun = 0; irun < nruns; irun++) {
#pragma omp parallel reduction(+ : sumx, xorx)
{
#pragma omp for
for (size_t i = 0; i < n1; i++) {
uint64_t local_sum = 0;
uint64_t local_xor = 0;
const uint64_t* data1_ptr =
(const uint64_t*)(x1 + i * CODE_SIZE_IN_BYTES);
for (size_t j = 0; j < n2; j++) {
const uint64_t* data2_ptr =
(const uint64_t*)(x2 + j * CODE_SIZE_IN_BYTES);
uint64_t code = faiss::hamming<CODE_SIZE_IN_BITS>(
data1_ptr, data2_ptr);
local_sum += code;
local_xor ^= code;
}
sumx += local_sum;
xorx ^= local_xor;
}
}
}
sumv = sumx;
xorv = xorx;
double t1 = faiss::getmillisecs();
printf("hamming<%d>: %.3f msec, %" PRIX64 ", %" PRIX64 "\n",
CODE_SIZE_IN_BITS,
(t1 - t0) / nruns,
sumx,
xorx);
}
template <typename HammingComputerT, int CODE_SIZE_IN_BITS>
void hamming_computer_test(
const uint8_t* const x1,
const uint8_t* const x2,
const size_t n1,
const size_t n2,
uint64_t& sumv,
uint64_t& xorv) {
constexpr size_t CODE_SIZE_IN_BYTES = CODE_SIZE_IN_BITS / 8;
double t0 = faiss::getmillisecs();
uint64_t sumx = 0;
uint64_t xorx = 0;
const size_t nruns = 10;
for (size_t irun = 0; irun < nruns; irun++) {
sumx = 0;
xorx = 0;
#pragma omp parallel reduction(+ : sumx, xorx)
{
#pragma omp for
for (size_t i = 0; i < n1; i++) {
uint64_t local_sum = 0;
uint64_t local_xor = 0;
const uint8_t* data1_ptr = x1 + i * CODE_SIZE_IN_BYTES;
HammingComputerT hc(data1_ptr, CODE_SIZE_IN_BYTES);
for (size_t j = 0; j < n2; j++) {
const uint8_t* data2_ptr = x2 + j * CODE_SIZE_IN_BYTES;
uint64_t code = hc.hamming(data2_ptr);
local_sum += code;
local_xor ^= code;
}
sumx += local_sum;
xorx ^= local_xor;
}
}
}
sumv = sumx;
xorv = xorx;
double t1 = faiss::getmillisecs();
printf("HammingComputer<%zd>: %.3f msec, %" PRIX64 ", %" PRIX64 "\n",
CODE_SIZE_IN_BYTES,
(t1 - t0) / nruns,
sumx,
xorx);
}
int main() {
size_t n = 4 * 1000 * 1000;
std::vector<size_t> code_size = {128, 256, 512, 1000};
std::vector<uint8_t> x(n * code_size.back());
    // randomize the whole buffer, not just the first n bytes
    byte_rand(x.data(), x.size(), 12345);
int nrun = 100;
for (size_t cs : code_size) {
printf("benchmark with code_size=%zd n=%zd nrun=%d\n", cs, n, nrun);
double tot_t1 = 0, tot_t2 = 0, tot_t3 = 0;
#pragma omp parallel reduction(+ : tot_t1, tot_t2, tot_t3)
{
std::vector<int> rst_m4(n);
std::vector<int> rst_m8(n);
std::vector<int> rst_default(n);
#pragma omp for
for (int run = 0; run < nrun; run++) {
double t0, t1, t2, t3;
t0 = getmillisecs();
// new implem from Zilliz
hamming_cpt_test<HammingComputerDefault>(
cs, x.data(), x.data(), n, rst_default.data());
t1 = getmillisecs();
// M8
hamming_cpt_test<HammingComputerM8>(
cs, x.data(), x.data(), n, rst_m8.data());
t2 = getmillisecs();
// M4
hamming_cpt_test<HammingComputerM4>(
cs, x.data(), x.data(), n, rst_m4.data());
t3 = getmillisecs();
tot_t1 += t1 - t0;
tot_t2 += t2 - t1;
tot_t3 += t3 - t2;
}
for (int i = 0; i < n; i++) {
FAISS_THROW_IF_NOT_FMT(
(rst_m4[i] == rst_m8[i] && rst_m4[i] == rst_default[i]),
"wrong result i=%d, m4 %d m8 %d default %d",
i,
rst_m4[i],
rst_m8[i],
rst_default[i]);
}
}
printf("Hamming_Dft implem: %.3f ms\n", tot_t1 / nrun);
printf("Hamming_M8 implem: %.3f ms\n", tot_t2 / nrun);
printf("Hamming_M4 implem: %.3f ms\n", tot_t3 / nrun);
}
// evaluate various hamming<>() function calls
const size_t MAX_HAMMING_FUNC_CODE_SIZE = 512;
const size_t n1 = 65536;
const size_t n2 = 16384;
std::vector<uint8_t> x1(n1 * MAX_HAMMING_FUNC_CODE_SIZE / 8);
std::vector<uint8_t> x2(n2 * MAX_HAMMING_FUNC_CODE_SIZE / 8);
byte_rand(x1.data(), x1.size(), 12345);
byte_rand(x2.data(), x2.size(), 23456);
// These two values serve as a kind of CRC.
uint64_t sumx = 0;
uint64_t xorx = 0;
hamming_func_test<64>(x1.data(), x2.data(), n1, n2, sumx, xorx);
hamming_func_test<128>(x1.data(), x2.data(), n1, n2, sumx, xorx);
hamming_func_test<256>(x1.data(), x2.data(), n1, n2, sumx, xorx);
hamming_func_test<384>(x1.data(), x2.data(), n1, n2, sumx, xorx);
hamming_func_test<512>(x1.data(), x2.data(), n1, n2, sumx, xorx);
// evaluate various HammingComputerXX
hamming_computer_test<faiss::HammingComputer4, 32>(
x1.data(), x2.data(), n1, n2, sumx, xorx);
hamming_computer_test<faiss::HammingComputer8, 64>(
x1.data(), x2.data(), n1, n2, sumx, xorx);
hamming_computer_test<faiss::HammingComputer16, 128>(
x1.data(), x2.data(), n1, n2, sumx, xorx);
hamming_computer_test<faiss::HammingComputer20, 160>(
x1.data(), x2.data(), n1, n2, sumx, xorx);
hamming_computer_test<faiss::HammingComputer32, 256>(
x1.data(), x2.data(), n1, n2, sumx, xorx);
hamming_computer_test<faiss::HammingComputer64, 512>(
x1.data(), x2.data(), n1, n2, sumx, xorx);
// evaluate various GenHammingDistanceComputerXX
hamming_computer_test<faiss::GenHammingComputer8, 64>(
x1.data(), x2.data(), n1, n2, sumx, xorx);
hamming_computer_test<faiss::GenHammingComputer16, 128>(
x1.data(), x2.data(), n1, n2, sumx, xorx);
hamming_computer_test<faiss::GenHammingComputer32, 256>(
x1.data(), x2.data(), n1, n2, sumx, xorx);
hamming_computer_test<faiss::GenHammingComputerM8, 64>(
x1.data(), x2.data(), n1, n2, sumx, xorx);
hamming_computer_test<faiss::GenHammingComputerM8, 128>(
x1.data(), x2.data(), n1, n2, sumx, xorx);
hamming_computer_test<faiss::GenHammingComputerM8, 256>(
x1.data(), x2.data(), n1, n2, sumx, xorx);
hamming_computer_test<faiss::GenHammingComputerM8, 512>(
x1.data(), x2.data(), n1, n2, sumx, xorx);
return 0;
}

View File

@@ -0,0 +1,29 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import time
import numpy as np
import faiss
if __name__ == "__main__":
faiss.omp_set_num_threads(1)
for d in 4, 8, 16, 13:
nq = 10000
nb = 30000
print('Bits per vector = 8 *', d)
xq = faiss.randint((nq, d // 4), seed=1234, vmax=256**4).view('uint8')
xb = faiss.randint((nb, d // 4), seed=1234, vmax=256**4).view('uint8')
for variant in "hc", "mc":
print(f"{variant=:}", end="\t")
for k in 1, 4, 16, 64, 256:
times = []
for _run in range(5):
t0 = time.time()
D, I = faiss.knn_hamming(xq, xb, k, variant=variant)
t1 = time.time()
times.append(t1 - t0)
print(f'| {k=:} t={np.mean(times):.3f} s ± {np.std(times):.3f} ', flush=True, end="")
print()

View File

@@ -0,0 +1,139 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#include <cstdio>
#include <faiss/impl/FaissAssert.h>
#include <faiss/utils/Heap.h>
#include <faiss/utils/random.h>
#include <faiss/utils/utils.h>
using namespace faiss;
// Reference implementation: keep the k largest entries of x in a min-heap
// whose root is the smallest kept value; pop + push on each improvement.
void addn_default(
size_t n,
size_t k,
const float* x,
int64_t* heap_ids,
float* heap_val) {
for (size_t i = 0; i < k; i++) {
minheap_push(i + 1, heap_val, heap_ids, x[i], i);
}
for (size_t i = k; i < n; i++) {
if (x[i] > heap_val[0]) {
minheap_pop(k, heap_val, heap_ids);
minheap_push(k, heap_val, heap_ids, x[i], i);
}
}
minheap_reorder(k, heap_val, heap_ids);
}
void addn_replace(
size_t n,
size_t k,
const float* x,
int64_t* heap_ids,
float* heap_val) {
for (size_t i = 0; i < k; i++) {
minheap_push(i + 1, heap_val, heap_ids, x[i], i);
}
for (size_t i = k; i < n; i++) {
if (x[i] > heap_val[0]) {
minheap_replace_top(k, heap_val, heap_ids, x[i], i);
}
}
minheap_reorder(k, heap_val, heap_ids);
}
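// Note: minheap_replace_top above fuses the pop + push of addn_default into a
// single top-down sift, roughly halving the heap work per replaced element.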
void addn_func(
size_t n,
size_t k,
const float* x,
int64_t* heap_ids,
float* heap_val) {
minheap_heapify(k, heap_val, heap_ids);
minheap_addn(k, heap_val, heap_ids, x, nullptr, n);
minheap_reorder(k, heap_val, heap_ids);
}
int main() {
size_t n = 10 * 1000 * 1000;
std::vector<size_t> ks({20, 50, 100, 200, 500, 1000, 2000, 5000});
std::vector<float> x(n);
float_randn(x.data(), n, 12345);
int nrun = 100;
for (size_t k : ks) {
printf("benchmark with k=%zd n=%zd nrun=%d\n", k, n, nrun);
FAISS_THROW_IF_NOT(k < n);
double tot_t1 = 0, tot_t2 = 0, tot_t3 = 0;
#pragma omp parallel reduction(+ : tot_t1, tot_t2, tot_t3)
{
std::vector<float> heap_dis(k);
std::vector<float> heap_dis_2(k);
std::vector<float> heap_dis_3(k);
std::vector<int64_t> heap_ids(k);
std::vector<int64_t> heap_ids_2(k);
std::vector<int64_t> heap_ids_3(k);
#pragma omp for
for (int run = 0; run < nrun; run++) {
double t0, t1, t2, t3;
t0 = getmillisecs();
// default implem
addn_default(n, k, x.data(), heap_ids.data(), heap_dis.data());
t1 = getmillisecs();
// new implem from Zilliz
addn_replace(
n, k, x.data(), heap_ids_2.data(), heap_dis_2.data());
t2 = getmillisecs();
// with addn
addn_func(n, k, x.data(), heap_ids_3.data(), heap_dis_3.data());
t3 = getmillisecs();
tot_t1 += t1 - t0;
tot_t2 += t2 - t1;
tot_t3 += t3 - t2;
}
for (size_t i = 0; i < k; i++) {
FAISS_THROW_IF_NOT_FMT(
heap_ids[i] == heap_ids_2[i],
"i=%ld (%ld, %g) != (%ld, %g)",
i,
size_t(heap_ids[i]),
heap_dis[i],
size_t(heap_ids_2[i]),
heap_dis_2[i]);
FAISS_THROW_IF_NOT(heap_dis[i] == heap_dis_2[i]);
}
for (size_t i = 0; i < k; i++) {
FAISS_THROW_IF_NOT(heap_ids[i] == heap_ids_3[i]);
FAISS_THROW_IF_NOT(heap_dis[i] == heap_dis_3[i]);
}
}
printf("default implem: %.3f ms\n", tot_t1 / nrun);
printf("replace implem: %.3f ms\n", tot_t2 / nrun);
printf("addn implem: %.3f ms\n", tot_t3 / nrun);
}
return 0;
}

View File

@@ -0,0 +1,192 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import time
import sys
import numpy as np
import faiss
try:
from faiss.contrib.datasets_fb import DatasetSIFT1M
except ImportError:
from faiss.contrib.datasets import DatasetSIFT1M
# from datasets import load_sift1M
k = int(sys.argv[1])
todo = sys.argv[2:]
print("load data")
# xb, xq, xt, gt = load_sift1M()
ds = DatasetSIFT1M()
xq = ds.get_queries()
xb = ds.get_database()
gt = ds.get_groundtruth()
xt = ds.get_train()
nq, d = xq.shape
if todo == []:
todo = 'hnsw hnsw_sq ivf ivf_hnsw_quantizer kmeans kmeans_hnsw nsg'.split()
def evaluate(index):
# for timing with a single core
# faiss.omp_set_num_threads(1)
t0 = time.time()
D, I = index.search(xq, k)
t1 = time.time()
missing_rate = (I == -1).sum() / float(k * nq)
recall_at_1 = (I == gt[:, :1]).sum() / float(nq)
print("\t %7.3f ms per query, R@1 %.4f, missing rate %.4f" % (
(t1 - t0) * 1000.0 / nq, recall_at_1, missing_rate))
if 'hnsw' in todo:
print("Testing HNSW Flat")
index = faiss.IndexHNSWFlat(d, 32)
# training is not needed
# this is the default, higher is more accurate and slower to
# construct
index.hnsw.efConstruction = 40
print("add")
# to see progress
index.verbose = True
index.add(xb)
print("search")
for efSearch in 16, 32, 64, 128, 256:
for bounded_queue in [True, False]:
print("efSearch", efSearch, "bounded queue", bounded_queue, end=' ')
index.hnsw.search_bounded_queue = bounded_queue
index.hnsw.efSearch = efSearch
evaluate(index)
if 'hnsw_sq' in todo:
print("Testing HNSW with a scalar quantizer")
# also set M so that the vectors and links both use 128 bytes per
# entry (total 256 bytes)
index = faiss.IndexHNSWSQ(d, faiss.ScalarQuantizer.QT_8bit, 16)
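    # worked sizing (sketch): QT_8bit stores 1 byte/dim * 128 dims = 128 B of
    # codes; M=16 gives 2*M = 32 level-0 links * 4-byte ids = 128 B of links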
print("training")
# training for the scalar quantizer
index.train(xt)
# this is the default, higher is more accurate and slower to
# construct
index.hnsw.efConstruction = 40
print("add")
# to see progress
index.verbose = True
index.add(xb)
print("search")
for efSearch in 16, 32, 64, 128, 256:
print("efSearch", efSearch, end=' ')
index.hnsw.efSearch = efSearch
evaluate(index)
if 'ivf' in todo:
print("Testing IVF Flat (baseline)")
quantizer = faiss.IndexFlatL2(d)
index = faiss.IndexIVFFlat(quantizer, d, 16384)
index.cp.min_points_per_centroid = 5 # quiet warning
# to see progress
index.verbose = True
print("training")
index.train(xt)
print("add")
index.add(xb)
print("search")
for nprobe in 1, 4, 16, 64, 256:
print("nprobe", nprobe, end=' ')
index.nprobe = nprobe
evaluate(index)
if 'ivf_hnsw_quantizer' in todo:
print("Testing IVF Flat with HNSW quantizer")
quantizer = faiss.IndexHNSWFlat(d, 32)
index = faiss.IndexIVFFlat(quantizer, d, 16384)
index.cp.min_points_per_centroid = 5 # quiet warning
    # 2: run k-means on a flat index, then add the centroids to the HNSW quantizer
    index.quantizer_trains_alone = 2
# to see progress
index.verbose = True
print("training")
index.train(xt)
print("add")
index.add(xb)
print("search")
quantizer.hnsw.efSearch = 64
for nprobe in 1, 4, 16, 64, 256:
print("nprobe", nprobe, end=' ')
index.nprobe = nprobe
evaluate(index)
# Bonus: 2 kmeans tests
if 'kmeans' in todo:
print("Performing kmeans on sift1M database vectors (baseline)")
clus = faiss.Clustering(d, 16384)
clus.verbose = True
clus.niter = 10
index = faiss.IndexFlatL2(d)
clus.train(xb, index)
if 'kmeans_hnsw' in todo:
print("Performing kmeans on sift1M using HNSW assignment")
clus = faiss.Clustering(d, 16384)
clus.verbose = True
clus.niter = 10
index = faiss.IndexHNSWFlat(d, 32)
# increase the default efSearch, otherwise the number of empty
# clusters is too high.
index.hnsw.efSearch = 128
clus.train(xb, index)
if 'nsg' in todo:
print("Testing NSG Flat")
index = faiss.IndexNSGFlat(d, 32)
index.build_type = 1
# training is not needed
print("add")
# to see progress
index.verbose = True
index.add(xb)
print("search")
for search_L in -1, 16, 32, 64, 128, 256:
print("search_L", search_L, end=' ')
index.nsg.search_L = search_L
evaluate(index)

View File

@@ -0,0 +1,599 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import argparse
import os
import pickle
import time
from multiprocessing.pool import ThreadPool
import faiss
import numpy as np
try:
from faiss.contrib.datasets_fb import dataset_from_name
except ImportError:
from faiss.contrib.datasets import dataset_from_name
from faiss.contrib.evaluation import OperatingPointsWithRanges
from faiss.contrib.ivf_tools import replace_ivf_quantizer
#################################################################
# Preassigned search functions
#################################################################
def search_preassigned(xq, k, index, quantizer, batch_size=0):
"""
Explicitly call the coarse quantizer and the search_preassigned
on the index.
"""
n, d = xq.shape
nprobe = index.nprobe
if batch_size == 0:
batch_size = n + 1
D = np.empty((n, k), dtype='float32')
I = np.empty((n, k), dtype='int64')
for i0 in range(0, n, batch_size):
Dq, Iq = quantizer.search(xq[i0:i0 + batch_size], nprobe)
D[i0:i0 + batch_size], I[i0:i0 + batch_size] = \
index.search_preassigned(xq[i0:i0 + batch_size], k, Iq, Dq)
return D, I
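# Usage sketch (hypothetical index and queries): when `quantizer` holds the
# same centroids as index.quantizer, this is equivalent to index.search(xq, k):
#
#   quantizer = faiss.IndexFlatL2(index.d)
#   quantizer.add(index.quantizer.reconstruct_n())
#   D, I = search_preassigned(xq, 10, index, quantizer, batch_size=8192)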
def tiled_search_preassigned(xq, k, index, quantizer, batch_size=32768):
"""
Explicitly call the coarse quantizer and the search_preassigned
on the index. Allow overlapping between coarse quantization and
scanning the inverted lists.
"""
n, d = xq.shape
# prepare a thread that will run the quantizer
qq_pool = ThreadPool(1)
nprobe = index.nprobe
def coarse_quant(i0):
if i0 >= n:
return None
i1 = min(i0 + batch_size, n)
return quantizer.search(xq[i0:i1], nprobe)
D = np.empty((n, k), dtype='float32')
I = np.empty((n, k), dtype='int64')
qq = coarse_quant(0)
for i0 in range(0, n, batch_size):
i1 = min(i0 + batch_size, n)
qq_next = qq_pool.apply_async(coarse_quant, (i0 + batch_size, ))
Dq, Iq = qq
index.search_preassigned(
xq[i0:i1], k, Iq=Iq, Dq=Dq, I=I[i0:i1], D=D[i0:i1])
qq = qq_next.get()
qq_pool.close()
return D, I
#################################################################
# IVF index objects with a separate coarse quantizer
#################################################################
class SeparateCoarseQuantizationIndex:
"""
Separately manage the coarse quantizer and the IVF index.
"""
def __init__(self, quantizer, index, bs=-1, seq_tiling=False):
self.index = index
self.index_ivf = extract_index_ivf(index)
if isinstance(self.index_ivf, faiss.IndexIVF):
            # read first so a missing attribute raises; parallel_mode 3 splits
            # the work over queries at a finer granularity than the default
            self.index_ivf.parallel_mode
            self.index_ivf.parallel_mode = 3
self.quantizer = quantizer
assert self.quantizer.d == self.index_ivf.d
# populate quantizer if it was not done before
if quantizer.ntotal > 0:
assert quantizer.ntotal == self.index_ivf.nlist
else:
centroids = self.index_ivf.quantizer.reconstruct_n()
print(f"adding centroids size {centroids.shape} to quantizer")
quantizer.train(centroids)
quantizer.add(centroids)
self.bs = bs
self.seq_tiling = seq_tiling
def search(self, xq, k):
# perform coarse quantization
if isinstance(self.index, faiss.IndexPreTransform):
# print("applying pre-transform")
assert self.index.chain.size() == 1
xq = self.index.chain.at(0).apply(xq)
if self.bs <= 0:
# non batched
nprobe = self.index_ivf.nprobe
Dq, Iq = self.quantizer.search(xq, nprobe)
return self.index_ivf.search_preassigned(xq, k, Iq, Dq)
if self.seq_tiling:
return search_preassigned(
xq, k, self.index_ivf, self.quantizer, self.bs)
else:
return tiled_search_preassigned(
xq, k, self.index_ivf, self.quantizer, self.bs)
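# Usage sketch (hypothetical): pair a GPU flat quantizer with a CPU IVF index;
# assumes `cpu_ivf` is a trained and populated faiss.IndexIVF:
#
#   res = faiss.StandardGpuResources()
#   gpu_quantizer = faiss.GpuIndexFlatL2(res, cpu_ivf.d)
#   hybrid = SeparateCoarseQuantizationIndex(gpu_quantizer, cpu_ivf, bs=65536)
#   D, I = hybrid.search(xq, 10)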
class ShardedGPUIndex:
"""
Multiple GPU indexes, each on its GPU, with a common coarse quantizer.
The Python version of IndexShardsIVF
"""
def __init__(self, quantizer, index, bs=-1, seq_tiling=False):
self.quantizer = quantizer
self.cpu_index = index
if isinstance(index, faiss.IndexPreTransform):
index = faiss.downcast_index(index.index)
ngpu = index.count()
self.pool = ThreadPool(ngpu)
self.bs = bs
if bs > 0:
self.q_pool = ThreadPool(1)
def __del__(self):
self.pool.close()
if self.bs > 0:
self.q_pool.close()
def search(self, xq, k):
nq = len(xq)
# perform coarse quantization
index = self.cpu_index
if isinstance(self.cpu_index, faiss.IndexPreTransform):
assert index.chain.size() == 1
xq = self.cpu_index.chain.at(0).apply(xq)
index = faiss.downcast_index(index.index)
ngpu = index.count()
sub_index_0 = faiss.downcast_index(index.at(0))
nprobe = sub_index_0.nprobe
Dall = np.empty((ngpu, nq, k), dtype='float32')
Iall = np.empty((ngpu, nq, k), dtype='int64')
bs = self.bs
if bs <= 0:
Dq, Iq = self.quantizer.search(xq, nprobe)
def do_search(rank):
gpu_index = faiss.downcast_index(index.at(rank))
Dall[rank], Iall[rank] = gpu_index.search_preassigned(
xq, k, Iq, Dq)
list(self.pool.map(do_search, range(ngpu)))
else:
qq_pool = self.q_pool
bs = self.bs
def coarse_quant(i0):
if i0 >= nq:
return None
return self.quantizer.search(xq[i0:i0 + bs], nprobe)
def do_search(rank, i0, qq):
gpu_index = faiss.downcast_index(index.at(rank))
Dq, Iq = qq
Dall[rank, i0:i0 + bs], Iall[rank, i0:i0 + bs] = \
gpu_index.search_preassigned(xq[i0:i0 + bs], k, Iq, Dq)
qq = coarse_quant(0)
for i0 in range(0, nq, bs):
qq_next = qq_pool.apply_async(coarse_quant, (i0 + bs, ))
list(self.pool.map(
lambda rank: do_search(rank, i0, qq),
range(ngpu)
))
qq = qq_next.get()
return faiss.merge_knn_results(Dall, Iall)
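# Note on the merge step above (hedged reading): faiss.merge_knn_results
# collapses the per-shard (ngpu, nq, k) distance/label arrays into a single
# (nq, k) result, keeping the k best entries per query across shards.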
def extract_index_ivf(index):
""" extract the IVF sub-index from the index, supporting GpuIndexes
as well """
try:
return faiss.extract_index_ivf(index)
except RuntimeError:
if index.__class__ == faiss.IndexPreTransform:
index = faiss.downcast_index(index.index)
if isinstance(index, faiss.GpuIndexIVF):
return index
raise RuntimeError(f"could not extract IVF index from {index}")
def set_index_parameter(index, name, val):
"""
Index parameter setting that works on the index lookalikes defined above
"""
if index.__class__ == SeparateCoarseQuantizationIndex:
if name == "nprobe":
set_index_parameter(index.index_ivf, name, val)
elif name.startswith("quantizer_"):
set_index_parameter(
index.quantizer, name[name.find("_") + 1:], val)
else:
raise RuntimeError(f"could not set param {name} on {index}")
return
if index.__class__ == ShardedGPUIndex:
if name == "nprobe":
set_index_parameter(index.cpu_index, name, val)
elif name.startswith("quantizer_"):
set_index_parameter(
index.quantizer, name[name.find("_") + 1:], val)
else:
raise RuntimeError(f"could not set param {name} on {index}")
return
# then it's a Faiss index
index = faiss.downcast_index(index)
if isinstance(index, faiss.IndexPreTransform):
set_index_parameter(index.index, name, val)
elif isinstance(index, faiss.IndexShardsIVF):
if name != "nprobe" and name.startswith("quantizer_"):
set_index_parameter(
index.quantizer, name[name.find("_") + 1:], val)
else:
for i in range(index.count()):
sub_index = index.at(i)
set_index_parameter(sub_index, name, val)
elif (isinstance(index, faiss.IndexShards) or
isinstance(index, faiss.IndexReplicas)):
for i in range(index.count()):
sub_index = index.at(i)
set_index_parameter(sub_index, name, val)
elif name.startswith("quantizer_"):
index_ivf = extract_index_ivf(index)
set_index_parameter(
index_ivf.quantizer, name[name.find("_") + 1:], val)
elif name == "efSearch":
index.hnsw.efSearch
index.hnsw.efSearch = int(val)
elif name == "nprobe":
index_ivf = extract_index_ivf(index)
index_ivf.nprobe
index_ivf.nprobe = int(val)
else:
raise RuntimeError(f"could not set param {name} on {index}")
#####################################################################
# Driver routine
#####################################################################
def main():
parser = argparse.ArgumentParser()
def aa(*args, **kwargs):
group.add_argument(*args, **kwargs)
group = parser.add_argument_group('dataset options')
aa('--nq', type=int, default=10**6,
help="nb queries (queries will be duplicated if below that number)")
aa('--db', default='bigann10M', help='dataset')
group = parser.add_argument_group('index options')
aa('--indexname', default="", help="override index name")
aa('--mmap', default=False, action='store_true', help='mmap index')
aa('--shard_type', default=1, type=int, help="set type of sharding")
aa('--useFloat16', default=False, action='store_true',
help='GPU cloner options')
aa('--useFloat16CoarseQuantizer', default=False, action='store_true',
help='GPU cloner options')
aa('--usePrecomputed', default=False, action='store_true',
help='GPU cloner options')
group = parser.add_argument_group('search options')
aa('--k', type=int, default=100)
aa('--search_type', default="cpu",
choices=[
"cpu", "gpu", "gpu_flat_quantizer",
"cpu_flat_gpu_quantizer", "gpu_tiled", "gpu_ivf_quantizer",
"multi_gpu", "multi_gpu_flat_quantizer",
"multi_gpu_sharded", "multi_gpu_flat_quantizer_sharded",
"multi_gpu_sharded1", "multi_gpu_sharded1_flat",
"multi_gpu_sharded1_ivf",
"multi_gpu_Csharded1", "multi_gpu_Csharded1_flat",
"multi_gpu_Csharded1_ivf",
],
help="how to search"
)
aa('--ivf_quant_nlist', type=int, default=1024,
help="nb of invlists for IVF quantizer")
aa('--batch_size', type=int, default=-1,
help="batch size for tiled CPU / GPU computation (-1= no tiling)")
aa('--n_autotune', type=int, default=300,
help="max nb of auto-tuning steps")
aa('--nt', type=int, default=-1, help="force number of CPU threads to this")
group = parser.add_argument_group('output options')
aa('--quiet', default=False, action="store_true")
aa('--stats', default="", help="pickle to store output stats")
args = parser.parse_args()
print("args:", args)
if not args.quiet:
# log some stats about the machine
os.system("grep -m1 'model name' < /proc/cpuinfo")
os.system("grep -E 'MemTotal|MemFree' /proc/meminfo")
os.system("nvidia-smi")
print("prepare dataset", args.db)
ds = dataset_from_name(args.db)
print(ds)
print("Faiss nb GPUs:", faiss.get_num_gpus())
xq = ds.get_queries()
if args.nq > len(xq):
xqx = []
n = 0
while n < args.nq:
xqx.append(xq[:args.nq - n])
n += len(xqx[-1])
print(f"increased nb queries from {len(xq)} to {n}")
xq = np.vstack(xqx)
if args.nt != -1:
print("setting nb openmp threads to", args.nt)
faiss.omp_set_num_threads(args.nt)
print("loading index")
if args.mmap:
io_flag = faiss.IO_FLAG_READ_ONLY | faiss.IO_FLAG_MMAP
else:
io_flag = 0
print(f"load index {args.indexname} {io_flag=:x}")
index = faiss.read_index(args.indexname, io_flag)
index_ivf = faiss.extract_index_ivf(index)
print("prepare index")
op = OperatingPointsWithRanges()
op.add_range(
"nprobe", [
2 ** i for i in range(20)
if 2 ** i < index_ivf.nlist * 0.1 and 2 ** i <= 4096
]
)
# prepare options for GPU clone
co = faiss.GpuMultipleClonerOptions()
co.useFloat16 = args.useFloat16
co.useFloat16CoarseQuantizer = args.useFloat16CoarseQuantizer
co.usePrecomputed = args.usePrecomputed
co.shard_type = args.shard_type
if args.search_type == "cpu":
op.add_range(
"quantizer_efSearch",
[2 ** i for i in range(10)]
)
elif args.search_type == "gpu":
print("move index to 1 GPU")
res = faiss.StandardGpuResources()
index = faiss.index_cpu_to_gpu(res, 0, index, co)
op.add_range(
"quantizer_efSearch",
[2 ** i for i in range(10)]
)
op.restrict_range("nprobe", 2049)
elif args.search_type == "gpu_tiled":
print("move index to 1 GPU")
new_quantizer = faiss.IndexFlatL2(index_ivf.d)
quantizer_hnsw = replace_ivf_quantizer(index_ivf, new_quantizer)
res = faiss.StandardGpuResources()
index = faiss.index_cpu_to_gpu(res, 0, index, co)
op.add_range(
"quantizer_efSearch",
[2 ** i for i in range(10)]
)
op.restrict_range("nprobe", 2049)
index = SeparateCoarseQuantizationIndex(
quantizer_hnsw, index, bs=args.batch_size)
elif args.search_type == "gpu_ivf_quantizer":
index_ivf = faiss.extract_index_ivf(index)
centroids = index_ivf.quantizer.reconstruct_n()
replace_ivf_quantizer(index_ivf, faiss.IndexFlatL2(index_ivf.d))
res = faiss.StandardGpuResources()
new_quantizer = faiss.index_factory(
index_ivf.d, f"IVF{args.ivf_quant_nlist},Flat")
new_quantizer.train(centroids)
new_quantizer.add(centroids)
index = SeparateCoarseQuantizationIndex(
faiss.index_cpu_to_gpu(res, 0, new_quantizer, co),
faiss.index_cpu_to_gpu(res, 0, index, co),
bs=args.batch_size, seq_tiling=True
)
op.add_range(
"quantizer_nprobe",
[2 ** i for i in range(9)]
)
op.restrict_range("nprobe", 1025)
elif args.search_type == "gpu_flat_quantizer":
index_ivf = faiss.extract_index_ivf(index)
new_quantizer = faiss.IndexFlatL2(index_ivf.d)
replace_ivf_quantizer(index_ivf, new_quantizer)
res = faiss.StandardGpuResources()
index = faiss.index_cpu_to_gpu(res, 0, index, co)
op.restrict_range("nprobe", 2049)
elif args.search_type == "cpu_flat_gpu_quantizer":
index_ivf = faiss.extract_index_ivf(index)
quantizer = faiss.IndexFlatL2(index_ivf.d)
res = faiss.StandardGpuResources()
quantizer = faiss.index_cpu_to_gpu(res, 0, quantizer, co)
index = SeparateCoarseQuantizationIndex(
quantizer, index, bs=args.batch_size)
op.restrict_range("nprobe", 2049)
elif args.search_type in ("multi_gpu", "multi_gpu_sharded"):
print(f"move index to {faiss.get_num_gpus()} GPU")
co.shard = "sharded" in args.search_type
index = faiss.index_cpu_to_all_gpus(index, co=co)
op.add_range(
"quantizer_efSearch",
[2 ** i for i in range(10)]
)
op.restrict_range("nprobe", 2049)
elif args.search_type in (
"multi_gpu_flat_quantizer", "multi_gpu_flat_quantizer_sharded"):
index_ivf = faiss.extract_index_ivf(index)
new_quantizer = faiss.IndexFlatL2(ds.d)
replace_ivf_quantizer(index_ivf, new_quantizer)
index = faiss.index_cpu_to_all_gpus(index, co=co)
op.restrict_range("nprobe", 2049)
elif args.search_type in (
"multi_gpu_sharded1", "multi_gpu_sharded1_flat",
"multi_gpu_sharded1_ivf"):
print(f"move index to {faiss.get_num_gpus()} GPU")
new_quantizer = faiss.IndexFlatL2(index_ivf.d)
hnsw_quantizer = replace_ivf_quantizer(index_ivf, new_quantizer)
co.shard  # touch first: a typo would raise here
co.shard = True
gpus = list(range(faiss.get_num_gpus()))
res = [faiss.StandardGpuResources() for _ in gpus]
index = faiss.index_cpu_to_gpu_multiple_py(res, index, co, gpus)
op.restrict_range("nprobe", 2049)
if args.search_type == "multi_gpu_sharded1":
op.add_range(
"quantizer_efSearch",
[2 ** i for i in range(10)]
)
index = ShardedGPUIndex(hnsw_quantizer, index, bs=args.batch_size)
elif args.search_type == "multi_gpu_sharded1_ivf":
centroids = hnsw_quantizer.storage.reconstruct_n()
quantizer = faiss.index_factory(
centroids.shape[1], f"IVF{args.ivf_quant_nlist},Flat")
quantizer.train(centroids)
quantizer.add(centroids)
co.shard = False
quantizer = faiss.index_cpu_to_gpu_multiple_py(
res, quantizer, co, gpus)
index = ShardedGPUIndex(quantizer, index, bs=args.batch_size)
op.add_range(
"quantizer_nprobe",
[2 ** i for i in range(9)]
)
op.restrict_range("nprobe", 1025)
elif args.search_type == "multi_gpu_sharded1_flat":
quantizer = hnsw_quantizer.storage
quantizer = faiss.index_cpu_to_gpu_multiple_py(
res, quantizer, co, gpus)
index = ShardedGPUIndex(quantizer, index, bs=args.batch_size)
else:
raise RuntimeError()
elif args.search_type in (
"multi_gpu_Csharded1", "multi_gpu_Csharded1_flat",
"multi_gpu_Csharded1_ivf"):
print(f"move index to {faiss.get_num_gpus()} GPU")
co.shard = True
co.common_ivf_quantizer  # touch first: a typo would raise here
co.common_ivf_quantizer = True
op.restrict_range("nprobe", 2049)
if args.search_type == "multi_gpu_Csharded1":
op.add_range(
"quantizer_efSearch",
[2 ** i for i in range(10)]
)
index = faiss.index_cpu_to_all_gpus(index, co)
elif args.search_type == "multi_gpu_Csharded1_flat":
new_quantizer = faiss.IndexFlatL2(index_ivf.d)
quantizer_hnsw = replace_ivf_quantizer(index_ivf, new_quantizer)
index = faiss.index_cpu_to_all_gpus(index, co)
elif args.search_type == "multi_gpu_Csharded1_ivf":
quantizer = faiss.index_factory(
index_ivf.d, f"IVF{args.ivf_quant_nlist},Flat")
quantizer_hnsw = replace_ivf_quantizer(index_ivf, quantizer)
op.add_range(
"quantizer_nprobe",
[2 ** i for i in range(9)]
)
index = faiss.index_cpu_to_all_gpus(index, co)
else:
raise RuntimeError()
else:
raise RuntimeError()
totex = op.num_experiments()
experiments = op.sample_experiments()
print(f"total nb experiments {totex}, running {len(experiments)}")
print("perform search")
gt = ds.get_groundtruth(100)
# piggyback on operating points so that this gets stored in the stats file
op.all_experiments = []
op.platform = {
"loadavg": open("/proc/loadavg", "r").readlines(),
"procesor": [l for l in open("/proc/cpuinfo") if "model name" in l][0],
"GPU": list(os.popen("nvidia-smi", "r")),
"mem": open("/proc/meminfo", "r").readlines(),
"pid": os.getpid()
}
op.args = args
if args.stats:
print(f"storing stats in {args.stats} after each experiment")
for cno in experiments:
key = op.cno_to_key(cno)
parameters = op.get_parameters(key)
print(f"{cno=:4d} {str(parameters):50}", end=": ", flush=True)
(max_perf, min_time) = op.predict_bounds(key)
if not op.is_pareto_optimal(max_perf, min_time):
print(f"SKIP, {max_perf=:.3f} {min_time=:.3f}", )
continue
for name, val in parameters.items():
set_index_parameter(index, name, val)
if cno == 0:
# warmup
for _ in range(5):
D, I = index.search(xq, 100)
t0 = time.time()
try:
D, I = index.search(xq, 100)
except RuntimeError as e:
print(f"ERROR {e}")
continue
t1 = time.time()
recalls = {}
for rank in 1, 10, 100:
recall = (gt[:, :1] == I[:ds.nq, :rank]).sum() / ds.nq
recalls[rank] = recall
print(f"time={t1 - t0:.3f} s recalls={recalls}")
perf = recalls[1]
op.add_operating_point(key, perf, t1 - t0)
op.all_experiments.append({
"cno": cno,
"key": key,
"parameters": parameters,
"time": t1 - t0,
"recalls": recalls
})
if args.stats:
pickle.dump(op, open(args.stats, "wb"))
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,86 @@
#!/usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import time
import os
import numpy as np
import faiss
from faiss.contrib.datasets import SyntheticDataset
os.system("grep -m1 'model name' < /proc/cpuinfo")
def format_tab(x):
return "\n".join("\t".join("%g" % xi for xi in row) for row in x)
faiss.cvar.distance_compute_min_k_reservoir = 5
# for have_threads in True, False:
for have_threads in False, :
if have_threads:
# good config for Intel(R) Xeon(R) CPU E5-2698 v4 @ 2.20GHz
nthread = 32
else:
nthread = 1
faiss.omp_set_num_threads(nthread)
print("************ nthread=", nthread)
for nq in 100, 10000:
print("*********** nq=", nq)
if nq == 100:
nrun = 500
unit = "ms"
else:
nrun = 20
unit = "s"
restab = []
for d in 16, 32, 64, 128:
print("========== d=", d)
nb = 10000
# d = 32
ds = SyntheticDataset(d, 0, nb, nq)
print(ds)
index = faiss.IndexFlatL2(d)
index.add(ds.get_database())
restab1 = []
restab.append(restab1)
for k in 1, 10, 100:
times = []
for run in range(nrun):
t0 = time.time()
index.search(ds.get_queries(), k)
t1 = time.time()
if run >= nrun // 5: # the rest is considered warmup
times.append((t1 - t0))
times = np.array(times)
if unit == "ms":
times *= 1000
print("search k=%3d t=%.3f %s (± %.4f)" % (
k, np.mean(times), unit, np.std(times)))
restab1.append(np.mean(times))
print("restab=\n", format_tab(restab))

View File

@@ -0,0 +1,22 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from __future__ import print_function
import faiss
from datasets import load_sift1M, evaluate
xb, xq, xt, gt = load_sift1M()
nq, d = xq.shape
k = 32
for nbits in 4, 6, 8, 10, 12:
index = faiss.IndexPQ(d, 8, nbits)
index.train(xt)
index.add(xb)
t, r = evaluate(index, xq, gt, k)
print("\t %7.3f ms per query, R@1 %.4f" % (t, r[1]))
del index

View File

@@ -0,0 +1,112 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import faiss
import time
import os
import multiprocessing as mp
import numpy as np
import matplotlib.pyplot as plt
try:
from faiss.contrib.datasets_fb import \
DatasetSIFT1M, DatasetDeep1B, DatasetBigANN
except ImportError:
from faiss.contrib.datasets import \
DatasetSIFT1M, DatasetDeep1B, DatasetBigANN
# ds = DatasetDeep1B(10**6)
# ds = DatasetBigANN(nb_M=1)
ds = DatasetSIFT1M()
xq = ds.get_queries()
xb = ds.get_database()
gt = ds.get_groundtruth()
xt = ds.get_train()
nb, d = xb.shape
nq, d = xq.shape
nt, d = xt.shape
k = 1
AQ = faiss.AdditiveQuantizer
def eval_recall(index, name):
t0 = time.time()
D, I = index.search(xq, k=k)
t = time.time() - t0
speed = t * 1000 / nq
qps = 1000 / speed
corrects = (gt[:, :1] == I).sum()  # compare against the true nearest neighbor only
recall = corrects / nq
print(
f'\tnprobe {index.nprobe:3d}, Recall@{k}: '
f'{recall:.6f}, speed: {speed:.6f} ms/query'
)
return recall, qps
def eval_and_plot(name, rescale_norm=True, plot=True):
index = faiss.index_factory(d, name)
index_path = f"indices/{name}.faissindex"
if os.path.exists(index_path):
index = faiss.read_index(index_path)
else:
faiss.omp_set_num_threads(mp.cpu_count())
index.train(xt)
index.add(xb)
faiss.write_index(index, index_path)
# search params
if hasattr(index, 'rescale_norm'):
index.rescale_norm = rescale_norm
name += f"(rescale_norm={rescale_norm})"
faiss.omp_set_num_threads(1)
data = []
print(f"======{name}")
for nprobe in 1, 2, 4, 6, 8, 12, 16, 24, 32, 48, 64, 128:
index.nprobe = nprobe
recall, qps = eval_recall(index, name)
data.append((recall, qps))
if plot:
data = np.array(data)
plt.plot(data[:, 0], data[:, 1], label=name) # x - recall, y - qps
M, nlist = 32, 1024
# just for warmup...
# eval_and_plot(f"IVF{nlist},PQ{M}x4fs", plot=False)
# benchmark
plt.figure(figsize=(8, 6), dpi=80)
# PQ
eval_and_plot(f"IVF{nlist},PQ{M}x4fs")
eval_and_plot(f"IVF{nlist},PQ{M}x4fsr")
# AQ, by_residual
eval_and_plot(f"IVF{nlist},LSQ{M-2}x4fsr_Nlsq2x4")
eval_and_plot(f"IVF{nlist},RQ{M-2}x4fsr_Nrq2x4")
eval_and_plot(f"IVF{nlist},LSQ{M-2}x4fsr_Nlsq2x4", rescale_norm=False)
eval_and_plot(f"IVF{nlist},RQ{M-2}x4fsr_Nrq2x4", rescale_norm=False)
# AQ, no by_residual
eval_and_plot(f"IVF{nlist},LSQ{M-2}x4fs_Nlsq2x4")
eval_and_plot(f"IVF{nlist},RQ{M-2}x4fs_Nrq2x4")
plt.title("Indices on SIFT1M")
plt.xlabel("Recall@1")
plt.ylabel("QPS")
plt.legend(bbox_to_anchor=(1.02, 0.1), loc='upper left', borderaxespad=0)
plt.savefig("bench_ivf_fastscan.png", bbox_inches='tight')

View File

@@ -0,0 +1,122 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import faiss
import time
import os
import multiprocessing as mp
import numpy as np
import matplotlib.pyplot as plt
try:
from faiss.contrib.datasets_fb import \
DatasetSIFT1M, DatasetDeep1B, DatasetBigANN
except ImportError:
from faiss.contrib.datasets import \
DatasetSIFT1M, DatasetDeep1B, DatasetBigANN
# ds = DatasetDeep1B(10**6)
ds = DatasetBigANN(nb_M=50)
# ds = DatasetSIFT1M()
xq = ds.get_queries()
xb = ds.get_database()
gt = ds.get_groundtruth()
xt = ds.get_train()
nb, d = xb.shape
nq, d = xq.shape
nt, d = xt.shape
print('database size {}, dimension {}'.format(nb, d))
k = 64
def eval_recall(index, name, single_query=False):
t0 = time.time()
D, I = index.search(xq, k=k)
t = time.time() - t0
if single_query:
t0 = time.time()
for row in range(nq):
Ds, Is = index.search(xq[row:row + 1], k=k)
D[row, :] = Ds
I[row, :] = Is
t = time.time() - t0
speed = t * 1000 / nq
qps = 1000 / speed
corrects = (gt[:, :1] == I[:, :k]).sum()
recall = corrects / nq
print(
f'\tnprobe {index.nprobe:3d}, 1Recall@{k}: '
f'{recall:.6f}, speed: {speed:.6f} ms/query'
)
return recall, qps
def eval_and_plot(
name, rescale_norm=True, plot=True, single_query=False,
implem=None, num_threads=1):
index = faiss.index_factory(d, name)
index_path = f"indices/{name}.faissindex"
if os.path.exists(index_path):
index = faiss.read_index(index_path)
else:
faiss.omp_set_num_threads(mp.cpu_count())
index.train(xt)
index.add(xb)
faiss.write_index(index, index_path)
# search params
if hasattr(index, 'rescale_norm'):
index.rescale_norm = rescale_norm
name += f"(rescale_norm={rescale_norm})"
if implem is not None and hasattr(index, 'implem'):
index.implem = implem
name += f"(implem={implem})"
if single_query:
name += f"(single_query={single_query})"
if num_threads > 1:
name += f"(num_threads={num_threads})"
faiss.omp_set_num_threads(num_threads)
data = []
print(f"======{name}")
for nprobe in 1, 4, 8, 16, 32, 64, 128, 256:
index.nprobe = nprobe
recall, qps = eval_recall(index, name, single_query=single_query)
data.append((recall, qps))
if plot:
data = np.array(data)
plt.plot(data[:, 0], data[:, 1], label=name) # x - recall, y - qps
M, nlist = 64, 4096
# just for warmup...
# eval_and_plot(f"IVF{nlist},PQ{M}x4fs", plot=False)
# benchmark
plt.figure(figsize=(8, 6), dpi=80)
eval_and_plot(f"IVF{nlist},PQ{M}x4fs", num_threads=8)
eval_and_plot(f"IVF{nlist},PQ{M}x4fs", single_query=True, implem=0, num_threads=8)
eval_and_plot(f"IVF{nlist},PQ{M}x4fs", single_query=True, implem=14, num_threads=8)
eval_and_plot(f"IVF{nlist},PQ{M}x4fs", single_query=True, implem=15, num_threads=8)
plt.title("Indices on Bigann50M")
plt.xlabel("1Recall@{}".format(k))
plt.ylabel("QPS")
plt.legend(bbox_to_anchor=(1.02, 0.1), loc='upper left', borderaxespad=0)
plt.savefig("bench_ivf_fastscan.png", bbox_inches='tight')

View File

@@ -0,0 +1,145 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#include <omp.h>
#include <unistd.h>
#include <memory>
#include <faiss/IVFlib.h>
#include <faiss/IndexIVF.h>
#include <faiss/impl/IDSelector.h>
#include <faiss/index_factory.h>
#include <faiss/index_io.h>
#include <faiss/utils/random.h>
#include <faiss/utils/utils.h>
/************************
* This benchmark attempts to measure the runtime overhead to use an IDSelector
* over doing an unconditional sequential scan. Unfortunately the results of the
* benchmark also depend a lot on the parallel_mode and the way
* search_with_parameters works.
*/
int main() {
using idx_t = faiss::idx_t;
int d = 64;
size_t nb = 1024 * 1024;
size_t nq = 512 * 16;
size_t k = 10;
std::vector<float> data((nb + nq) * d);
float* xb = data.data();
float* xq = data.data() + nb * d;
faiss::rand_smooth_vectors(nb + nq, d, data.data(), 1234);
std::unique_ptr<faiss::Index> index;
// const char *index_key = "IVF1024,Flat";
const char* index_key = "IVF1024,SQ8";
printf("index_key=%s\n", index_key);
std::string stored_name =
std::string("/tmp/bench_ivf_selector_") + index_key + ".faissindex";
if (access(stored_name.c_str(), F_OK) != 0) {
printf("creating index\n");
index.reset(faiss::index_factory(d, index_key));
double t0 = faiss::getmillisecs();
index->train(nb, xb);
double t1 = faiss::getmillisecs();
index->add(nb, xb);
double t2 = faiss::getmillisecs();
printf("train: %.3f ms, add: %.3f ms\n", t1 - t0, t2 - t1);
printf("Write %s\n", stored_name.c_str());
faiss::write_index(index.get(), stored_name.c_str());
} else {
printf("Read %s\n", stored_name.c_str());
index.reset(faiss::read_index(stored_name.c_str()));
}
faiss::IndexIVF* index_ivf = static_cast<faiss::IndexIVF*>(index.get());
index->verbose = true;
for (int tt = 0; tt < 3; tt++) {
if (tt == 1) {
index_ivf->parallel_mode = 3;
} else {
index_ivf->parallel_mode = 0;
}
if (tt == 2) {
printf("set single thread\n");
omp_set_num_threads(1);
}
printf("parallel_mode=%d\n", index_ivf->parallel_mode);
std::vector<float> D1(nq * k);
std::vector<idx_t> I1(nq * k);
{
double t2 = faiss::getmillisecs();
index->search(nq, xq, k, D1.data(), I1.data());
double t3 = faiss::getmillisecs();
printf("search time, no selector: %.3f ms\n", t3 - t2);
}
std::vector<float> D2(nq * k);
std::vector<idx_t> I2(nq * k);
{
double t2 = faiss::getmillisecs();
faiss::IVFSearchParameters params;
faiss::ivflib::search_with_parameters(
index.get(), nq, xq, k, D2.data(), I2.data(), &params);
double t3 = faiss::getmillisecs();
printf("search time with nullptr selector: %.3f ms\n", t3 - t2);
}
FAISS_THROW_IF_NOT(I1 == I2);
FAISS_THROW_IF_NOT(D1 == D2);
{
double t2 = faiss::getmillisecs();
faiss::IVFSearchParameters params;
faiss::IDSelectorAll sel;
params.sel = &sel;
faiss::ivflib::search_with_parameters(
index.get(), nq, xq, k, D2.data(), I2.data(), &params);
double t3 = faiss::getmillisecs();
printf("search time with selector: %.3f ms\n", t3 - t2);
}
FAISS_THROW_IF_NOT(I1 == I2);
FAISS_THROW_IF_NOT(D1 == D2);
std::vector<float> D3(nq * k);
std::vector<idx_t> I3(nq * k);
{
int nt = omp_get_max_threads();
double t2 = faiss::getmillisecs();
faiss::IVFSearchParameters params;
#pragma omp parallel for if (nt > 1)
for (idx_t slice = 0; slice < nt; slice++) {
idx_t i0 = nq * slice / nt;
idx_t i1 = nq * (slice + 1) / nt;
if (i1 > i0) {
faiss::ivflib::search_with_parameters(
index.get(),
i1 - i0,
xq + i0 * d,
k,
D3.data() + i0 * k,
I3.data() + i0 * k,
&params);
}
}
double t3 = faiss::getmillisecs();
printf("search time with null selector + manual parallel: %.3f ms\n",
t3 - t2);
}
FAISS_THROW_IF_NOT(I1 == I3);
FAISS_THROW_IF_NOT(D1 == D3);
}
return 0;
}

View File

@@ -0,0 +1,167 @@
# @lint-ignore-every LICENSELINT
# Copyright (c) Meta Platforms, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
# Copyright (c) 2024-2025, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import numpy as np
import faiss
import time
import argparse
import rmm
try:
from faiss.contrib.datasets_fb import \
DatasetSIFT1M, DatasetDeep1B, DatasetBigANN
except ImportError:
from faiss.contrib.datasets import \
DatasetSIFT1M, DatasetDeep1B, DatasetBigANN
# ds = DatasetDeep1B(10**6)
# ds = DatasetBigANN(nb_M=1)
ds = DatasetSIFT1M()
xq = ds.get_queries()
xb = ds.get_database()
gt = ds.get_groundtruth()
xt = ds.get_train()
nb, d = xb.shape
nq, d = xq.shape
nt, d = xt.shape
######################################################
# Command-line parsing
######################################################
parser = argparse.ArgumentParser()
def aa(*args, **kwargs):
group.add_argument(*args, **kwargs)
group = parser.add_argument_group('benchmarking options')
aa('--bm_train', default=True,
help='whether to benchmark train operation on GPU index')
aa('--bm_add', default=True,
help='whether to benchmark add operation on GPU index')
aa('--bm_search', default=True,
help='whether to benchmark search operation on GPU index')
group = parser.add_argument_group('IVF options')
aa('--nlist', default=1024, type=int,
help="number of IVF centroids")
group = parser.add_argument_group('searching')
aa('--k', default=10, type=int, help='nb of nearest neighbors')
aa('--nprobe', default=10, type=int, help='nb of IVF lists to probe')
args = parser.parse_args()
print("args:", args)
rs = np.random.RandomState(123)
res = faiss.StandardGpuResources()
# Use an RMM pool memory resource for device allocations
mr = rmm.mr.PoolMemoryResource(rmm.mr.CudaMemoryResource())
rmm.mr.set_current_device_resource(mr)
def bench_train_milliseconds(trainVecs, ncols, nlist, use_cuvs):
config = faiss.GpuIndexIVFFlatConfig()
config.use_cuvs = use_cuvs
index = faiss.GpuIndexIVFFlat(res, ncols, nlist, faiss.METRIC_L2, config)
t0 = time.time()
index.train(trainVecs)
return 1000*(time.time() - t0)
# warmup (faiss expects float32 input)
xw = rs.rand(nt, d).astype('float32')
bench_train_milliseconds(xw, d, args.nlist, True)
if args.bm_train:
print("=" * 40)
print("GPU Train Benchmarks")
print("=" * 40)
cuvs_gpu_train_time = bench_train_milliseconds(xt, d, args.nlist, True)
classical_gpu_train_time = bench_train_milliseconds(xt, d, args.nlist, False)
print("Method: IVFFlat, Operation: TRAIN, dim: %d, nlist %d, numTrain: %d, classical GPU train time: %.3f milliseconds, cuVS enabled GPU train time: %.3f milliseconds" % (
d, args.nlist, nt, classical_gpu_train_time, cuvs_gpu_train_time))
def bench_add_milliseconds(addVecs, q, use_cuvs):
# construct a GPU index using the same trained coarse quantizer
config = faiss.GpuIndexIVFFlatConfig()
config.use_cuvs = use_cuvs
index_gpu = faiss.GpuIndexIVFFlat(res, q, d, args.nlist, faiss.METRIC_L2, config)
assert(index_gpu.is_trained)
t0 = time.time()
index_gpu.add(addVecs)
return 1000*(time.time() - t0)
if args.bm_add:
print("=" * 40)
print("GPU Add Benchmarks")
print("=" * 40)
quantizer = faiss.IndexFlatL2(d)
idx_cpu = faiss.IndexIVFFlat(quantizer, d, args.nlist)
idx_cpu.train(xt)
cuvs_gpu_add_time = bench_add_milliseconds(xb, quantizer, True)
classical_gpu_add_time = bench_add_milliseconds(xb, quantizer, False)
print("Method: IVFFlat, Operation: ADD, dim: %d, nlist %d, numAdd: %d, classical GPU add time: %.3f milliseconds, cuVS enabled GPU add time: %.3f milliseconds" % (
d, args.nlist, nb, classical_gpu_add_time, cuvs_gpu_add_time))
def bench_search_milliseconds(index, queryVecs, nprobe, k, use_cuvs):
co = faiss.GpuClonerOptions()
co.use_cuvs = use_cuvs
index_gpu = faiss.index_cpu_to_gpu(res, 0, index, co)
index_gpu.nprobe = nprobe
t0 = time.time()
index_gpu.search(queryVecs, k)
return 1000*(time.time() - t0)
if args.bm_search:
print("=" * 40)
print("GPU Search Benchmarks")
print("=" * 40)
idx_cpu = faiss.IndexIVFFlat(
faiss.IndexFlatL2(d), d, args.nlist)
idx_cpu.train(xt)
idx_cpu.add(xb)
cuvs_gpu_search_time = bench_search_milliseconds(
idx_cpu, xq, args.nprobe, args.k, True)
classical_gpu_search_time = bench_search_milliseconds(
idx_cpu, xq, args.nprobe, args.k, False)
print("Method: IVFFlat, Operation: SEARCH, dim: %d, nlist: %d, numVecs: %d, numQuery: %d, nprobe: %d, k: %d, classical GPU search time: %.3f milliseconds, cuVS enabled GPU search time: %.3f milliseconds" % (
d, args.nlist, nb, nq, args.nprobe, args.k, classical_gpu_search_time, cuvs_gpu_search_time))

View File

@@ -0,0 +1,187 @@
# @lint-ignore-every LICENSELINT
# Copyright (c) Meta Platforms, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
# Copyright (c) 2024-2025, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import numpy as np
import faiss
import time
import argparse
import rmm
import ctypes
try:
from faiss.contrib.datasets_fb import \
DatasetSIFT1M, DatasetDeep1B, DatasetBigANN
except ImportError:
from faiss.contrib.datasets import \
DatasetSIFT1M, DatasetDeep1B, DatasetBigANN
# ds = DatasetDeep1B(10**6)
# ds = DatasetBigANN(nb_M=1)
ds = DatasetSIFT1M()
xq = ds.get_queries()
xb = ds.get_database()
gt = ds.get_groundtruth()
xt = ds.get_train()
nb, d = xb.shape
nq, d = xq.shape
nt, d = xt.shape
M = d // 2  # integer division; note the index constructions below hardcode 32 sub-quantizers
######################################################
# Command-line parsing
######################################################
parser = argparse.ArgumentParser()
def aa(*args, **kwargs):
group.add_argument(*args, **kwargs)
group = parser.add_argument_group('benchmarking options')
aa('--bm_train', default=True,
help='whether to benchmark train operation on GPU index')
aa('--bm_add', default=True,
help='whether to benchmark add operation on GPU index')
aa('--bm_search', default=True,
help='whether to benchmark search operation on GPU index')
group = parser.add_argument_group('IVF options')
aa('--nlist', default=1024, type=int,
help="number of IVF centroids")
aa('--bits_per_code', default=8, type=int, help='bits per code. Note that < 8 is only supported when cuVS is enabled')
group = parser.add_argument_group('searching')
aa('--k', default=10, type=int, help='nb of nearest neighbors')
aa('--nprobe', default=10, type=int, help='nb of IVF lists to probe')
args = parser.parse_args()
print("args:", args)
gt = gt[:, :args.k]
nlist = args.nlist
bits_per_code = args.bits_per_code
rs = np.random.RandomState(123)
res = faiss.StandardGpuResources()
# Use an RMM pool memory resource for device allocations
mr = rmm.mr.PoolMemoryResource(rmm.mr.CudaMemoryResource())
rmm.mr.set_current_device_resource(mr)
def eval_recall(neighbors, t):
speed = t * 1000 / nq
qps = 1000 / speed
corrects = (gt == neighbors).sum()
recall = corrects / (nq * args.k)
return recall, qps
def bench_train_milliseconds(trainVecs, use_cuvs):
config = faiss.GpuIndexIVFPQConfig()
config.use_cuvs = use_cuvs
index = faiss.GpuIndexIVFPQ(res, d, nlist, 32, bits_per_code, faiss.METRIC_L2, config)
t0 = time.time()
index.train(trainVecs)
return 1000*(time.time() - t0)
# warmup (faiss expects float32 input)
xw = rs.rand(nt, d).astype('float32')
bench_train_milliseconds(xw, True)
if args.bm_train:
print("=" * 40)
print("GPU Train Benchmarks")
print("=" * 40)
cuvs_gpu_train_time = bench_train_milliseconds(xt, True)
classical_gpu_train_time = bench_train_milliseconds(xt, False)
print("TRAIN, dim: %d, nlist %d, numTrain: %d, classical GPU train time: %.3f milliseconds, cuVS enabled GPU train time: %.3f milliseconds" % (
d, nlist, nt, classical_gpu_train_time, cuvs_gpu_train_time))
def bench_add_milliseconds(addVecs, index_cpu, use_cuvs):
# construct a GPU index using the same trained coarse quantizer
config = faiss.GpuClonerOptions()
config.use_cuvs = use_cuvs
index_gpu = faiss.index_cpu_to_gpu(res, 0, index_cpu, config)
assert(index_gpu.is_trained)
t0 = time.time()
index_gpu.add(addVecs)
return 1000*(time.time() - t0)
if args.bm_add:
print("=" * 40)
print("GPU Add Benchmarks")
print("=" * 40)
quantizer = faiss.IndexFlatL2(d)
index_cpu = faiss.IndexIVFPQ(quantizer, d, nlist, 32, bits_per_code, faiss.METRIC_L2)
index_cpu.train(xt)
cuvs_gpu_add_time = bench_add_milliseconds(xb, index_cpu, True)
classical_gpu_add_time = bench_add_milliseconds(xb, index_cpu, False)
print("ADD, dim: %d, nlist %d, numAdd: %d, classical GPU add time: %.3f milliseconds, cuVS enabled GPU add time: %.3f milliseconds" % (
d, nlist, nb, classical_gpu_add_time, cuvs_gpu_add_time))
def bench_search_milliseconds(index, queryVecs, nprobe, k, use_cuvs):
co = faiss.GpuClonerOptions()
co.use_cuvs = use_cuvs
index_gpu = faiss.index_cpu_to_gpu(res, 0, index, co)
index_gpu.nprobe = nprobe
t0 = time.time()
_, I = index_gpu.search(queryVecs, k)
return I, 1000*(time.time() - t0)
# Search benchmarks: both indexes have identical IVF centroids and lists.
if args.bm_search:
print("=" * 40)
print("GPU Search Benchmarks")
print("=" * 40)
# use a fresh quantizer so this branch does not depend on the bm_add branch
index_cpu = faiss.IndexIVFPQ(faiss.IndexFlatL2(d), d, nlist, 32, bits_per_code, faiss.METRIC_L2)
index_cpu.train(xt)
index_cpu.add(xb)
cuvs_indices, cuvs_gpu_search_time = bench_search_milliseconds(
index_cpu, xq, args.nprobe, args.k, True)
classical_gpu_indices, classical_gpu_search_time = bench_search_milliseconds(
index_cpu, xq, args.nprobe, args.k, False)
cuvs_recall, cuvs_qps = eval_recall(cuvs_indices, cuvs_gpu_search_time)
classical_recall, classical_qps = eval_recall(classical_gpu_indices, classical_gpu_search_time)
print("SEARCH, dim: %d, nlist: %d, numVecs: %d, numQuery: %d, nprobe: %d, k: %d, classical GPU qps: %.3f, cuVS enabled GPU qps: %.3f" % (
d, nlist, nb, nq, args.nprobe, args.k, classical_qps, cuvs_qps))

View File

@@ -0,0 +1,35 @@
#! /usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
"""small test script to benchmark the SIMD implementation of the
distance computations for the additional metrics. Call eg. with L1 to
get L1 distance computations.
"""
import faiss
import sys
import time
d = 64
nq = 4096
nb = 16384
print("sample")
xq = faiss.randn((nq, d), 123)
xb = faiss.randn((nb, d), 123)
mt_name = "L2" if len(sys.argv) < 2 else sys.argv[1]
mt = getattr(faiss, "METRIC_" + mt_name)
print("distances")
t0 = time.time()
dis = faiss.pairwise_distances(xq, xb, mt)
t1 = time.time()
print("nq=%d nb=%d d=%d %s: %.3f s" % (nq, nb, d, mt_name, t1 - t0))

View File

@@ -0,0 +1,78 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import time
import faiss
import numpy as np
def do_partition(n, qin, maxval=65536, seed=123, id_type='int64'):
print(
f"n={n} qin={qin} maxval={maxval} id_type={id_type} ",
end="\t", flush=True
)
# print("seed=", seed)
rs = np.random.RandomState(seed)
vals = rs.randint(maxval, size=n).astype('uint16')
ids = (rs.permutation(n) + 12345).astype(id_type)
sp = faiss.swig_ptr
tab_a = faiss.AlignedTableUint16()
faiss.copy_array_to_AlignedTable(vals, tab_a)
nrun = 2000
times = []
nerr = 0
stats = faiss.cvar.partition_stats
stats.reset()
for _run in range(nrun):
faiss.copy_array_to_AlignedTable(vals, tab_a)
t0 = time.time()
# print("tab a type", tab_a.get())
if isinstance(qin, int):
q = qin
faiss.CMax_uint16_partition_fuzzy(
tab_a.get(), sp(ids), n, q, q, None)
else:
q_min, q_max = qin
q = np.array([-1], dtype='uint64')
faiss.CMax_uint16_partition_fuzzy(
tab_a.get(), sp(ids), n,
q_min, q_max, sp(q)
)
q = q[0]
if not (q_min <= q <= q_max):
nerr += 1
t1 = time.time()
times.append(t1 - t0)
times = np.array(times[100:]) * 1000000
print(
f"times {times.mean():.3f} µs (± {times.std():.4f} µs) nerr={nerr} "
f"bissect {stats.bissect_cycles / 1e6:.3f} Mcy "
f"compress {stats.compress_cycles / 1e6:.3f} Mcy"
)
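# Hedged reading of the calls below: qin is a (q_min, q_max) range, so the
# fuzzy partition is free to pick any pivot q in that range and reports the
# chosen value back through the last (uint64) output argument; passing a plain
# int pins q_min == q_max == q.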
do_partition(200, (100, 100))
do_partition(200, (100, 150))
do_partition(2000, (1000, 1000))
do_partition(2000, (1000, 1500))
do_partition(20000, (10000, 10000))
do_partition(20000, (10000, 15000))
do_partition(200, (100, 100), id_type='int32')
do_partition(200, (100, 150), id_type='int32')
do_partition(2000, (1000, 1000), id_type='int32')
do_partition(2000, (1000, 1500), id_type='int32')
do_partition(20000, (10000, 10000), id_type='int32')
do_partition(20000, (10000, 15000), id_type='int32')

View File

@@ -0,0 +1,251 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import os
import sys
import time
import numpy as np
import re
import faiss
from multiprocessing.pool import ThreadPool
from datasets import ivecs_read
# we mem-map the biggest files to avoid having them in memory all at
# once
def mmap_fvecs(fname):
x = np.memmap(fname, dtype='int32', mode='r')
d = x[0]
return x.view('float32').reshape(-1, d + 1)[:, 1:]
def mmap_bvecs(fname):
x = np.memmap(fname, dtype='uint8', mode='r')
d = x[:4].view('int32')[0]
return x.reshape(-1, d + 4)[:, 4:]
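# Hedged helper sketch (not used by the benchmark): writes an array in the
# .fvecs layout read by mmap_fvecs above, i.e. each row is stored as
# [int32 d][d * float32].
def _write_fvecs_demo(fname, x):
    x = np.ascontiguousarray(x, dtype='float32')
    n, d = x.shape
    out = np.empty((n, d + 1), dtype='int32')
    out[:, 0] = d
    out[:, 1:] = x.view('int32')  # reinterpret the float bits, no conversion
    out.tofile(fname)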
#################################################################
# Bookkeeping
#################################################################
dbname = sys.argv[1]
index_key = sys.argv[2]
parametersets = sys.argv[3:]
tmpdir = '/tmp/bench_polysemous'
if not os.path.isdir(tmpdir):
print("%s does not exist, creating it" % tmpdir)
os.mkdir(tmpdir)
#################################################################
# Prepare dataset
#################################################################
print("Preparing dataset", dbname)
if dbname.startswith('SIFT'):
# SIFT1M to SIFT1000M
dbsize = int(dbname[4:-1])
xb = mmap_bvecs('bigann/bigann_base.bvecs')
xq = mmap_bvecs('bigann/bigann_query.bvecs')
xt = mmap_bvecs('bigann/bigann_learn.bvecs')
# trim xb to correct size
xb = xb[:dbsize * 1000 * 1000]
gt = ivecs_read('bigann/gnd/idx_%dM.ivecs' % dbsize)
elif dbname == 'Deep1B':
xb = mmap_fvecs('deep1b/base.fvecs')
xq = mmap_fvecs('deep1b/deep1B_queries.fvecs')
xt = mmap_fvecs('deep1b/learn.fvecs')
# deep1B's train set is outrageously big
xt = xt[:10 * 1000 * 1000]
gt = ivecs_read('deep1b/deep1B_groundtruth.ivecs')
else:
print('unknown dataset', dbname, file=sys.stderr)
sys.exit(1)
print("sizes: B %s Q %s T %s gt %s" % (
xb.shape, xq.shape, xt.shape, gt.shape))
nq, d = xq.shape
nb, d = xb.shape
assert gt.shape[0] == nq
#################################################################
# Training
#################################################################
def choose_train_size(index_key):
# some training vectors for PQ and the PCA
n_train = 256 * 1000
if "IVF" in index_key:
matches = re.findall('IVF([0-9]+)', index_key)
ncentroids = int(matches[0])
n_train = max(n_train, 100 * ncentroids)
elif "IMI" in index_key:
matches = re.findall('IMI2x([0-9]+)', index_key)
nbit = int(matches[0])
n_train = max(n_train, 256 * (1 << nbit))
return n_train
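# Worked example (hedged): for index_key "IVF65536,PQ32" the rule above gives
# n_train = max(256_000, 100 * 65536) = 6_553_600 training vectors.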
def get_trained_index():
filename = "%s/%s_%s_trained.index" % (
tmpdir, dbname, index_key)
if not os.path.exists(filename):
index = faiss.index_factory(d, index_key)
n_train = choose_train_size(index_key)
xtsub = xt[:n_train]
print("Keeping %d train vectors" % xtsub.shape[0])
# make sure the data is actually in RAM and in float
xtsub = xtsub.astype('float32').copy()
index.verbose = True
t0 = time.time()
index.train(xtsub)
index.verbose = False
print("train done in %.3f s" % (time.time() - t0))
print("storing", filename)
faiss.write_index(index, filename)
else:
print("loading", filename)
index = faiss.read_index(filename)
return index
#################################################################
# Adding vectors to dataset
#################################################################
def rate_limited_imap(f, l):
'a thread pre-processes the next element'
pool = ThreadPool(1)
res = None
for i in l:
res_next = pool.apply_async(f, (i, ))
if res:
yield res.get()
res = res_next
yield res.get()
def matrix_slice_iterator(x, bs):
" iterate over the lines of x in blocks of size bs"
nb = x.shape[0]
block_ranges = [(i0, min(nb, i0 + bs))
for i0 in range(0, nb, bs)]
return rate_limited_imap(
lambda i01: x[i01[0]:i01[1]].astype('float32').copy(),
block_ranges)
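# Hedged usage sketch: iterate over a dummy array in blocks of 100 rows; the
# next block is converted to float32 on a background thread by
# rate_limited_imap while the current one is being consumed.
def _demo_slice_iterator():
    x = np.zeros((250, 8), dtype='uint8')
    nrows = 0
    for block in matrix_slice_iterator(x, 100):
        assert block.dtype == np.dtype('float32')
        nrows += block.shape[0]
    assert nrows == 250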
def get_populated_index():
filename = "%s/%s_%s_populated.index" % (
tmpdir, dbname, index_key)
if not os.path.exists(filename):
index = get_trained_index()
i0 = 0
t0 = time.time()
for xs in matrix_slice_iterator(xb, 100000):
i1 = i0 + xs.shape[0]
print('\radd %d:%d, %.3f s' % (i0, i1, time.time() - t0), end=' ')
sys.stdout.flush()
index.add(xs)
i0 = i1
print()
print("Add done in %.3f s" % (time.time() - t0))
print("storing", filename)
faiss.write_index(index, filename)
else:
print("loading", filename)
index = faiss.read_index(filename)
return index
#################################################################
# Perform searches
#################################################################
index = get_populated_index()
ps = faiss.ParameterSpace()
ps.initialize(index)
# make sure queries are in RAM
xq = xq.astype('float32').copy()
# a static C++ object that collects statistics about searches
ivfpq_stats = faiss.cvar.indexIVFPQ_stats
ivf_stats = faiss.cvar.indexIVF_stats
if parametersets == ['autotune'] or parametersets == ['autotuneMT']:
if parametersets == ['autotune']:
faiss.omp_set_num_threads(1)
# setup the Criterion object: optimize for 1-R@1
crit = faiss.OneRecallAtRCriterion(nq, 1)
# by default, the criterion will request only 1 NN
crit.nnn = 100
crit.set_groundtruth(None, gt.astype('int64'))
# then we let Faiss find the optimal parameters by itself
print("exploring operating points")
t0 = time.time()
op = ps.explore(index, xq, crit)
print("Done in %.3f s, available OPs:" % (time.time() - t0))
# opv is a C++ vector, so it cannot be accessed like a Python array
opv = op.optimal_pts
print("%-40s 1-R@1 time" % "Parameters")
for i in range(opv.size()):
opt = opv.at(i)
print("%-40s %.4f %7.3f" % (opt.key, opt.perf, opt.t))
else:
# we do queries in a single thread
faiss.omp_set_num_threads(1)
print(' ' * len(parametersets[0]), '\t', 'R@1 R@10 R@100 time %pass')
for param in parametersets:
print(param, '\t', end=' ')
sys.stdout.flush()
ps.set_index_parameters(index, param)
t0 = time.time()
ivfpq_stats.reset()
ivf_stats.reset()
D, I = index.search(xq, 100)
t1 = time.time()
for rank in 1, 10, 100:
n_ok = (I[:, :rank] == gt[:, :1]).sum()
print("%.4f" % (n_ok / float(nq)), end=' ')
print("%8.3f " % ((t1 - t0) * 1000.0 / nq), end=' ')
print("%5.2f" % (ivfpq_stats.n_hamming_pass * 100.0 / ivf_stats.ndis))

View File

@@ -0,0 +1,46 @@
#!/usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from __future__ import print_function
import time
import numpy as np
import faiss
from datasets import load_sift1M, evaluate
print("load data")
xb, xq, xt, gt = load_sift1M()
nq, d = xq.shape
# index with 16 subquantizers, 8 bit each
index = faiss.IndexPQ(d, 16, 8)
index.do_polysemous_training = True
index.verbose = True
print("train")
index.train(xt)
print("add vectors to index")
index.add(xb)
nt = 1
faiss.omp_set_num_threads(1)
print("PQ baseline", end=' ')
index.search_type = faiss.IndexPQ.ST_PQ
t, r = evaluate(index, xq, gt, 1)
print("\t %7.3f ms per query, R@1 %.4f" % (t, r[1]))
for ht in 64, 62, 58, 54, 50, 46, 42, 38, 34, 30:
print("Polysemous", ht, end=' ')
index.search_type = faiss.IndexPQ.ST_polysemous
index.polysemous_ht = ht
t, r = evaluate(index, xq, gt, 1)
print("\t %7.3f ms per query, R@1 %.4f" % (t, r[1]))

View File

@@ -0,0 +1,77 @@
#!/usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import time
import os
import numpy as np
import faiss
os.system("grep -m1 'model name' < /proc/cpuinfo")
def format_tab(x):
return "\n".join("\t".join("%g" % xi for xi in row) for row in x)
def run_bench(d, dsub, nbit=8, metric=None):
M = d // dsub
pq = faiss.ProductQuantizer(d, M, nbit)
pq.train(faiss.randn((max(1000, pq.ksub * 50), d), 123))
sp = faiss.swig_ptr
times = []
nrun = 100
print(f"d={d} dsub={dsub} ksub={pq.ksub}", end="\t")
res = []
for nx in 1, 10, 100:
x = faiss.randn((nx, d), 555)
times = []
for run in range(nrun):
t0 = time.time()
new_tab = np.zeros((nx, M, pq.ksub), "float32")
if metric == faiss.METRIC_INNER_PRODUCT:
pq.compute_inner_prod_tables(nx, sp(x), sp(new_tab))
elif metric == faiss.METRIC_L2:
pq.compute_distance_tables(nx, sp(x), sp(new_tab))
else:
assert False
t1 = time.time()
if run >= nrun // 5: # the rest is considered warmup
times.append((t1 - t0))
times = np.array(times) * 1000
print(f"nx={nx}: {np.mean(times):.3f} ms (± {np.std(times):.4f})",
end="\t")
res.append(times.mean())
print()
return res
# for have_threads in True, False:
for have_threads in False, True:
if have_threads:
# good config for Intel(R) Xeon(R) CPU E5-2698 v4 @ 2.20GHz
nthread = 32
else:
nthread = 1
faiss.omp_set_num_threads(nthread)
for metric in faiss.METRIC_INNER_PRODUCT, faiss.METRIC_L2:
print("============= nthread=", nthread, "metric=", metric)
allres = []
for dsub in 2, 4, 8:
for nbit in 4, 8:
for M in 8, 20:
res = run_bench(M * dsub, dsub, nbit, metric)
allres.append(res)
allres = np.array(allres)
print("formated result:")
print(format_tab(allres))

View File

@@ -0,0 +1,135 @@
#!/usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import faiss
import time
import random
import faiss.contrib.datasets
# copied from benchs/bench_all_ivf/bench_all_ivf.py
def unwind_index_ivf(index):
if isinstance(index, faiss.IndexPreTransform):
assert index.chain.size() == 1
vt = index.chain.at(0)
index_ivf, vt2 = unwind_index_ivf(faiss.downcast_index(index.index))
assert vt2 is None
return index_ivf, vt
if hasattr(faiss, "IndexRefine") and isinstance(index, faiss.IndexRefine):
return unwind_index_ivf(faiss.downcast_index(index.base_index))
if isinstance(index, faiss.IndexIVF):
return index, None
else:
return None, None
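# Hedged usage sketch: peel the OPQ pre-transform off a freshly built index so
# the inner IndexIVF can be manipulated directly, as is done below with the
# transposed PQ centroids.
def _demo_unwind_index_ivf():
    index = faiss.index_factory(64, "OPQ8_64,IVF128,PQ8")
    index_ivf, vt = unwind_index_ivf(index)
    assert isinstance(index_ivf, faiss.IndexIVF)
    assert vt is not None  # the OPQ matrix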
def test_bigann10m(index_file, index_parameters):
ds = faiss.contrib.datasets.DatasetBigANN(nb_M=10)
xq = ds.get_queries()
xb = ds.get_database()
gt = ds.get_groundtruth()
nb, d = xb.shape
nq, d = xq.shape
print("Reading index {}".format(index_file))
index = faiss.read_index(index_file)
ps = faiss.ParameterSpace()
ps.initialize(index)
index_ivf, vec_transform = unwind_index_ivf(index)
print('params regular transp_centroids regular R@1 R@10 R@100')
for index_parameter in index_parameters:
ps.set_index_parameters(index, index_parameter)
print(index_parameter.ljust(70), end=' ')
k = 100
# warmup
D, I = index.search(xq, k)
# warmup
D, I = index.search(xq, k)
# eval
t2_0 = time.time()
D, I = index.search(xq, k)
t2_1 = time.time()
# eval
index_ivf.pq.sync_transposed_centroids()
t3_0 = time.time()
D, I = index.search(xq, k)
t3_1 = time.time()
# eval
index_ivf.pq.clear_transposed_centroids()
t4_0 = time.time()
D, I = index.search(xq, k)
t4_1 = time.time()
print(" %9.5f " % (t2_1 - t2_0), end=' ')
print(" %9.5f " % (t3_1 - t3_0), end=' ')
print(" %9.5f " % (t4_1 - t4_0), end=' ')
for rank in 1, 10, 100:
n_ok = (I[:, :rank] == gt[:, :1]).sum()
print("%.4f" % (n_ok / float(nq)), end=' ')
print()
if __name__ == "__main__":
faiss.contrib.datasets.dataset_basedir = '/home/aguzhva/ANN_SIFT1B/'
# represents OPQ32_128,IVF65536_HNSW32,PQ32 index
index_file_1 = "/home/aguzhva/ANN_SIFT1B/run_tests/bench_ivf/indexes/hnsw32/.faissindex"
nprobe_values = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]
quantizer_efsearch_values = [4, 8, 16, 32, 64, 128, 256, 512]
ht_values = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126, 128, 256]
# represents OPQ32_128,IVF65536(IVF256,PQHDx4fs,RFlat),PQ32 index
index_file_2 = "/home/aguzhva/ANN_SIFT1B/run_tests/bench_ivf/indexes/pq4/.faissindex"
quantizer_k_factor_rf_values = [1, 2, 4, 8, 16, 32, 64]
quantizer_nprobe_values = [1, 2, 4, 8, 16, 32, 64, 128]
# test the first index
index_parameters_1 = []
for _ in range(0, 20):
nprobe = random.choice(nprobe_values)
quantizer_efsearch = random.choice(quantizer_efsearch_values)
ht = random.choice(ht_values)
index_parameters_1.append(
"nprobe={},quantizer_efSearch={},ht={}".format(
nprobe,
quantizer_efsearch,
ht)
)
test_bigann10m(index_file_1, index_parameters_1)
# test the second index
index_parameters_2 = []
for _ in range(0, 20):
nprobe = random.choice(nprobe_values)
quantizer_k_factor_rf = random.choice(quantizer_k_factor_rf_values)
quantizer_nprobe = random.choice(quantizer_nprobe_values)
ht = random.choice(ht_values)
index_parameters_2.append(
"nprobe={},quantizer_k_factor_rf={},quantizer_nprobe={},ht={}".format(
nprobe,
quantizer_k_factor_rf,
quantizer_nprobe,
ht)
)
test_bigann10m(index_file_2, index_parameters_2)

View File

@@ -0,0 +1,157 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import sys
import faiss
import time
import numpy as np
try:
from faiss.contrib.datasets_fb import \
DatasetSIFT1M, DatasetDeep1B, DatasetBigANN
except ImportError:
from faiss.contrib.datasets import \
DatasetSIFT1M, DatasetDeep1B, DatasetBigANN
def eval_codec(q, xq, xb, gt):
t0 = time.time()
codes = q.compute_codes(xb)
t1 = time.time()
xb_decoded = q.decode(codes)
recons_err = ((xb - xb_decoded) ** 2).sum() / xb.shape[0]
# for compatibility with the codec benchmarks
err_compat = np.linalg.norm(xb - xb_decoded, axis=1).mean()
xq_decoded = q.decode(q.compute_codes(xq))
D, I = faiss.knn(xq_decoded, xb_decoded, 1)
recall = (I[:, 0] == gt[:, 0]).sum() / nq
print(
f"\tencode time: {t1 - t0:.3f} reconstruction error: {recons_err:.3f} "
f"1-recall@1: {recall:.4f} recons_err_compat {err_compat:.3f}")
def eval_quantizer(q, xq, xb, gt, xt, variants=None):
if variants is None:
variants = [(None, None)]
t0 = time.time()
q.train(xt)
t1 = time.time()
train_t = t1 - t0
print(f'\ttraining time: {train_t:.3f} s')
for name, val in variants:
if name is not None:
print(f"{name}={val}")
if isinstance(q, faiss.ProductAdditiveQuantizer):
for i in range(q.nsplits):
subq = faiss.downcast_Quantizer(q.subquantizer(i))
getattr(subq, name)  # make sure field exists
setattr(subq, name, val)
else:
getattr(q, name) # make sure field exists
setattr(q, name, val)
eval_codec(q, xq, xb, gt)
todo = sys.argv[1:]
if len(todo) > 0 and "deep1M" in todo[0]:
ds = DatasetDeep1B(10**6)
del todo[0]
elif len(todo) > 0 and "bigann1M" in todo[0]:
ds = DatasetBigANN(nb_M=1)
del todo[0]
else:
ds = DatasetSIFT1M()
if len(todo) > 0:
if todo[0].count("x") == 1:
M, nbits = [int(x) for x in todo[0].split("x")]
del todo[0]
elif todo[0].count("x") == 2:
nsplits, Msub, nbits = [int(x) for x in todo[0].split("x")]
M = nsplits * Msub
del todo[0]
maxtrain = max(100 << nbits, 10**5)
print(f"eval on {M}x{nbits} maxtrain={maxtrain}")
xq = ds.get_queries()
xb = ds.get_database()
gt = ds.get_groundtruth()
xt = ds.get_train(maxtrain=maxtrain)
nb, d = xb.shape
nq, d = xq.shape
nt, d = xt.shape
# fastest to slowest
if 'lsq-gpu' in todo:
lsq = faiss.LocalSearchQuantizer(d, M, nbits)
ngpus = faiss.get_num_gpus()
lsq.icm_encoder_factory = faiss.GpuIcmEncoderFactory(ngpus)
lsq.verbose = True
eval_quantizer(lsq, xq, xb, gt, xt)
if 'pq' in todo:
pq = faiss.ProductQuantizer(d, M, nbits)
print("===== PQ")
eval_quantizer(pq, xq, xb, gt, xt)
if 'opq' in todo:
d2 = ((d + M - 1) // M) * M
print("OPQ d2=", d2)
opq = faiss.OPQMatrix(d, M, d2)
opq.train(xt)
xq2 = opq.apply(xq)
xb2 = opq.apply(xb)
xt2 = opq.apply(xt)
pq = faiss.ProductQuantizer(d2, M, nbits)
print("===== PQ")
eval_quantizer(pq, xq2, xb2, gt, xt2)
if 'prq' in todo:
print(f"===== PRQ{nsplits}x{Msub}x{nbits}")
prq = faiss.ProductResidualQuantizer(d, nsplits, Msub, nbits)
variants = [("max_beam_size", i) for i in (1, 2, 4, 8, 16, 32)]
eval_quantizer(prq, xq, xb, gt, xt, variants=variants)
if 'plsq' in todo:
print(f"===== PLSQ{nsplits}x{Msub}x{nbits}")
plsq = faiss.ProductLocalSearchQuantizer(d, nsplits, Msub, nbits)
variants = [("encode_ils_iters", i) for i in (2, 3, 4, 8, 16)]
eval_quantizer(plsq, xq, xb, gt, xt, variants=variants)
if 'rq' in todo:
print("===== RQ")
rq = faiss.ResidualQuantizer(d, M, nbits, )
rq.max_beam_size  # make sure field exists
rq.max_beam_size = 30  # for compatibility with older runs
# rq.train_type = faiss.ResidualQuantizer.Train_default
# rq.verbose = True
variants = [("max_beam_size", i) for i in (1, 2, 4, 8, 16, 32)]
eval_quantizer(rq, xq, xb, gt, xt, variants=variants)
if 'rq_lut' in todo:
print("===== RQ")
rq = faiss.ResidualQuantizer(d, M, nbits, )
rq.max_beam_size  # make sure field exists
rq.max_beam_size = 30  # for compatibility with older runs
rq.use_beam_LUT  # make sure field exists
rq.use_beam_LUT = 1
# rq.train_type = faiss.ResidualQuantizer.Train_default
# rq.verbose = True
variants = [("max_beam_size", i) for i in (1, 2, 4, 8, 16, 32, 64)]
eval_quantizer(rq, xq, xb, gt, xt, variants=variants)
if 'lsq' in todo:
print("===== LSQ")
lsq = faiss.LocalSearchQuantizer(d, M, nbits)
variants = [("encode_ils_iters", i) for i in (2, 3, 4, 8, 16)]
eval_quantizer(lsq, xq, xb, gt, xt, variants=variants)

View File

@@ -0,0 +1,82 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import time
import numpy as np
import faiss
from datasets import load_sift1M
print("load data")
xb, xq, xt, gt = load_sift1M()
nq, d = xq.shape
ncent = 256
variants = [(name, getattr(faiss.ScalarQuantizer, name))
for name in dir(faiss.ScalarQuantizer)
if name.startswith('QT_')]
quantizer = faiss.IndexFlatL2(d)
# quantizer.add(np.zeros((1, d), dtype='float32'))
if False:
for name, qtype in [('flat', 0)] + variants:
print("============== test", name)
t0 = time.time()
if name == 'flat':
index = faiss.IndexIVFFlat(quantizer, d, ncent,
faiss.METRIC_L2)
else:
index = faiss.IndexIVFScalarQuantizer(quantizer, d, ncent,
qtype, faiss.METRIC_L2)
index.nprobe = 16
print("[%.3f s] train" % (time.time() - t0))
index.train(xt)
print("[%.3f s] add" % (time.time() - t0))
index.add(xb)
print("[%.3f s] search" % (time.time() - t0))
D, I = index.search(xq, 100)
print("[%.3f s] eval" % (time.time() - t0))
for rank in 1, 10, 100:
n_ok = (I[:, :rank] == gt[:, :1]).sum()
print("%.4f" % (n_ok / float(nq)), end=' ')
print()
if True:
for name, qtype in variants:
print("============== test", name)
for rsname, vals in [('RS_minmax',
[-0.4, -0.2, -0.1, -0.05, 0.0, 0.1, 0.5]),
('RS_meanstd', [0.8, 1.0, 1.5, 2.0, 3.0, 5.0, 10.0]),
('RS_quantiles', [0.02, 0.05, 0.1, 0.15]),
('RS_optim', [0.0])]:
for val in vals:
print("%-15s %5g " % (rsname, val), end=' ')
index = faiss.IndexIVFScalarQuantizer(quantizer, d, ncent,
qtype, faiss.METRIC_L2)
index.nprobe = 16
index.sq.rangestat = getattr(faiss.ScalarQuantizer,
rsname)
index.sq.rangestat_arg = val
index.train(xt)
index.add(xb)
t0 = time.time()
D, I = index.search(xq, 100)
t1 = time.time()
for rank in 1, 10, 100:
n_ok = (I[:, :rank] == gt[:, :1]).sum()
print("%.4f" % (n_ok / float(nq)), end=' ')
print(" %.3f s" % (t1 - t0))

View File

@@ -0,0 +1,84 @@
#! /usr/bin/env python2
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from __future__ import print_function
import numpy as np
import faiss
import time
swig_ptr = faiss.swig_ptr
if False:
a = np.arange(10, 14).astype('float32')
b = np.arange(20, 24).astype('float32')
faiss.fvec_inner_product(swig_ptr(a), swig_ptr(b), 4)
1/0
xd = 100
yd = 1000000
np.random.seed(1234)
faiss.omp_set_num_threads(1)
print('xd=%d yd=%d' % (xd, yd))
print('Running inner products test..')
for d in 3, 4, 12, 36, 64:
x = faiss.rand(xd * d).reshape(xd, d)
y = faiss.rand(yd * d).reshape(yd, d)
distances = np.empty((xd, yd), dtype='float32')
t0 = time.time()
for i in range(xd):
faiss.fvec_inner_products_ny(swig_ptr(distances[i]),
swig_ptr(x[i]),
swig_ptr(y),
d, yd)
t1 = time.time()
# sparse verification
ntry = 100
num, denom = 0, 0
for t in range(ntry):
xi = np.random.randint(xd)
yi = np.random.randint(yd)
num += abs(distances[xi, yi] - np.dot(x[xi], y[yi]))
denom += abs(distances[xi, yi])
print('d=%d t=%.3f s diff=%g' % (d, t1 - t0, num / denom))
print('Running L2sqr test..')
for d in 3, 4, 12, 36, 64:
x = faiss.rand(xd * d).reshape(xd, d)
y = faiss.rand(yd * d).reshape(yd, d)
distances = np.empty((xd, yd), dtype='float32')
t0 = time.time()
for i in range(xd):
faiss.fvec_L2sqr_ny(swig_ptr(distances[i]),
swig_ptr(x[i]),
swig_ptr(y),
d, yd)
t1 = time.time()
# sparse verification
ntry = 100
num, denom = 0, 0
for t in range(ntry):
xi = np.random.randint(xd)
yi = np.random.randint(yd)
num += abs(distances[xi, yi] - np.sum((x[xi] - y[yi]) ** 2))
denom += abs(distances[xi, yi])
print('d=%d t=%.3f s diff=%g' % (d, t1 - t0, num / denom))

View File

@@ -0,0 +1,45 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from __future__ import print_function
import sys
import time
import numpy as np
def ivecs_read(fname):
a = np.fromfile(fname, dtype='int32')
d = a[0]
return a.reshape(-1, d + 1)[:, 1:].copy()
def fvecs_read(fname):
return ivecs_read(fname).view('float32')
def load_sift1M():
print("Loading sift1M...", end='', file=sys.stderr)
xt = fvecs_read("sift1M/sift_learn.fvecs")
xb = fvecs_read("sift1M/sift_base.fvecs")
xq = fvecs_read("sift1M/sift_query.fvecs")
gt = ivecs_read("sift1M/sift_groundtruth.ivecs")
print("done", file=sys.stderr)
return xb, xq, xt, gt
def evaluate(index, xq, gt, k):
nq = xq.shape[0]
t0 = time.time()
D, I = index.search(xq, k) # noqa: E741
t1 = time.time()
recalls = {}
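# recall@i: fraction of queries whose ground-truth neighbor appears in the top i results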
i = 1
while i <= k:
recalls[i] = (I[:, :i] == gt[:, :1]).sum() / float(nq)
i *= 10
return (t1 - t0) * 1000.0 / nq, recalls

View File

@@ -0,0 +1,194 @@
# Distributed on-disk index for 1T-scale datasets
This is code corresponding to the description in [Indexing 1T vectors](https://github.com/facebookresearch/faiss/wiki/Indexing-1T-vectors).
All the code is in Python 3 (and not compatible with Python 2).
The current code uses the Deep1B dataset for demonstration purposes, but can scale to 1000x larger.
To run it, download the Deep1B dataset as explained [here](../#getting-deep1b), and edit paths to the dataset in the scripts.
The cluster commands are written for the Slurm batch scheduling system.
Adapting them to another scheduler should be straightforward.
## Distributed k-means
To cluster 500M vectors to 10M centroids, it is useful to have a distributed k-means implementation.
The distribution simply consists of splitting the training vectors across machines (servers) and having them do the assignment.
The master/client then synthesizes the results and updates the centroids.
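To make the division of labor concrete, here is a minimal Python sketch of the client-side loop (not the actual `faiss.contrib.clustering.kmeans`; it assumes the `DatasetAssign` interface used below and omits the empty-cluster splitting that the real implementation performs):

```python
import numpy as np

def kmeans_sketch(k, data, niter=20):
    # data implements the DatasetAssign interface:
    # count(), dim(), get_subset(indices), assign_to(centroids)
    rng = np.random.default_rng(1234)
    init = np.sort(rng.choice(data.count(), k, replace=False))
    centroids = data.get_subset(init)  # random initialization
    for it in range(niter):
        # the assignment is the expensive part; it runs on the servers
        I, D, sum_per_centroid = data.assign_to(centroids)
        sizes = np.bincount(I, minlength=k)
        ok = sizes > 0
        # the client aggregates the per-centroid sums and updates the centroids
        centroids[ok] = sum_per_centroid[ok] / sizes[ok][:, None]
        print(f"iter {it}: objective={D.sum():.4g}")
    return centroids
```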
The distributed k-means implementation here is based on 3 files:
- [`distributed_kmeans.py`](distributed_kmeans.py) contains the k-means implementation.
The main loop of k-means is re-implemented in python but follows closely the Faiss C++ implementation, and should not be significantly less efficient.
It relies on a `DatasetAssign` object that does the assignment to centroids, which is the bulk of the computation.
The object can be a Faiss CPU index, a GPU index or a set of remote GPU or CPU indexes.
- [`run_on_cluster.bash`](run_on_cluster.bash) contains the shell code to run the distributed k-means on a cluster.
The distributed k-means works with a Python install that contains faiss and scipy (for sparse matrices).
It clusters the training data of Deep1B; this can easily be changed to any file in fvecs, bvecs or npy format that contains the training set.
The training vectors may be too large to fit in RAM, but they are memory-mapped so that should not be a problem.
The file is also assumed to be accessible from all server machines, e.g. via a distributed file system.
### Local tests
Edit `distributed_kmeans.py` to point `testdata` to your local copy of the dataset.
Then, 4 levels of sanity check can be run:
```bash
# reference Faiss C++ run
python distributed_kmeans.py --test 0
# using the Python implementation
python distributed_kmeans.py --test 1
# use the dispatch object (on local datasets)
python distributed_kmeans.py --test 2
# same, with GPUs
python distributed_kmeans.py --test 3
```
The output should look like [this gist](https://gist.github.com/mdouze/ffa01fe666a9325761266fe55ead72ad).
### Distributed sanity check
To run the distributed k-means, `distributed_kmeans.py` has to be run both on the servers (`--server` option) and client sides (`--client` option).
Edit the top of `run_on_cluster.bash` to set the path of the data to cluster.
Sanity checks can be run with
```bash
# non distributed baseline
bash run_on_cluster.bash test_kmeans_0
# using all the machine's GPUs
bash run_on_cluster.bash test_kmeans_1
# distributed run, with one local server per GPU
bash run_on_cluster.bash test_kmeans_2
```
The test `test_kmeans_2` simulates a distributed run on a single machine by starting one server process per GPU and connecting to the servers via the rpc protocol.
The output should look like [this gist](https://gist.github.com/mdouze/5b2dc69b74579ecff04e1686a277d32e).
### Distributed run
The way the script can be distributed depends on the cluster's scheduling system.
Here we use Slurm, but it should be relatively easy to adapt to any scheduler that can allocate a set of machines and start the same executable on all of them.
The command
```bash
bash run_on_cluster.bash slurm_distributed_kmeans
```
uses the `srun` command to ask Slurm for 5 machines with 4 GPUs each.
All 5 machines run the script with the `slurm_within_kmeans_server` option.
They determine the number of servers and their own server id via the `SLURM_NPROCS` and `SLURM_PROCID` environment variables.
All machines start `distributed_kmeans.py` in server mode for the slice of the dataset they are responsible for.
In addition, machine #0 also starts the client.
The client learns the addresses of the other servers via the `SLURM_JOB_NODELIST` variable.
It connects to all the servers and performs the clustering.
The output should look like [this gist](https://gist.github.com/mdouze/8d25e89fb4af5093057cae0f917da6cd).
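For reference, each server derives its data slice and port from the Slurm environment with arithmetic like this sketch (mirroring the logic of `slurm_within_kmeans_server` in `run_on_cluster.bash`):

```python
import os

nserv = int(os.environ["SLURM_NPROCS"])  # number of server tasks
rank = int(os.environ["SLURM_PROCID"])   # this task's 0-based id
nvec = 50_000_000                        # number of vectors to cluster
baseport = 12012
i0 = nvec * rank // nserv                # first vector of this slice
i1 = nvec * (rank + 1) // nserv          # one past the last vector
port = baseport + rank                   # one port per server
print(f"server {rank}/{nserv}: vectors {i0}:{i1}, port {port}")
```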
### Run used for deep1B
For the real run, we run the clustering on 50M vectors to 1M centroids.
This is just a matter of using as many machines / GPUs as possible and setting the output centroids file with the `--out filename` option.
Then run
```bash
bash run_on_cluster.bash deep1b_clustering
```
The last lines of output read like:
```bash
Iteration 19 (898.92 s, search 875.71 s): objective=1.33601e+07 imbalance=1.303 nsplit=0
0: writing centroids to /checkpoint/matthijs/ondisk_distributed/1M_centroids.npy
```
This means that the total training time was 899s, of which 876s were used for computation.
However, the computation includes the I/O overhead to the assignment servers.
In this implementation, the overhead of transmitting the data is non-negligible and so is the centroid computation stage.
This is due to the inefficient Python implementation and to the RPC protocol, which is not optimized for broadcast / gather the way MPI is.
However, it is a simple implementation that should run on most clusters.
## Making the trained index
After the centroids are obtained, an empty trained index must be constructed.
This is done by:
- applying a pre-processing stage (a random rotation) to balance the dimensions of the vectors. This can be done after clustering; the centroids are simply rotated as well.
- wrapping the centroids into an HNSW index to speed up the CPU-based assignment of vectors
- training the 6-bit scalar quantizer used to encode the vectors
This is performed by the script [`make_trained_index.py`](make_trained_index.py).
## Building the index by slices
We call the slices "vslices" as they are vertical slices of the big matrix, see explanation in the wiki section [Split across database partitions](https://github.com/facebookresearch/faiss/wiki/Indexing-1T-vectors#split-across-database-partitions).
The script [make_index_vslice.py](make_index_vslice.py) makes an index for a subset of the vectors of the input data and stores it as an independent index.
There are 200 slices of 5M vectors each for Deep1B.
It can be run in a brute-force parallel fashion; there is no constraint on ordering.
To run the script in parallel on a slurm cluster, use:
```bash
bash run_on_cluster.bash make_index_vslices
```
For a real dataset, the data would be read from a DBMS.
In that case, reading the data and indexing it in parallel is worthwhile because reading is very slow.
## Splitting across inverted lists
The 200 slices need to be merged together.
This is done with the script [merge_to_ondisk.py](merge_to_ondisk.py), which memory-maps the 200 vertical slice indexes, extracts a subset of the inverted lists and writes them to a contiguous horizontal slice.
We slice the inverted lists into 50 horizontal slices.
This is run with
```bash
bash run_on_cluster.bash make_index_hslices
```
## Querying the index
At this point the index is ready.
The horizontal slices need to be loaded in the right order and combined into an index to be usable.
This is done in the [combined_index.py](combined_index.py) script.
It provides a `CombinedIndexDeep1B` object that contains an index object that can be searched.
To test, run:
```bash
python combined_index.py
```
The output should look like:
```bash
(faiss_1.5.2) matthijs@devfair0144:~/faiss_versions/faiss_1Tcode/faiss/benchs/distributed_ondisk$ python combined_index.py
reading /checkpoint/matthijs/ondisk_distributed//hslices/slice49.faissindex
loading empty index /checkpoint/matthijs/ondisk_distributed/trained.faissindex
replace invlists
loaded index of size 1000000000
nprobe=1 1-recall@1=0.2904 t=12.35s
nprobe=10 1-recall@1=0.6499 t=17.67s
nprobe=100 1-recall@1=0.8673 t=29.23s
nprobe=1000 1-recall@1=0.9132 t=129.58s
```
i.e. searching is a lot slower than from RAM.
## Distributed query
To reduce the bandwidth required from the machine that does the queries, it is possible to split the search across several search servers.
This way, only the effective results are returned to the main machine.
The search client and server are implemented in [`search_server.py`](search_server.py).
It can be used as a script to start a search server for `CombinedIndexDeep1B` or as a module to load the clients.
The search servers can be started with
```bash
bash run_on_cluster.bash run_search_servers
```
(adjust to the number of servers that can be used).
Then an example of search client is [`distributed_query_demo.py`](distributed_query_demo.py).
It connects to the servers and assigns each of them a subset of the inverted lists to visit.
A typical output is [this gist](https://gist.github.com/mdouze/1585b9854a9a2437d71f2b2c3c05c7c5).
The number in MiB indicates the amount of data that is read from disk to perform the search.
In this case, the scale of the dataset is too small for the distributed search to have much impact, but on datasets > 10x larger, the difference becomes more significant.
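Condensed, the client side looks like the sketch below (hostnames are placeholders; `distributed_query_demo.py` is the full version):

```python
import numpy as np
from faiss.contrib import rpc
import combined_index
import search_server

# local index: performs the coarse quantization
ci = combined_index.CombinedIndexDeep1B()
# one RPC client per remote search server
clients = [rpc.Client(host, 12012, v6=False)
           for host in ["server0", "server1", "server2"]]
# wrapper that splits the inverted lists to visit across the servers
sindex = search_server.SplitPerListIndex(ci, clients)
sindex.set_nprobe(100)
xq = np.zeros((10, 96), dtype='float32')  # placeholder queries (Deep1B is 96-dim)
D, I = sindex.search(xq, 100)
```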
## Conclusion
This code contains the core components to make an index that scales up to 1T vectors.
There are a few simplifications with respect to the index that was actually used in [Indexing 1T vectors](https://github.com/facebookresearch/faiss/wiki/Indexing-1T-vectors).

View File

@@ -0,0 +1,193 @@
#!/usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import os
import faiss
import numpy as np
class CombinedIndex:
"""
combines a set of inverted lists into a hstack
masks part of those lists
adds these inverted lists to an empty index that contains
the info on how to perform searches
"""
def __init__(self, invlist_fnames, empty_index_fname,
masked_index_fname=None):
self.indexes = indexes = []
ilv = faiss.InvertedListsPtrVector()
for fname in invlist_fnames:
if os.path.exists(fname):
print('reading', fname, end='\r', flush=True)
index = faiss.read_index(fname)
indexes.append(index)
il = faiss.extract_index_ivf(index).invlists
else:
raise AssertionError
ilv.push_back(il)
print()
self.big_il = faiss.VStackInvertedLists(ilv.size(), ilv.data())
if masked_index_fname:
self.big_il_base = self.big_il
print('loading', masked_index_fname)
self.masked_index = faiss.read_index(
masked_index_fname,
faiss.IO_FLAG_MMAP | faiss.IO_FLAG_READ_ONLY)
self.big_il = faiss.MaskedInvertedLists(
faiss.extract_index_ivf(self.masked_index).invlists,
self.big_il_base)
print('loading empty index', empty_index_fname)
self.index = faiss.read_index(empty_index_fname)
ntotal = self.big_il.compute_ntotal()
print('replace invlists')
index_ivf = faiss.extract_index_ivf(self.index)
index_ivf.replace_invlists(self.big_il, False)
index_ivf.ntotal = self.index.ntotal = ntotal
index_ivf.parallel_mode = 1 # seems reasonable to do this all the time
quantizer = faiss.downcast_index(index_ivf.quantizer)
quantizer.hnsw.efSearch = 1024
############################################################
# Expose fields and functions of the index as methods so that they
# can be called by RPC
def search(self, x, k):
return self.index.search(x, k)
def range_search(self, x, radius):
return self.index.range_search(x, radius)
def transform_and_assign(self, xq):
index = self.index
if isinstance(index, faiss.IndexPreTransform):
assert index.chain.size() == 1
vt = index.chain.at(0)
xq = vt.apply_py(xq)
# perform quantization
index_ivf = faiss.extract_index_ivf(index)
quantizer = index_ivf.quantizer
coarse_dis, list_nos = quantizer.search(xq, index_ivf.nprobe)
return xq, list_nos, coarse_dis
def ivf_search_preassigned(self, xq, list_nos, coarse_dis, k):
index_ivf = faiss.extract_index_ivf(self.index)
n, d = xq.shape
assert d == index_ivf.d
n2, d2 = list_nos.shape
assert list_nos.shape == coarse_dis.shape
assert n2 == n
assert d2 == index_ivf.nprobe
D = np.empty((n, k), dtype='float32')
I = np.empty((n, k), dtype='int64')
index_ivf.search_preassigned(
n, faiss.swig_ptr(xq), k,
faiss.swig_ptr(list_nos), faiss.swig_ptr(coarse_dis),
faiss.swig_ptr(D), faiss.swig_ptr(I), False)
return D, I
def ivf_range_search_preassigned(self, xq, list_nos, coarse_dis, radius):
index_ivf = faiss.extract_index_ivf(self.index)
n, d = xq.shape
assert d == index_ivf.d
n2, d2 = list_nos.shape
assert list_nos.shape == coarse_dis.shape
assert n2 == n
assert d2 == index_ivf.nprobe
res = faiss.RangeSearchResult(n)
index_ivf.range_search_preassigned(
n, faiss.swig_ptr(xq), radius,
faiss.swig_ptr(list_nos), faiss.swig_ptr(coarse_dis),
res)
lims = faiss.rev_swig_ptr(res.lims, n + 1).copy()
nd = int(lims[-1])
D = faiss.rev_swig_ptr(res.distances, nd).copy()
I = faiss.rev_swig_ptr(res.labels, nd).copy()
return lims, D, I
def set_nprobe(self, nprobe):
index_ivf = faiss.extract_index_ivf(self.index)
index_ivf.nprobe = nprobe
def set_parallel_mode(self, pm):
index_ivf = faiss.extract_index_ivf(self.index)
index_ivf.parallel_mode = pm
def get_ntotal(self):
return self.index.ntotal
def set_prefetch_nthread(self, nt):
for idx in self.indexes:
il = faiss.downcast_InvertedLists(
faiss.extract_index_ivf(idx).invlists)
il.prefetch_nthread
il.prefetch_nthread = nt
def set_omp_num_threads(self, nt):
faiss.omp_set_num_threads(nt)
class CombinedIndexDeep1B(CombinedIndex):
""" loads a CombinedIndex with the data from the big photodna index """
def __init__(self):
# set some paths
workdir = "/checkpoint/matthijs/ondisk_distributed/"
# empty index with the proper quantizer
indexfname = workdir + 'trained.faissindex'
# index that has some invlists that override the big one
masked_index_fname = None
invlist_fnames = [
'%s/hslices/slice%d.faissindex' % (workdir, i)
for i in range(50)
]
CombinedIndex.__init__(self, invlist_fnames, indexfname, masked_index_fname)
def ivecs_read(fname):
a = np.fromfile(fname, dtype='int32')
d = a[0]
return a.reshape(-1, d + 1)[:, 1:].copy()
def fvecs_read(fname):
return ivecs_read(fname).view('float32')
if __name__ == '__main__':
import time
ci = CombinedIndexDeep1B()
print('loaded index of size ', ci.index.ntotal)
deep1bdir = "/datasets01_101/simsearch/041218/deep1b/"
xq = fvecs_read(deep1bdir + "deep1B_queries.fvecs")
gt_fname = deep1bdir + "deep1B_groundtruth.ivecs"
gt = ivecs_read(gt_fname)
for nprobe in 1, 10, 100, 1000:
ci.set_nprobe(nprobe)
t0 = time.time()
D, I = ci.search(xq, 100)
t1 = time.time()
print('nprobe=%d 1-recall@1=%.4f t=%.2fs' % (
nprobe, (I[:, 0] == gt[:, 0]).sum() / len(xq),
t1 - t0
))

View File

@@ -0,0 +1,239 @@
#! /usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
"""
Simple distributed kmeans implementation. Relies on an abstraction
for the training matrix, which can be sharded over several machines.
"""
import os
import sys
import argparse
import numpy as np
import faiss
from multiprocessing.pool import ThreadPool
from faiss.contrib import rpc
from faiss.contrib.datasets import SyntheticDataset
from faiss.contrib.vecs_io import bvecs_mmap, fvecs_mmap
from faiss.contrib.clustering import DatasetAssign, DatasetAssignGPU, kmeans
class DatasetAssignDispatch:
"""dispatches to several other DatasetAssigns and combines the
results"""
def __init__(self, xes, in_parallel):
self.xes = xes
self.d = xes[0].dim()
if not in_parallel:
self.imap = map
else:
self.pool = ThreadPool(len(self.xes))
self.imap = self.pool.imap
self.sizes = list(map(lambda x: x.count(), self.xes))
self.cs = np.cumsum([0] + self.sizes)
def count(self):
return self.cs[-1]
def dim(self):
return self.d
def get_subset(self, indices):
res = np.zeros((len(indices), self.d), dtype='float32')
nos = np.searchsorted(self.cs[1:], indices, side='right')
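# nos[j] = id of the sub-dataset that holds global row indices[j]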
def handle(i):
mask = nos == i
sub_indices = indices[mask] - self.cs[i]
subset = self.xes[i].get_subset(sub_indices)
res[mask] = subset
list(self.imap(handle, range(len(self.xes))))
return res
def assign_to(self, centroids, weights=None):
src = self.imap(
lambda x: x.assign_to(centroids, weights),
self.xes
)
I = []
D = []
sum_per_centroid = None
for Ii, Di, sum_per_centroid_i in src:
I.append(Ii)
D.append(Di)
if sum_per_centroid is None:
sum_per_centroid = sum_per_centroid_i
else:
sum_per_centroid += sum_per_centroid_i
return np.hstack(I), np.hstack(D), sum_per_centroid
class AssignServer(rpc.Server):
""" Assign version that can be exposed via RPC """
def __init__(self, s, assign, log_prefix=''):
rpc.Server.__init__(self, s, log_prefix=log_prefix)
self.assign = assign
def __getattr__(self, f):
return getattr(self.assign, f)
def do_test(todo):
testdata = '/datasets01_101/simsearch/041218/bigann/bigann_learn.bvecs'
if os.path.exists(testdata):
x = bvecs_mmap(testdata)
else:
print("using synthetic dataset")
ds = SyntheticDataset(128, 100000, 0, 0)
x = ds.get_train()
# bad distribution to stress-test split code
xx = x[:100000].copy()
xx[:50000] = x[0]
if "0" in todo:
# reference C++ run
km = faiss.Kmeans(x.shape[1], 1000, niter=20, verbose=True)
km.train(xx.astype('float32'))
if "1" in todo:
# using the Faiss c++ implementation
data = DatasetAssign(xx)
kmeans(1000, data, 20)
if "2" in todo:
# use the dispatch object (on local datasets)
data = DatasetAssignDispatch([
DatasetAssign(xx[20000 * i : 20000 * (i + 1)])
for i in range(5)
], False
)
kmeans(1000, data, 20)
if "3" in todo:
# same, with GPU
ngpu = faiss.get_num_gpus()
print('using %d GPUs' % ngpu)
data = DatasetAssignDispatch([
DatasetAssignGPU(xx[100000 * i // ngpu: 100000 * (i + 1) // ngpu], i)
for i in range(ngpu)
], True
)
kmeans(1000, data, 20)
def main():
parser = argparse.ArgumentParser()
def aa(*args, **kwargs):
group.add_argument(*args, **kwargs)
group = parser.add_argument_group('general options')
aa('--test', default='', help='perform tests (comma-separated numbers)')
aa('--k', default=0, type=int, help='nb centroids')
aa('--seed', default=1234, type=int, help='random seed')
aa('--niter', default=20, type=int, help='nb iterations')
aa('--gpu', default=-2, type=int, help='GPU to use (-2:none, -1: all)')
group = parser.add_argument_group('I/O options')
aa('--indata', default='',
help='data file to load (supported formats: fvecs, bvecs, npy)')
aa('--i0', default=0, type=int, help='first vector to keep')
aa('--i1', default=-1, type=int, help='last vec to keep + 1')
aa('--out', default='', help='file to store centroids')
aa('--store_each_iteration', default=False, action='store_true',
help='store centroid checkpoints')
group = parser.add_argument_group('server options')
aa('--server', action='store_true', default=False, help='run server')
aa('--port', default=12345, type=int, help='server port')
aa('--when_ready', default=None, help='store host:port to this file when ready')
aa('--ipv4', default=False, action='store_true', help='force ipv4')
group = parser.add_argument_group('client options')
aa('--client', action='store_true', default=False, help='run client')
aa('--servers', default='', help='list of server:port separated by spaces')
args = parser.parse_args()
if args.test:
do_test(args.test.split(','))
return
# prepare data matrix (either local or remote)
if args.indata:
print('loading ', args.indata)
if args.indata.endswith('.bvecs'):
x = bvecs_mmap(args.indata)
elif args.indata.endswith('.fvecs'):
x = fvecs_mmap(args.indata)
elif args.indata.endswith('.npy'):
x = np.load(args.indata, mmap_mode='r')
else:
raise AssertionError
if args.i1 == -1:
args.i1 = len(x)
x = x[args.i0:args.i1]
if args.gpu == -2:
data = DatasetAssign(x)
else:
print('moving to GPU')
data = DatasetAssignGPU(x, args.gpu)
elif args.client:
print('connecting to servers')
def connect_client(hostport):
host, port = hostport.split(':')
port = int(port)
print('connecting %s:%d' % (host, port))
client = rpc.Client(host, port, v6=not args.ipv4)
print('client %s:%d ready' % (host, port))
return client
hostports = args.servers.strip().split(' ')
# pool = ThreadPool(len(hostports))
data = DatasetAssignDispatch(
list(map(connect_client, hostports)),
True
)
else:
raise AssertionError
if args.server:
print('starting server')
log_prefix = f"{rpc.socket.gethostname()}:{args.port}"
rpc.run_server(
lambda s: AssignServer(s, data, log_prefix=log_prefix),
args.port, report_to_file=args.when_ready,
v6=not args.ipv4)
else:
print('running kmeans')
centroids = kmeans(args.k, data, niter=args.niter, seed=args.seed,
checkpoint=args.out if args.store_each_iteration else None)
if args.out != '':
print('writing centroids to', args.out)
np.save(args.out, centroids)
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,70 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import os
import faiss
import numpy as np
import time
from faiss.contrib import rpc
import sys
import combined_index
import search_server
hostnames = sys.argv[1:]
print("Load local index")
ci = combined_index.CombinedIndexDeep1B()
print("connect to clients")
clients = []
for host in hostnames:
client = rpc.Client(host, 12012, v6=False)
clients.append(client)
# check if all servers respond
print("sizes seen by servers:", [cl.get_ntotal() for cl in clients])
# aggregate all clients into a one that uses them all for speed
# note that it also requires a local index ci
sindex = search_server.SplitPerListIndex(ci, clients)
sindex.verbose = True
# set reasonable parameters
ci.set_parallel_mode(1)
ci.set_prefetch_nthread(0)
ci.set_omp_num_threads(64)
# initialize params
sindex.set_parallel_mode(1)
sindex.set_prefetch_nthread(0)
sindex.set_omp_num_threads(64)
def ivecs_read(fname):
a = np.fromfile(fname, dtype='int32')
d = a[0]
return a.reshape(-1, d + 1)[:, 1:].copy()
def fvecs_read(fname):
return ivecs_read(fname).view('float32')
deep1bdir = "/datasets01_101/simsearch/041218/deep1b/"
xq = fvecs_read(deep1bdir + "deep1B_queries.fvecs")
gt_fname = deep1bdir + "deep1B_groundtruth.ivecs"
gt = ivecs_read(gt_fname)
for nprobe in 1, 10, 100, 1000:
sindex.set_nprobe(nprobe)
t0 = time.time()
D, I = sindex.search(xq, 100)
t1 = time.time()
print('nprobe=%d 1-recall@1=%.4f t=%.2fs' % (
nprobe, (I[:, 0] == gt[:, 0]).sum() / len(xq),
t1 - t0
))

View File

@@ -0,0 +1,117 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import os
import time
import numpy as np
import faiss
import argparse
from multiprocessing.pool import ThreadPool
def ivecs_mmap(fname):
a = np.memmap(fname, dtype='int32', mode='r')
d = a[0]
return a.reshape(-1, d + 1)[:, 1:]
def fvecs_mmap(fname):
return ivecs_mmap(fname).view('float32')
def produce_batches(args):
x = fvecs_mmap(args.input)
if args.i1 == -1:
args.i1 = len(x)
print("Iterating on vectors %d:%d from %s by batches of size %d" % (
args.i0, args.i1, args.input, args.bs))
for j0 in range(args.i0, args.i1, args.bs):
j1 = min(j0 + args.bs, args.i1)
yield np.arange(j0, j1), x[j0:j1]
def rate_limited_iter(l):
'a thread pre-fetches the next element while the current one is consumed'
pool = ThreadPool(1)
res = None
def next_or_None():
try:
return next(l)
except StopIteration:
return None
while True:
res_next = pool.apply_async(next_or_None)
if res is not None:
res = res.get()
if res is None:
return
yield res
res = res_next
deep1bdir = "/datasets01_101/simsearch/041218/deep1b/"
workdir = "/checkpoint/matthijs/ondisk_distributed/"
def main():
parser = argparse.ArgumentParser(
description='make index for a subset of the data')
def aa(*args, **kwargs):
group.add_argument(*args, **kwargs)
group = parser.add_argument_group('index type')
aa('--inputindex',
default=workdir + 'trained.faissindex',
help='empty input index to fill in')
aa('--nt', default=-1, type=int, help='nb of openmp threads to use')
group = parser.add_argument_group('db options')
aa('--input', default=deep1bdir + "base.fvecs")
aa('--bs', default=2**18, type=int,
help='batch size for db access')
aa('--i0', default=0, type=int, help='lower bound to index')
aa('--i1', default=-1, type=int, help='upper bound of vectors to index')
group = parser.add_argument_group('output')
aa('-o', default='/tmp/x', help='output index')
aa('--keepquantizer', default=False, action='store_true',
help='by default we remove the data from the quantizer to save space')
args = parser.parse_args()
print('args=', args)
print('start accessing data')
src = produce_batches(args)
print('loading index', args.inputindex)
index = faiss.read_index(args.inputindex)
if args.nt != -1:
faiss.omp_set_num_threads(args.nt)
t0 = time.time()
ntot = 0
for ids, x in rate_limited_iter(src):
print('add %d:%d (%.3f s)' % (ntot, ntot + ids.size, time.time() - t0))
index.add_with_ids(np.ascontiguousarray(x, dtype='float32'), ids)
ntot += ids.size
index_ivf = faiss.extract_index_ivf(index)
print('invlists stats: imbalance %.3f' % index_ivf.invlists.imbalance_factor())
index_ivf.invlists.print_stats()
if not args.keepquantizer:
print('resetting quantizer content')
index_ivf = faiss.extract_index_ivf(index)
index_ivf.quantizer.reset()
print('store output', args.o)
faiss.write_index(index, args.o)
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,52 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import numpy as np
import faiss
deep1bdir = "/datasets01_101/simsearch/041218/deep1b/"
workdir = "/checkpoint/matthijs/ondisk_distributed/"
print('Load centroids')
centroids = np.load(workdir + '1M_centroids.npy')
ncent, d = centroids.shape
print('apply random rotation')
rrot = faiss.RandomRotationMatrix(d, d)
rrot.init(1234)
centroids = rrot.apply_py(centroids)
print('make HNSW index as quantizer')
quantizer = faiss.IndexHNSWFlat(d, 32)
quantizer.hnsw.efSearch = 1024
quantizer.hnsw.efConstruction = 200
quantizer.add(centroids)
print('build index')
index = faiss.IndexPreTransform(
rrot,
faiss.IndexIVFScalarQuantizer(
quantizer, d, ncent, faiss.ScalarQuantizer.QT_6bit
)
)
def ivecs_mmap(fname):
a = np.memmap(fname, dtype='int32', mode='r')
d = a[0]
return a.reshape(-1, d + 1)[:, 1:]
def fvecs_mmap(fname):
return ivecs_mmap(fname).view('float32')
print('finish training index')
xt = fvecs_mmap(deep1bdir + 'learn.fvecs')
xt = np.ascontiguousarray(xt[:256 * 1000], dtype='float32')
index.train(xt)
print('write output')
faiss.write_index(index, workdir + 'trained.faissindex')

View File

@@ -0,0 +1,96 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import os
import faiss
import argparse
from multiprocessing.pool import ThreadPool
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--inputs', nargs='*', required=True,
help='input indexes to merge')
parser.add_argument('--l0', type=int, default=0)
parser.add_argument('--l1', type=int, default=-1)
parser.add_argument('--nt', default=-1, type=int,
help='nb threads')
parser.add_argument('--output', required=True,
help='output index filename')
parser.add_argument('--outputIL',
help='output invfile filename')
args = parser.parse_args()
if args.nt != -1:
print('set nb of threads to', args.nt)
faiss.omp_set_num_threads(args.nt)
ils = faiss.InvertedListsPtrVector()
ils_dont_dealloc = []
pool = ThreadPool(20)
def load_index(fname):
print("loading", fname)
try:
index = faiss.read_index(fname, faiss.IO_FLAG_MMAP | faiss.IO_FLAG_READ_ONLY)
except RuntimeError as e:
print('could not load %s: %s' % (fname, e))
return fname, None
print(" %d entries" % index.ntotal)
return fname, index
index0 = None
for _, index in pool.imap(load_index, args.inputs):
if index is None:
continue
index_ivf = faiss.extract_index_ivf(index)
il = faiss.downcast_InvertedLists(index_ivf.invlists)
index_ivf.invlists = None
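# detach the invlists from the index and give Python ownership so they outlive it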
il.this.own()
ils_dont_dealloc.append(il)
if (args.l0, args.l1) != (0, -1):
print('restricting to lists %d:%d' % (args.l0, args.l1))
# il = faiss.SliceInvertedLists(il, args.l0, args.l1)
il.crop_invlists(args.l0, args.l1)
ils_dont_dealloc.append(il)
ils.push_back(il)
if index0 is None:
index0 = index
print("loaded %d invlists" % ils.size())
if not args.outputIL:
args.outputIL = args.output + '_invlists'
il0 = ils.at(0)
il = faiss.OnDiskInvertedLists(
il0.nlist, il0.code_size,
args.outputIL)
print("perform merge")
ntotal = il.merge_from(ils.data(), ils.size(), True)
print("swap into index0")
index0_ivf = faiss.extract_index_ivf(index0)
index0_ivf.nlist = il0.nlist
index0_ivf.ntotal = index0.ntotal = ntotal
index0_ivf.invlists = il
index0_ivf.own_invlists = False
print("write", args.output)
faiss.write_index(index0, args.output)

View File

@@ -0,0 +1,263 @@
#! /bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
set -e
todo=$1
# other options can be transmitted
shift
# the training data of the Deep1B dataset
deep1bdir=/datasets01_101/simsearch/041218/deep1b
traindata=$deep1bdir/learn.fvecs
# this is for small tests
nvec=1000000
k=4000
# for the real run
# nvec=50000000
# k=1000000
# working directory for the real run
workdir=/checkpoint/matthijs/ondisk_distributed
mkdir -p $workdir/{vslices,hslices}
if [ -z "$todo" ]; then
echo "nothing to do"
exit 1
elif [ $todo == test_kmeans_0 ]; then
# non distributed baseline
python distributed_kmeans.py \
--indata $traindata --i1 $nvec \
--k $k
elif [ $todo == test_kmeans_1 ]; then
# using all the machine's GPUs
python distributed_kmeans.py \
--indata $traindata --i1 $nvec \
--k $k --gpu -1
elif [ $todo == test_kmeans_2 ]; then
# distributed run, with one local server per GPU
ngpu=$( echo /dev/nvidia? | wc -w )
baseport=12012
# kill background processes on exit of this script
trap 'kill -HUP 0' 0
hostports=''
for((gpu=0;gpu<ngpu;gpu++)); do
# range of vectors to assign to each server
i0=$((nvec * gpu / ngpu))
i1=$((nvec * (gpu + 1) / ngpu))
port=$(( baseport + gpu ))
echo "start server $gpu for range $i0:$i1"
python distributed_kmeans.py \
--indata $traindata \
--i0 $i0 --i1 $i1 \
--server --gpu $gpu \
--port $port --ipv4 &
hostports="$hostports localhost:$port"
done
# lame way of making sure all servers are running
sleep 5s
python distributed_kmeans.py \
--client --servers "$hostports" \
--k $k --ipv4
elif [ $todo == slurm_distributed_kmeans ]; then
nserv=5
srun -n$nserv \
--time=48:00:00 \
--cpus-per-task=40 --gres=gpu:4 --mem=100G \
--partition=priority --comment='priority is the only one that works' \
-l bash $( realpath $0 ) slurm_within_kmeans_server
elif [ $todo == slurm_within_kmeans_server ]; then
nserv=$SLURM_NPROCS
[ ! -z "$nserv" ] || (echo "should be run by slurm"; exit 1)
rank=$SLURM_PROCID
baseport=12012
i0=$((nvec * rank / nserv))
i1=$((nvec * (rank + 1) / nserv))
port=$(( baseport + rank ))
echo "host $(hostname) start server $rank for range $i0:$i1 port $port"
if [ $rank != 0 ]; then
python -u distributed_kmeans.py \
--indata $traindata \
--i0 $i0 --i1 $i1 \
--server --gpu -1 \
--port $port --ipv4
else
# master process
# kill background processes on exit of this script
trap 'kill -HUP 0' 0
python -u distributed_kmeans.py \
--indata $traindata \
--i0 $i0 --i1 $i1 \
--server --gpu -1 \
--port $port --ipv4 &
# Slurm has a somewhat convoluted way of specifying the nodes
# assigned to each task. This is to parse the SLURM_TASKS_PER_NODE variable
function parse_tasks_per_node () {
local blocks=$1
for block in ${blocks//,/ }; do
if [ ${block/x/} != $block ]; then
tpn="${block%(*}"
repeat=${block#*x}
repeat=${repeat%?}
for((i=0;i<repeat;i++)); do
echo $tpn
done
else
echo $block
fi
done
}
hostports=""
port=$baseport
echo VARS $SLURM_TASKS_PER_NODE $SLURM_JOB_NODELIST
tasks_per_node=( $( parse_tasks_per_node $SLURM_TASKS_PER_NODE ) )
nodes=( $( scontrol show hostnames $SLURM_JOB_NODELIST ) )
n=${#nodes[*]}
for((i=0;i<n;i++)); do
hostname=${nodes[i]}
for((j=0;j<tasks_per_node[i];j++)); do
hostports="$hostports $hostname:$port"
((port++))
done
done
echo HOSTPORTS $hostports
sleep 20s
# run client
python distributed_kmeans.py \
--client --servers "$hostports" \
--k $k --ipv4 "$@"
echo "Done, kill the job"
scancel $SLURM_JOBID
fi
elif [ $todo == deep1b_clustering ]; then
# also set nvec=500M and k=10M in the top of the file
nserv=20
srun -n$nserv \
--time=48:00:00 \
--cpus-per-task=40 --gres=gpu:4 --mem=100G \
--partition=priority --comment='priority is the only one that works' \
-l bash $( realpath $0 ) slurm_within_kmeans_server \
--out $workdir/1M_centroids.npy
elif [ $todo == make_index_vslices ]; then
# vslice: slice per database shards
nvec=1000000000
nslice=200
for((i=0;i<nslice;i++)); do
i0=$((nvec * i / nslice))
i1=$((nvec * (i + 1) / nslice))
# make the script to be run by sbatch
cat > $workdir/vslices/slice$i.bash <<EOF
#!/bin/bash
srun python -u make_index_vslice.py \
--inputindex $workdir/trained.faissindex \
--input $deep1bdir/base.fvecs \
--nt 40 \
--i0 $i0 --i1 $i1 \
-o $workdir/vslices/slice$i.faissindex
EOF
# specify resources for script and run it
sbatch -n1 \
--time=48:00:00 \
--cpus-per-task=40 --gres=gpu:0 --mem=200G \
--output=$workdir/vslices/slice$i.log \
--job-name=vslice$i.c \
$workdir/vslices/slice$i.bash
echo "logs in $workdir/vslices/slice$i.log"
done
elif [ $todo == make_index_hslices ]; then
# hslice: slice per inverted lists
nlist=1000000
nslice=50
for((i=0;i<nslice;i++)); do
i0=$((nlist * i / nslice))
i1=$((nlist * (i + 1) / nslice))
# make the script to be run by sbatch
cat > $workdir/hslices/slice$i.bash <<EOF
#!/bin/bash
srun python -u merge_to_ondisk.py \
--input $workdir/vslices/slice{0..199}.faissindex \
--nt 20 \
--l0 $i0 --l1 $i1 \
--output $workdir/hslices/slice$i.faissindex \
--outputIL $workdir/hslices/slice$i.invlists
EOF
# specify resources for script and run it
sbatch -n1 \
--time=48:00:00 \
--cpus-per-task=20 --gres=gpu:0 --mem=200G \
--output=$workdir/hslices/slice$i.log \
--job-name=hslice$i.a \
--constraint=pascal \
$workdir/hslices/slice$i.bash
echo "logs in $workdir/hslices/slice$i.log"
done
elif [ $todo == run_search_servers ]; then
nserv=3
srun -n$nserv \
--time=48:00:00 \
--cpus-per-task=64 --gres=gpu:0 --mem=100G \
--constraint=pascal \
--partition=priority --comment='priority is the only one that works' \
-l python -u search_server.py --port 12012
else
echo "unknown todo $todo"
exit 1
fi

View File

@@ -0,0 +1,222 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import time
from faiss.contrib import rpc
import combined_index
import argparse
############################################################
# Server implementation
############################################################
class MyServer(rpc.Server):
""" Assign version that can be exposed via RPC """
def __init__(self, s, index):
rpc.Server.__init__(self, s)
self.index = index
def __getattr__(self, f):
return getattr(self.index, f)
def main():
parser = argparse.ArgumentParser()
def aa(*args, **kwargs):
group.add_argument(*args, **kwargs)
group = parser.add_argument_group('server options')
aa('--port', default=12012, type=int, help='server port')
aa('--when_ready_dir', default=None,
help='store host:port to this file when ready')
aa('--ipv4', default=False, action='store_true', help='force ipv4')
aa('--rank', default=0, type=int,
help='rank used as index in the client table')
args = parser.parse_args()
when_ready = None
if args.when_ready_dir:
when_ready = '%s/%d' % (args.when_ready_dir, args.rank)
print('loading index')
index = combined_index.CombinedIndexDeep1B()
print('starting server')
rpc.run_server(
lambda s: MyServer(s, index),
args.port, report_to_file=when_ready,
v6=not args.ipv4)
if __name__ == '__main__':
main()
############################################################
# Client implementation
############################################################
from multiprocessing.pool import ThreadPool
import faiss
import numpy as np
class ResultHeap:
""" Combine query results from a sliced dataset (for k-nn search) """
def __init__(self, nq, k):
" nq: number of query vectors, k: number of results per query "
self.I = np.zeros((nq, k), dtype='int64')
self.D = np.zeros((nq, k), dtype='float32')
self.nq, self.k = nq, k
heaps = faiss.float_maxheap_array_t()
heaps.k = k
heaps.nh = nq
heaps.val = faiss.swig_ptr(self.D)
heaps.ids = faiss.swig_ptr(self.I)
heaps.heapify()
self.heaps = heaps
def add_batch_result(self, D, I, i0):
assert D.shape == (self.nq, self.k)
assert I.shape == (self.nq, self.k)
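# shift the ids by the slice offset before merging into the global heap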
I += i0
self.heaps.addn_with_ids(
self.k, faiss.swig_ptr(D),
faiss.swig_ptr(I), self.k)
def finalize(self):
self.heaps.reorder()
def distribute_weights(weights, nbin):
""" assign a set of weights to a smaller set of bins to balance them """
nw = weights.size
o = weights.argsort()
bins = np.zeros(nbin)
assign = np.ones(nw, dtype=int)
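# greedy balancing: walk the weights from heaviest to lightest, always filling the lightest bin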
for i in o[::-1]:
b = bins.argmin()
assign[i] = b
bins[b] += weights[i]
return bins, assign
class SplitPerListIndex:
"""manages a local index, that does the coarse quantization and a set
of sub_indexes. The sub_indexes search a subset of the inverted
lists. The SplitPerListIndex merges results from the sub-indexes"""
def __init__(self, index, sub_indexes):
self.index = index
self.code_size = faiss.extract_index_ivf(index.index).code_size
self.sub_indexes = sub_indexes
self.ni = len(self.sub_indexes)
# pool of threads. Each thread manages one sub-index.
self.pool = ThreadPool(self.ni)
self.verbose = False
def set_nprobe(self, nprobe):
self.index.set_nprobe(nprobe)
self.pool.map(
lambda i: self.sub_indexes[i].set_nprobe(nprobe),
range(self.ni)
)
def set_omp_num_threads(self, nt):
faiss.omp_set_num_threads(nt)
self.pool.map(
lambda idx: idx.set_omp_num_threads(nt),
self.sub_indexes
)
def set_parallel_mode(self, pm):
self.index.set_parallel_mode(pm)
self.pool.map(
lambda idx: idx.set_parallel_mode(pm),
self.sub_indexes
)
def set_prefetch_nthread(self, nt):
self.index.set_prefetch_nthread(nt)
self.pool.map(
lambda idx: idx.set_prefetch_nthread(nt),
self.sub_indexes
)
def balance_lists(self, list_nos):
big_il = self.index.big_il
weights = np.array([big_il.list_size(int(i))
for i in list_nos.ravel()])
bins, assign = distribute_weights(weights, self.ni)
if self.verbose:
print('bins weight range %d:%d total %d (%.2f MiB)' % (
bins.min(), bins.max(), bins.sum(),
bins.sum() * (self.code_size + 8) / 2 ** 20))
self.nscan = bins.sum()
return assign.reshape(list_nos.shape)
def search(self, x, k):
xqo, list_nos, coarse_dis = self.index.transform_and_assign(x)
assign = self.balance_lists(list_nos)
def do_query(i):
sub_index = self.sub_indexes[i]
list_nos_i = list_nos.copy()
list_nos_i[assign != i] = -1
t0 = time.time()
Di, Ii = sub_index.ivf_search_preassigned(
xqo, list_nos_i, coarse_dis, k)
#print(list_nos_i, Ii)
if self.verbose:
print('client %d: %.3f s' % (i, time.time() - t0))
return Di, Ii
rh = ResultHeap(x.shape[0], k)
for Di, Ii in self.pool.imap(do_query, range(self.ni)):
#print("ADD", Ii, rh.I)
rh.add_batch_result(Di, Ii, 0)
rh.finalize()
return rh.D, rh.I
def range_search(self, x, radius):
xqo, list_nos, coarse_dis = self.index.transform_and_assign(x)
assign = self.balance_lists(list_nos)
nq = len(x)
def do_query(i):
sub_index = self.sub_indexes[i]
list_nos_i = list_nos.copy()
list_nos_i[assign != i] = -1
t0 = time.time()
limi, Di, Ii = sub_index.ivf_range_search_preassigned(
xqo, list_nos_i, coarse_dis, radius)
if self.verbose:
print('slice %d: %.3f s' % (i, time.time() - t0))
return limi, Di, Ii
D = [[] for i in range(nq)]
I = [[] for i in range(nq)]
sizes = np.zeros(nq, dtype=int)
for lims, Di, Ii in self.pool.imap(do_query, range(self.ni)):
for i in range(nq):
l0, l1 = lims[i:i + 2]
D[i].append(Di[l0:l1])
I[i].append(Ii[l0:l1])
sizes[i] += l1 - l0
lims = np.zeros(nq + 1, dtype=int)
lims[1:] = np.cumsum(sizes)
D = np.hstack([j for i in D for j in i])
I = np.hstack([j for i in I for j in i])
return lims, D, I

View File

@@ -0,0 +1,88 @@
#! /usr/bin/env python2
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from __future__ import print_function
import numpy as np
import time
import faiss
import sys
# Get command-line arguments
k = int(sys.argv[1])
ngpu = int(sys.argv[2])
# Load Leon's file format
def load_mnist(fname):
print("load", fname)
f = open(fname, 'rb')
header = np.fromfile(f, dtype='int8', count=4*4)
header = header.reshape(4, 4)[:, ::-1].copy().view('int32')
print(header)
nim, xd, yd = [int(x) for x in header[1:]]
data = np.fromfile(f, count=nim * xd * yd,
dtype='uint8')
print(data.shape, nim, xd, yd)
data = data.reshape(nim, xd, yd)
return data
basedir = "/path/to/mnist/data"
x = load_mnist(basedir + 'mnist8m/mnist8m-patterns-idx3-ubyte')
print("reshape")
x = x.reshape(x.shape[0], -1).astype('float32')
def train_kmeans(x, k, ngpu):
"Runs kmeans on one or several GPUs"
d = x.shape[1]
clus = faiss.Clustering(d, k)
clus.verbose = True
clus.niter = 20
# otherwise the kmeans implementation sub-samples the training set
clus.max_points_per_centroid = 10000000
res = [faiss.StandardGpuResources() for i in range(ngpu)]
flat_config = []
for i in range(ngpu):
cfg = faiss.GpuIndexFlatConfig()
cfg.useFloat16 = False
cfg.device = i
flat_config.append(cfg)
if ngpu == 1:
index = faiss.GpuIndexFlatL2(res[0], d, flat_config[0])
else:
indexes = [faiss.GpuIndexFlatL2(res[i], d, flat_config[i])
for i in range(ngpu)]
index = faiss.IndexReplicas()
for sub_index in indexes:
index.addIndex(sub_index)
# perform the training
clus.train(x, index)
centroids = faiss.vector_float_to_array(clus.centroids)
obj = faiss.vector_float_to_array(clus.obj)
print("final objective: %.4g" % obj[-1])
return centroids.reshape(k, d)
print("run")
t0 = time.time()
train_kmeans(x, k, ngpu)
t1 = time.time()
print("total runtime: %.3f s" % (t1 - t0))

View File

@@ -0,0 +1,25 @@
README for the link & code implementation
=========================================
What is this?
-------------
Link & code is an indexing method that combines HNSW indexing with
compression and exploits the neighborhood structure of the similarity
graph to improve the reconstruction. It is described in
```
@inproceedings{link_and_code,
author = {Matthijs Douze and Alexandre Sablayrolles and Herv\'e J\'egou},
title = {Link and code: Fast indexing with graphs and compact regression codes},
booktitle = {CVPR},
year = {2018}
}
```
ArXiV [here](https://arxiv.org/abs/1804.09996)
The necessary code for this paper was removed from Faiss in version 1.8.0.
For a functioning version, use Faiss 1.7.4.

View File

@@ -0,0 +1,113 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#include "AutoTune_c.h"
#include <faiss/AutoTune.h>
#include <cstring>
#include "macros_impl.h"
using faiss::Index;
using faiss::ParameterRange;
using faiss::ParameterSpace;
const char* faiss_ParameterRange_name(const FaissParameterRange* range) {
return reinterpret_cast<const ParameterRange*>(range)->name.c_str();
}
void faiss_ParameterRange_values(
FaissParameterRange* range,
double** p_values,
size_t* p_size) {
auto& values = reinterpret_cast<ParameterRange*>(range)->values;
*p_values = values.data();
*p_size = values.size();
}
int faiss_ParameterSpace_new(FaissParameterSpace** space) {
try {
auto new_space = new ParameterSpace();
*space = reinterpret_cast<FaissParameterSpace*>(new_space);
}
CATCH_AND_HANDLE
}
DEFINE_DESTRUCTOR(ParameterSpace)
size_t faiss_ParameterSpace_n_combinations(const FaissParameterSpace* space) {
return reinterpret_cast<const ParameterSpace*>(space)->n_combinations();
}
int faiss_ParameterSpace_combination_name(
const FaissParameterSpace* space,
size_t cno,
char* char_buffer,
size_t size) {
try {
auto rep = reinterpret_cast<const ParameterSpace*>(space)
->combination_name(cno);
strncpy(char_buffer, rep.c_str(), size);
}
CATCH_AND_HANDLE
}
int faiss_ParameterSpace_set_index_parameters(
const FaissParameterSpace* space,
FaissIndex* cindex,
const char* param_string) {
try {
auto index = reinterpret_cast<Index*>(cindex);
reinterpret_cast<const ParameterSpace*>(space)->set_index_parameters(
index, param_string);
}
CATCH_AND_HANDLE
}
/// set a combination of parameters on an index
int faiss_ParameterSpace_set_index_parameters_cno(
const FaissParameterSpace* space,
FaissIndex* cindex,
size_t cno) {
try {
auto index = reinterpret_cast<Index*>(cindex);
reinterpret_cast<const ParameterSpace*>(space)->set_index_parameters(
index, cno);
}
CATCH_AND_HANDLE
}
int faiss_ParameterSpace_set_index_parameter(
const FaissParameterSpace* space,
FaissIndex* cindex,
const char* name,
double value) {
try {
auto index = reinterpret_cast<Index*>(cindex);
reinterpret_cast<const ParameterSpace*>(space)->set_index_parameter(
index, name, value);
}
CATCH_AND_HANDLE
}
void faiss_ParameterSpace_display(const FaissParameterSpace* space) {
reinterpret_cast<const ParameterSpace*>(space)->display();
}
int faiss_ParameterSpace_add_range(
FaissParameterSpace* space,
const char* name,
FaissParameterRange** p_range) {
try {
ParameterRange& range =
reinterpret_cast<ParameterSpace*>(space)->add_range(name);
if (p_range) {
*p_range = reinterpret_cast<FaissParameterRange*>(&range);
}
}
CATCH_AND_HANDLE
}

View File

@@ -0,0 +1,82 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c -*-
#ifndef FAISS_AUTO_TUNE_C_H
#define FAISS_AUTO_TUNE_C_H
#include "Index_c.h"
#include "faiss_c.h"
#ifdef __cplusplus
extern "C" {
#endif
/// possible values of a parameter, sorted from least to most expensive/accurate
FAISS_DECLARE_CLASS(ParameterRange)
FAISS_DECLARE_GETTER(ParameterRange, const char*, name)
/// Getter for the values in the range. The output values are invalidated
/// upon any other modification of the range.
void faiss_ParameterRange_values(FaissParameterRange*, double**, size_t*);
/** Uses a-priori knowledge on the Faiss indexes to extract tunable parameters.
*/
FAISS_DECLARE_CLASS(ParameterSpace)
FAISS_DECLARE_DESTRUCTOR(ParameterSpace)
/// Parameter space default constructor
int faiss_ParameterSpace_new(FaissParameterSpace** space);
/// nb of combinations, = product of values sizes
size_t faiss_ParameterSpace_n_combinations(const FaissParameterSpace*);
/// get string representation of the combination
/// by writing it to the given character buffer.
/// A buffer size of 1000 ensures that the full name is collected.
int faiss_ParameterSpace_combination_name(
const FaissParameterSpace*,
size_t,
char*,
size_t);
/// set a combination of parameters described by a string
int faiss_ParameterSpace_set_index_parameters(
const FaissParameterSpace*,
FaissIndex*,
const char*);
/// set a combination of parameters on an index
int faiss_ParameterSpace_set_index_parameters_cno(
const FaissParameterSpace*,
FaissIndex*,
size_t);
/// set one of the parameters
int faiss_ParameterSpace_set_index_parameter(
const FaissParameterSpace*,
FaissIndex*,
const char*,
double);
/// print a description on stdout
void faiss_ParameterSpace_display(const FaissParameterSpace*);
/// add a new parameter (or return it if it exists)
int faiss_ParameterSpace_add_range(
FaissParameterSpace*,
const char*,
FaissParameterRange**);
#ifdef __cplusplus
}
#endif
#endif

View File

@@ -0,0 +1,166 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
cmake_minimum_required(VERSION 3.17 FATAL_ERROR)
project(faiss_c_library LANGUAGES C CXX)
set(CMAKE_C_STANDARD 11)
set(FAISS_C_SRC
AutoTune_c.cpp
Clustering_c.cpp
IndexFlat_c.cpp
IndexIVFFlat_c.cpp
IndexIVF_c.cpp
IndexLSH_c.cpp
IndexPreTransform_c.cpp
VectorTransform_c.cpp
IndexShards_c.cpp
IndexReplicas_c.cpp
Index_c.cpp
IndexBinary_c.cpp
IndexScalarQuantizer_c.cpp
MetaIndexes_c.cpp
clone_index_c.cpp
error_impl.cpp
index_factory_c.cpp
index_io_c.cpp
impl/AuxIndexStructures_c.cpp
utils/distances_c.cpp
utils/utils_c.cpp
)
add_library(faiss_c ${FAISS_C_SRC})
target_link_libraries(faiss_c PRIVATE faiss)
add_library(faiss_c_avx2 ${FAISS_C_SRC})
target_link_libraries(faiss_c_avx2 PRIVATE faiss_avx2)
if(NOT FAISS_OPT_LEVEL STREQUAL "avx2" AND NOT FAISS_OPT_LEVEL STREQUAL "avx512" AND NOT FAISS_OPT_LEVEL STREQUAL "avx512_spr")
set_target_properties(faiss_c_avx2 PROPERTIES EXCLUDE_FROM_ALL TRUE)
endif()
if(NOT WIN32)
target_compile_options(faiss_c_avx2 PRIVATE $<$<COMPILE_LANGUAGE:CXX>:-mavx2 -mfma -mf16c -mpopcnt>)
else()
# MSVC enables FMA with /arch:AVX2; no separate flags for F16C, POPCNT
# Ref. FMA (under /arch:AVX2): https://docs.microsoft.com/en-us/cpp/build/reference/arch-x64
# Ref. F16C (2nd paragraph): https://walbourn.github.io/directxmath-avx2/
# Ref. POPCNT: https://docs.microsoft.com/en-us/cpp/intrinsics/popcnt16-popcnt-popcnt64
target_compile_options(faiss_c_avx2 PRIVATE $<$<COMPILE_LANGUAGE:CXX>:/arch:AVX2>)
endif()
add_library(faiss_c_avx512 ${FAISS_C_SRC})
target_link_libraries(faiss_c_avx512 PRIVATE faiss_avx512)
if(NOT FAISS_OPT_LEVEL STREQUAL "avx512")
set_target_properties(faiss_c_avx512 PROPERTIES EXCLUDE_FROM_ALL TRUE)
endif()
if(NOT WIN32)
# All modern CPUs support F, CD, VL, DQ, BW extensions.
# Ref: https://en.wikipedia.org/wiki/AVX512
target_compile_options(faiss_c_avx512 PRIVATE $<$<COMPILE_LANGUAGE:CXX>:-mavx2 -mfma -mf16c -mavx512f -mavx512cd -mavx512vl -mavx512dq -mavx512bw -mpopcnt>)
else()
target_compile_options(faiss_c_avx512 PRIVATE $<$<COMPILE_LANGUAGE:CXX>:/arch:AVX512>)
endif()
add_library(faiss_c_avx512_spr ${FAISS_C_SRC})
target_link_libraries(faiss_c_avx512_spr PRIVATE faiss_avx512_spr)
if(NOT FAISS_OPT_LEVEL STREQUAL "avx512_spr")
set_target_properties(faiss_c_avx512_spr PROPERTIES EXCLUDE_FROM_ALL TRUE)
endif()
if(NOT WIN32)
# Architecture mode to support AVX512 extensions available since Intel(R) Sapphire Rapids.
# Ref: https://networkbuilders.intel.com/solutionslibrary/intel-avx-512-fp16-instruction-set-for-intel-xeon-processor-based-products-technology-guide
target_compile_options(faiss_c_avx512_spr PRIVATE $<$<COMPILE_LANGUAGE:CXX>:-march=sapphirerapids -mtune=sapphirerapids>)
else()
target_compile_options(faiss_c_avx512_spr PRIVATE $<$<COMPILE_LANGUAGE:CXX>:/arch:AVX512>)
endif()
add_library(faiss_c_sve ${FAISS_C_SRC})
target_link_libraries(faiss_c_sve PRIVATE faiss_sve)
if(NOT FAISS_OPT_LEVEL STREQUAL "sve")
set_target_properties(faiss_c_sve PROPERTIES EXCLUDE_FROM_ALL TRUE)
endif()
if(NOT WIN32)
if("${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_DEBUG} " MATCHES "(^| )-march=native")
# Do nothing, expect SVE to be enabled by -march=native
elseif("${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_DEBUG} " MATCHES "(^| )(-march=armv[0-9]+(\\.[1-9]+)?-[^+ ](\\+[^+$ ]+)*)")
# Add +sve
target_compile_options(faiss_c_sve PRIVATE $<$<AND:$<COMPILE_LANGUAGE:CXX>,$<CONFIG:DEBUG>>:${CMAKE_MATCH_2}+sve>)
elseif(NOT "${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_DEBUG} " MATCHES "(^| )-march=armv")
# No valid -march, so specify -march=armv8-a+sve as the default
target_compile_options(faiss_c_sve PRIVATE $<$<AND:$<COMPILE_LANGUAGE:CXX>,$<CONFIG:DEBUG>>:-march=armv8-a+sve>)
endif()
if("${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_RELEASE} " MATCHES "(^| )-march=native")
# Do nothing, expect SVE to be enabled by -march=native
elseif("${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_RELEASE} " MATCHES "(^| )(-march=armv[0-9]+(\\.[1-9]+)?-[^+ ](\\+[^+$ ]+)*)")
# Add +sve
target_compile_options(faiss_c_sve PRIVATE $<$<AND:$<COMPILE_LANGUAGE:CXX>,$<CONFIG:RELEASE>>:${CMAKE_MATCH_2}+sve>)
elseif(NOT "${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_RELEASE} " MATCHES "(^| )-march=armv")
# No valid -march, so specify -march=armv8-a+sve as the default
target_compile_options(faiss_c_sve PRIVATE $<$<AND:$<COMPILE_LANGUAGE:CXX>,$<CONFIG:RELEASE>>:-march=armv8-a+sve>)
endif()
endif()
function(faiss_install_headers headers p)
foreach(h ${headers})
get_filename_component(f ${h} DIRECTORY)
install(FILES ${h}
DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/faiss/${p}/${f}
)
endforeach()
endfunction()
file(GLOB FAISS_C_API_HEADERS
RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
"*.h"
"impl/*.h"
"utils/*.h")
faiss_install_headers("${FAISS_C_API_HEADERS}" c_api)
install(TARGETS faiss_c
EXPORT faiss-targets
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
)
if(FAISS_OPT_LEVEL STREQUAL "avx2")
install(TARGETS faiss_c_avx2
EXPORT faiss-targets
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
)
endif()
if(FAISS_OPT_LEVEL STREQUAL "avx512")
install(TARGETS faiss_c_avx2 faiss_c_avx512
EXPORT faiss-targets
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
)
endif()
if(FAISS_OPT_LEVEL STREQUAL "avx512_spr")
install(TARGETS faiss_c_avx2 faiss_c_avx512_spr
EXPORT faiss-targets
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
)
endif()
if(FAISS_OPT_LEVEL STREQUAL "sve")
install(TARGETS faiss_c_sve
EXPORT faiss-targets
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
)
endif()
add_executable(example_c EXCLUDE_FROM_ALL example_c.c)
target_link_libraries(example_c PRIVATE faiss_c)
if(FAISS_ENABLE_GPU)
if(FAISS_ENABLE_ROCM)
add_subdirectory(gpu-rocm)
else ()
add_subdirectory(gpu)
endif()
endif()

View File

@@ -0,0 +1,170 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#include "Clustering_c.h"
#include <faiss/Clustering.h>
#include <faiss/Index.h>
#include <vector>
#include "macros_impl.h"
extern "C" {
using faiss::Clustering;
using faiss::ClusteringIterationStats;
using faiss::ClusteringParameters;
using faiss::Index;
DEFINE_GETTER(Clustering, int, niter)
DEFINE_GETTER(Clustering, int, nredo)
DEFINE_GETTER(Clustering, int, verbose)
DEFINE_GETTER(Clustering, int, spherical)
DEFINE_GETTER(Clustering, int, int_centroids)
DEFINE_GETTER(Clustering, int, update_index)
DEFINE_GETTER(Clustering, int, frozen_centroids)
DEFINE_GETTER(Clustering, int, min_points_per_centroid)
DEFINE_GETTER(Clustering, int, max_points_per_centroid)
DEFINE_GETTER(Clustering, int, seed)
DEFINE_GETTER(Clustering, size_t, decode_block_size)
/// getter for d
DEFINE_GETTER(Clustering, size_t, d)
/// getter for k
DEFINE_GETTER(Clustering, size_t, k)
DEFINE_GETTER(ClusteringIterationStats, float, obj)
DEFINE_GETTER(ClusteringIterationStats, double, time)
DEFINE_GETTER(ClusteringIterationStats, double, time_search)
DEFINE_GETTER(ClusteringIterationStats, double, imbalance_factor)
DEFINE_GETTER(ClusteringIterationStats, int, nsplit)
void faiss_ClusteringParameters_init(FaissClusteringParameters* params) {
ClusteringParameters d;
params->frozen_centroids = d.frozen_centroids;
params->max_points_per_centroid = d.max_points_per_centroid;
params->min_points_per_centroid = d.min_points_per_centroid;
params->niter = d.niter;
params->nredo = d.nredo;
params->seed = d.seed;
params->spherical = d.spherical;
params->int_centroids = d.int_centroids;
params->update_index = d.update_index;
params->verbose = d.verbose;
params->decode_block_size = d.decode_block_size;
}
// This conversion is required because the two types are not memory-compatible
inline ClusteringParameters from_faiss_c(
const FaissClusteringParameters* params) {
ClusteringParameters o;
o.frozen_centroids = params->frozen_centroids;
o.max_points_per_centroid = params->max_points_per_centroid;
o.min_points_per_centroid = params->min_points_per_centroid;
o.niter = params->niter;
o.nredo = params->nredo;
o.seed = params->seed;
o.spherical = params->spherical;
o.update_index = params->update_index;
o.int_centroids = params->int_centroids;
o.verbose = params->verbose;
o.decode_block_size = params->decode_block_size;
return o;
}
/// getter for centroids (size = k * d)
void faiss_Clustering_centroids(
FaissClustering* clustering,
float** centroids,
size_t* size) {
std::vector<float>& v =
reinterpret_cast<Clustering*>(clustering)->centroids;
if (centroids) {
*centroids = v.data();
}
if (size) {
*size = v.size();
}
}
/// getter for iteration stats
void faiss_Clustering_iteration_stats(
FaissClustering* clustering,
FaissClusteringIterationStats** iteration_stats,
size_t* size) {
std::vector<ClusteringIterationStats>& v =
reinterpret_cast<Clustering*>(clustering)->iteration_stats;
if (iteration_stats) {
*iteration_stats =
reinterpret_cast<FaissClusteringIterationStats*>(v.data());
}
if (size) {
*size = v.size();
}
}
/// the only mandatory parameters are k and d
int faiss_Clustering_new(FaissClustering** p_clustering, int d, int k) {
try {
Clustering* c = new Clustering(d, k);
*p_clustering = reinterpret_cast<FaissClustering*>(c);
return 0;
}
CATCH_AND_HANDLE
}
int faiss_Clustering_new_with_params(
FaissClustering** p_clustering,
int d,
int k,
const FaissClusteringParameters* cp) {
try {
Clustering* c = new Clustering(d, k, from_faiss_c(cp));
*p_clustering = reinterpret_cast<FaissClustering*>(c);
return 0;
}
CATCH_AND_HANDLE
}
/// Index is used during the assignment stage
int faiss_Clustering_train(
FaissClustering* clustering,
idx_t n,
const float* x,
FaissIndex* index) {
try {
reinterpret_cast<Clustering*>(clustering)
->train(n, x, *reinterpret_cast<Index*>(index));
return 0;
}
CATCH_AND_HANDLE
}
void faiss_Clustering_free(FaissClustering* clustering) {
delete reinterpret_cast<Clustering*>(clustering);
}
int faiss_kmeans_clustering(
size_t d,
size_t n,
size_t k,
const float* x,
float* centroids,
float* q_error) {
try {
float out = faiss::kmeans_clustering(d, n, k, x, centroids);
if (q_error) {
*q_error = out;
}
return 0;
}
CATCH_AND_HANDLE
}
}

View File

@@ -0,0 +1,138 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c -*-
#ifndef FAISS_CLUSTERING_C_H
#define FAISS_CLUSTERING_C_H
#include "Index_c.h"
#include "faiss_c.h"
#ifdef __cplusplus
extern "C" {
#endif
/** Class for the clustering parameters. Can be passed to the
* constructor of the Clustering object.
*/
typedef struct FaissClusteringParameters {
int niter; ///< clustering iterations
int nredo; ///< redo clustering this many times and keep best
int verbose; ///< (bool)
int spherical; ///< (bool) do we want normalized centroids?
int int_centroids; ///< (bool) round centroids coordinates to integer
int update_index; ///< (bool) update index after each iteration?
int frozen_centroids; ///< (bool) use the centroids provided as input and do
///< not change them during iterations
int min_points_per_centroid; ///< otherwise you get a warning
int max_points_per_centroid; ///< to limit size of dataset
int seed; ///< seed for the random number generator
size_t decode_block_size; ///< how many vectors at a time to decode
} FaissClusteringParameters;
/// Sets the ClusteringParameters object with reasonable defaults
void faiss_ClusteringParameters_init(FaissClusteringParameters* params);
/** clustering based on assignment - centroid update iterations
*
* The clustering is based on an Index object that assigns training
* points to the centroids. Therefore, at each iteration the centroids
* are added to the index.
*
* On output, the centroids table is set to the latest version
* of the centroids and they are also added to the index. If the
 * centroids table is not empty on input, it is also used for
* initialization.
*
* To do several clusterings, just call train() several times on
* different training sets, clearing the centroid table in between.
*/
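/* Illustrative usage sketch (editor's addition, not part of the original
 * header). It only uses functions declared in this C API; `d`, `k`, `n` and a
 * training buffer `x` of size n * d are assumed, and error codes are elided:
 *
 *   FaissClusteringParameters params;
 *   faiss_ClusteringParameters_init(&params);  // start from the defaults
 *   params.niter = 25;
 *
 *   FaissClustering* clus = NULL;
 *   faiss_Clustering_new_with_params(&clus, d, k, &params);
 *
 *   // a flat L2 index acts as the assignment index during training
 *   FaissIndexFlatL2* assign_index = NULL;
 *   faiss_IndexFlatL2_new_with(&assign_index, d);
 *   faiss_Clustering_train(clus, n, x, (FaissIndex*)assign_index);
 *
 *   float* centroids = NULL;
 *   size_t centroids_size = 0;  // will be k * d on return
 *   faiss_Clustering_centroids(clus, &centroids, &centroids_size);
 *
 *   faiss_Clustering_free(clus);
 *   faiss_Index_free((FaissIndex*)assign_index);
 */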
FAISS_DECLARE_CLASS(Clustering)
FAISS_DECLARE_GETTER(Clustering, int, niter)
FAISS_DECLARE_GETTER(Clustering, int, nredo)
FAISS_DECLARE_GETTER(Clustering, int, verbose)
FAISS_DECLARE_GETTER(Clustering, int, spherical)
FAISS_DECLARE_GETTER(Clustering, int, int_centroids)
FAISS_DECLARE_GETTER(Clustering, int, update_index)
FAISS_DECLARE_GETTER(Clustering, int, frozen_centroids)
FAISS_DECLARE_GETTER(Clustering, int, min_points_per_centroid)
FAISS_DECLARE_GETTER(Clustering, int, max_points_per_centroid)
FAISS_DECLARE_GETTER(Clustering, int, seed)
FAISS_DECLARE_GETTER(Clustering, size_t, decode_block_size)
/// getter for d
FAISS_DECLARE_GETTER(Clustering, size_t, d)
/// getter for k
FAISS_DECLARE_GETTER(Clustering, size_t, k)
FAISS_DECLARE_CLASS(ClusteringIterationStats)
FAISS_DECLARE_GETTER(ClusteringIterationStats, float, obj)
FAISS_DECLARE_GETTER(ClusteringIterationStats, double, time)
FAISS_DECLARE_GETTER(ClusteringIterationStats, double, time_search)
FAISS_DECLARE_GETTER(ClusteringIterationStats, double, imbalance_factor)
FAISS_DECLARE_GETTER(ClusteringIterationStats, int, nsplit)
/// getter for centroids (size = k * d)
void faiss_Clustering_centroids(
FaissClustering* clustering,
float** centroids,
size_t* size);
/// getter for iteration stats
void faiss_Clustering_iteration_stats(
FaissClustering* clustering,
FaissClusteringIterationStats** iteration_stats,
size_t* size);
/// the only mandatory parameters are k and d
int faiss_Clustering_new(FaissClustering** p_clustering, int d, int k);
int faiss_Clustering_new_with_params(
FaissClustering** p_clustering,
int d,
int k,
const FaissClusteringParameters* cp);
int faiss_Clustering_train(
FaissClustering* clustering,
idx_t n,
const float* x,
FaissIndex* index);
void faiss_Clustering_free(FaissClustering* clustering);
/** simplified interface
*
* @param d dimension of the data
* @param n nb of training vectors
* @param k nb of output centroids
* @param x training set (size n * d)
* @param centroids output centroids (size k * d)
* @param q_error final quantization error
* @return error code
*/
int faiss_kmeans_clustering(
size_t d,
size_t n,
size_t k,
const float* x,
float* centroids,
float* q_error);
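/* Minimal sketch of the simplified interface (editor's illustration; assumes
 * `x` holds n * d training floats and `centroids` has room for k * d floats):
 *
 *   float q_error = 0.0f;
 *   if (faiss_kmeans_clustering(d, n, k, x, centroids, &q_error) != 0) {
 *       // inspect faiss_get_last_error() from error_c.h
 *   }
 */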
#ifdef __cplusplus
}
#endif
#endif

View File

@@ -0,0 +1,104 @@
Faiss C API
===========
Faiss provides a pure C interface, which can subsequently be used either in pure C programs or to produce bindings for programming languages with Foreign Function Interface (FFI) support. Although this is not required for the Python interface, some other programming languages (e.g. Rust and Julia) do not have SWIG support.
Compilation instructions
------------------------
The full contents of the pure C API are in the ["c_api"](c_api/) folder.
Please be sure to follow the instructions on [building the main C++ library](../INSTALL.md#step-1-compiling-the-c-faiss) first.
Add `-DFAISS_ENABLE_C_API=ON` to the cmake command.
`make -C build`
This builds the dynamic library "faiss_c", containing the full implementation of Faiss and the necessary wrappers for the C interface. It does not depend on libfaiss.a or the C++ standard library.
To build the example program, you should run `make -C build example_c` at the top level of
the faiss repo. The example program will be in `build/c_api/example_c`.
Using the API
-------------
The C API is composed of:
- A set of C header files comprising the main Faiss interfaces, converted for use in C. Each file follows the format `«name»_c.h`, where `«name»` is the respective name from the C++ API. For example, the file [Index_c.h](./Index_c.h) file corresponds to the base `Index` API. Functions are declared with the `faiss_` prefix (e.g. `faiss_IndexFlat_new`), whereas new types have the `Faiss` prefix (e.g. `FaissIndex`, `FaissMetricType`, ...).
- A dynamic library, compiled from the sources in the same folder, containing the implementation of the library and the wrapper functions.
The index factory is available via the `faiss_index_factory` function in `AutoTune_c.h`:
```c
FaissIndex* index = NULL;
int c = faiss_index_factory(&index, 64, "Flat", METRIC_L2);
if (c) {
// operation failed
}
```
Most operations that you would find as member functions are available with the format `faiss_«classname»_«member»`.
```c
idx_t ntotal = faiss_Index_ntotal(index);
```
Since this is C, the index needs to be freed manually in the end:
```c
faiss_Index_free(index);
```
Error handling is done by examining the error code returned by operations with recoverable errors.
The code identifies the type of exception that arose in the implementation. Fetching the
corresponding error message can be done by calling the function `faiss_get_last_error()` from
`error_c.h`. Getter functions and `free` functions do not return an error code.
```c
int c = faiss_Index_add(index, nb, xb);
if (c) {
printf("%s", faiss_get_last_error());
exit(-1);
}
```
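Putting these pieces together, the sketch below shows a minimal end-to-end program. It is an illustration rather than the bundled example: `faiss_index_factory` and `faiss_Index_add` appear above, while `faiss_Index_search` is assumed from `Index_c.h`, and the installed header layout `faiss/c_api/...` is assumed as well.
```c
#include <stdio.h>
#include <stdlib.h>

#include <faiss/c_api/AutoTune_c.h>
#include <faiss/c_api/Index_c.h>
#include <faiss/c_api/error_c.h>

/* abort on any recoverable error, printing its message */
#define CHECK(rc)                                   \
    if (rc) {                                       \
        printf("%s\n", faiss_get_last_error());     \
        exit(-1);                                   \
    }

int main(void) {
    const idx_t d = 64, nb = 1000, k = 5;
    float* xb = malloc(sizeof(float) * d * nb);
    for (idx_t i = 0; i < d * nb; i++) {
        xb[i] = (float)rand() / RAND_MAX;
    }

    FaissIndex* index = NULL;
    CHECK(faiss_index_factory(&index, d, "Flat", METRIC_L2));
    CHECK(faiss_Index_add(index, nb, xb));

    /* search the index with its own first vector as the query */
    float distances[5];
    idx_t labels[5];
    CHECK(faiss_Index_search(index, 1, xb, k, distances, labels));
    printf("nearest id: %lld (expected 0)\n", (long long)labels[0]);

    faiss_Index_free(index);
    free(xb);
    return 0;
}
```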
An example is included; it is not part of the default build target and can be built separately:
`make -C build example_c`
Building with GPU support
-------------------------
For GPU support, a separate dynamic library in the "c_api/gpu" directory needs to be built.
`make`
The "gpufaiss_c" dynamic library contains the GPU and CPU implementations of Faiss, which means that
it can be used in place of "faiss_c". The same library will dynamically link with the CUDA runtime
and cuBLAS.
Using the GPU with the C API
----------------------------
A standard GPU resources object can be obtained by the name `FaissStandardGpuResources`:
```c
FaissStandardGpuResources* gpu_res = NULL;
int c = faiss_StandardGpuResources_new(&gpu_res);
if (c) {
printf("%s", faiss_get_last_error());
exit(-1);
}
```
Similarly to the C++ API, a CPU index can be converted to a GPU index:
```c
FaissIndex* cpu_index = NULL;
int c = faiss_index_factory(&cpu_index, d, "Flat", METRIC_L2);
if (c) { /* ... */ }
FaissGpuIndex* gpu_index = NULL;
c = faiss_index_cpu_to_gpu(gpu_res, 0, cpu_index, &gpu_index);
if (c) { /* ... */ }
```
A more complete example is available by the name `bin/example_gpu_c`.

View File

@@ -0,0 +1,142 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#include "IndexBinary_c.h"
#include <faiss/IndexBinary.h>
#include "macros_impl.h"
extern "C" {
DEFINE_DESTRUCTOR(IndexBinary)
DEFINE_GETTER(IndexBinary, int, d)
DEFINE_GETTER(IndexBinary, int, is_trained)
DEFINE_GETTER(IndexBinary, idx_t, ntotal)
DEFINE_GETTER(IndexBinary, FaissMetricType, metric_type)
DEFINE_GETTER(IndexBinary, int, verbose);
DEFINE_SETTER(IndexBinary, int, verbose);
int faiss_IndexBinary_train(
FaissIndexBinary* index,
idx_t n,
const uint8_t* x) {
try {
reinterpret_cast<faiss::IndexBinary*>(index)->train(n, x);
}
CATCH_AND_HANDLE
}
int faiss_IndexBinary_add(FaissIndexBinary* index, idx_t n, const uint8_t* x) {
try {
reinterpret_cast<faiss::IndexBinary*>(index)->add(n, x);
}
CATCH_AND_HANDLE
}
int faiss_IndexBinary_add_with_ids(
FaissIndexBinary* index,
idx_t n,
const uint8_t* x,
const idx_t* xids) {
try {
reinterpret_cast<faiss::IndexBinary*>(index)->add_with_ids(n, x, xids);
}
CATCH_AND_HANDLE
}
int faiss_IndexBinary_search(
const FaissIndexBinary* index,
idx_t n,
const uint8_t* x,
idx_t k,
int32_t* distances,
idx_t* labels) {
try {
reinterpret_cast<const faiss::IndexBinary*>(index)->search(
n, x, k, distances, labels);
}
CATCH_AND_HANDLE
}
int faiss_IndexBinary_range_search(
const FaissIndexBinary* index,
idx_t n,
const uint8_t* x,
int radius,
FaissRangeSearchResult* result) {
try {
reinterpret_cast<const faiss::IndexBinary*>(index)->range_search(
n,
x,
radius,
reinterpret_cast<faiss::RangeSearchResult*>(result));
}
CATCH_AND_HANDLE
}
int faiss_IndexBinary_assign(
FaissIndexBinary* index,
idx_t n,
const uint8_t* x,
idx_t* labels,
idx_t k) {
try {
reinterpret_cast<faiss::IndexBinary*>(index)->assign(n, x, labels, k);
}
CATCH_AND_HANDLE
}
int faiss_IndexBinary_reset(FaissIndexBinary* index) {
try {
reinterpret_cast<faiss::IndexBinary*>(index)->reset();
}
CATCH_AND_HANDLE
}
int faiss_IndexBinary_remove_ids(
FaissIndexBinary* index,
const FaissIDSelector* sel,
size_t* n_removed) {
try {
size_t n{reinterpret_cast<faiss::IndexBinary*>(index)->remove_ids(
*reinterpret_cast<const faiss::IDSelector*>(sel))};
if (n_removed) {
*n_removed = n;
}
}
CATCH_AND_HANDLE
}
int faiss_IndexBinary_reconstruct(
const FaissIndexBinary* index,
idx_t key,
uint8_t* recons) {
try {
reinterpret_cast<const faiss::IndexBinary*>(index)->reconstruct(
key, recons);
}
CATCH_AND_HANDLE
}
int faiss_IndexBinary_reconstruct_n(
const FaissIndexBinary* index,
idx_t i0,
idx_t ni,
uint8_t* recons) {
try {
reinterpret_cast<const faiss::IndexBinary*>(index)->reconstruct_n(
i0, ni, recons);
}
CATCH_AND_HANDLE
}
}

View File

@@ -0,0 +1,169 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c -*-
#ifndef FAISS_INDEX_BINARY_C_H
#define FAISS_INDEX_BINARY_C_H
#include <stddef.h>
#include "Index_c.h"
#include "faiss_c.h"
#ifdef __cplusplus
extern "C" {
#endif
// forward declaration required here
FAISS_DECLARE_CLASS(RangeSearchResult)
// typedef struct FaissRangeSearchResult_H FaissRangeSearchResult;
typedef struct FaissIDSelector_H FaissIDSelector;
/// Opaque type for referencing to a binary index object
FAISS_DECLARE_CLASS(IndexBinary)
FAISS_DECLARE_DESTRUCTOR(IndexBinary)
/// Getter for d
FAISS_DECLARE_GETTER(IndexBinary, int, d)
/// Getter for is_trained
FAISS_DECLARE_GETTER(IndexBinary, int, is_trained)
/// Getter for ntotal
FAISS_DECLARE_GETTER(IndexBinary, idx_t, ntotal)
/// Getter for metric_type
FAISS_DECLARE_GETTER(IndexBinary, FaissMetricType, metric_type)
FAISS_DECLARE_GETTER_SETTER(IndexBinary, int, verbose)
/** Perform training on a representative set of vectors
*
* @param index opaque pointer to index object
* @param n nb of training vectors
* @param x training vectors, size n * d
*/
int faiss_IndexBinary_train(FaissIndexBinary* index, idx_t n, const uint8_t* x);
/** Add n vectors of dimension d to the index.
*
* Vectors are implicitly assigned labels ntotal .. ntotal + n - 1
* This function slices the input vectors in chunks smaller than
* blocksize_add and calls add_core.
* @param index opaque pointer to index object
* @param x input matrix, size n * d
*/
int faiss_IndexBinary_add(FaissIndexBinary* index, idx_t n, const uint8_t* x);
/** Same as add, but stores xids instead of sequential ids.
*
* The default implementation fails with an assertion, as it is
* not supported by all indexes.
*
* @param index opaque pointer to index object
* @param xids if non-null, ids to store for the vectors (size n)
*/
int faiss_IndexBinary_add_with_ids(
FaissIndexBinary* index,
idx_t n,
const uint8_t* x,
const idx_t* xids);
/** query n vectors of dimension d to the index.
*
* return at most k vectors. If there are not enough results for a
* query, the result array is padded with -1s.
*
* @param index opaque pointer to index object
* @param x input vectors to search, size n * d
* @param labels output labels of the NNs, size n*k
* @param distances output pairwise distances, size n*k
*/
int faiss_IndexBinary_search(
const FaissIndexBinary* index,
idx_t n,
const uint8_t* x,
idx_t k,
int32_t* distances,
idx_t* labels);
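/* Illustrative call (editor's sketch; assumes an already-built
 * FaissIndexBinary* index whose vectors are d bits, i.e. d/8 bytes, and a
 * query code `xq` of the same width):
 *
 *   enum { K = 4 };
 *   int32_t distances[K];  // Hamming distances
 *   idx_t labels[K];       // padded with -1 when fewer than K hits exist
 *   int c = faiss_IndexBinary_search(index, 1, xq, K, distances, labels);
 */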
/** query n vectors of dimension d to the index.
*
* return all vectors with distance < radius. Note that many
* indexes do not implement the range_search (only the k-NN search
* is mandatory).
*
* @param index opaque pointer to index object
* @param x input vectors to search, size n * d
* @param radius search radius
* @param result result table
*/
int faiss_IndexBinary_range_search(
const FaissIndexBinary* index,
idx_t n,
const uint8_t* x,
int radius,
FaissRangeSearchResult* result);
/** return the indexes of the k vectors closest to the query x.
*
 * This function is identical to search but only returns the labels of the neighbors.
* @param index opaque pointer to index object
* @param x input vectors to search, size n * d
* @param labels output labels of the NNs, size n*k
*/
int faiss_IndexBinary_assign(
FaissIndexBinary* index,
idx_t n,
const uint8_t* x,
idx_t* labels,
idx_t k);
/** removes all elements from the database.
* @param index opaque pointer to index object
*/
int faiss_IndexBinary_reset(FaissIndexBinary* index);
/** removes IDs from the index. Not supported by all indexes
* @param index opaque pointer to index object
 * @param n_removed output for the number of IDs removed
*/
int faiss_IndexBinary_remove_ids(
FaissIndexBinary* index,
const FaissIDSelector* sel,
size_t* n_removed);
/** Reconstruct a stored vector (or an approximation if lossy coding)
*
* this function may not be defined for some indexes
* @param index opaque pointer to index object
* @param key id of the vector to reconstruct
* @param recons reconstructed vector (size d)
*/
int faiss_IndexBinary_reconstruct(
const FaissIndexBinary* index,
idx_t key,
uint8_t* recons);
/** Reconstruct vectors i0 to i0 + ni - 1
*
* this function may not be defined for some indexes
* @param index opaque pointer to index object
* @param recons reconstructed vector (size ni * d)
*/
int faiss_IndexBinary_reconstruct_n(
const FaissIndexBinary* index,
idx_t i0,
idx_t ni,
uint8_t* recons);
#ifdef __cplusplus
}
#endif
#endif

View File

@@ -0,0 +1,165 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#include "IndexFlat_c.h"
#include <faiss/IndexFlat.h>
#include <faiss/IndexRefine.h>
#include "macros_impl.h"
extern "C" {
using faiss::Index;
using faiss::IndexFlat;
using faiss::IndexFlat1D;
using faiss::IndexFlatIP;
using faiss::IndexFlatL2;
using faiss::IndexRefineFlat;
DEFINE_DESTRUCTOR(IndexFlat)
DEFINE_INDEX_DOWNCAST(IndexFlat)
int faiss_IndexFlat_new(FaissIndexFlat** p_index) {
try {
*p_index = reinterpret_cast<FaissIndexFlat*>(new IndexFlat());
return 0;
}
CATCH_AND_HANDLE
}
int faiss_IndexFlat_new_with(
FaissIndexFlat** p_index,
idx_t d,
FaissMetricType metric) {
try {
IndexFlat* index =
new IndexFlat(d, static_cast<faiss::MetricType>(metric));
*p_index = reinterpret_cast<FaissIndexFlat*>(index);
return 0;
}
CATCH_AND_HANDLE
}
void faiss_IndexFlat_xb(FaissIndexFlat* index, float** p_xb, size_t* p_size) {
IndexFlat* indexf = reinterpret_cast<IndexFlat*>(index);
*p_xb = indexf->get_xb();
if (p_size) {
*p_size = indexf->codes.size() / sizeof(float);
}
}
int faiss_IndexFlat_compute_distance_subset(
FaissIndex* index,
idx_t n,
const float* x,
idx_t k,
float* distances,
const idx_t* labels) {
try {
reinterpret_cast<IndexFlat*>(index)->compute_distance_subset(
n, x, k, distances, labels);
return 0;
}
CATCH_AND_HANDLE
}
DEFINE_DESTRUCTOR(IndexFlatIP)
DEFINE_INDEX_DOWNCAST(IndexFlatIP)
int faiss_IndexFlatIP_new(FaissIndexFlatIP** p_index) {
try {
IndexFlatIP* index = new IndexFlatIP();
*p_index = reinterpret_cast<FaissIndexFlatIP*>(index);
return 0;
}
CATCH_AND_HANDLE
}
int faiss_IndexFlatIP_new_with(FaissIndexFlatIP** p_index, idx_t d) {
try {
IndexFlatIP* index = new IndexFlatIP(d);
*p_index = reinterpret_cast<FaissIndexFlatIP*>(index);
return 0;
}
CATCH_AND_HANDLE
}
DEFINE_DESTRUCTOR(IndexFlatL2)
DEFINE_INDEX_DOWNCAST(IndexFlatL2)
int faiss_IndexFlatL2_new(FaissIndexFlatL2** p_index) {
try {
IndexFlatL2* index = new IndexFlatL2();
*p_index = reinterpret_cast<FaissIndexFlatL2*>(index);
return 0;
}
CATCH_AND_HANDLE
}
int faiss_IndexFlatL2_new_with(FaissIndexFlatL2** p_index, idx_t d) {
try {
IndexFlatL2* index = new IndexFlatL2(d);
*p_index = reinterpret_cast<FaissIndexFlatL2*>(index);
return 0;
}
CATCH_AND_HANDLE
}
int faiss_IndexRefineFlat_new(
FaissIndexRefineFlat** p_index,
FaissIndex* base_index) {
try {
IndexRefineFlat* index = new IndexRefineFlat(
reinterpret_cast<faiss::Index*>(base_index));
*p_index = reinterpret_cast<FaissIndexRefineFlat*>(index);
return 0;
}
CATCH_AND_HANDLE
}
DEFINE_DESTRUCTOR(IndexRefineFlat)
DEFINE_INDEX_DOWNCAST(IndexRefineFlat)
DEFINE_GETTER(IndexRefineFlat, int, own_fields)
DEFINE_SETTER(IndexRefineFlat, int, own_fields)
DEFINE_GETTER(IndexRefineFlat, float, k_factor)
DEFINE_SETTER(IndexRefineFlat, float, k_factor)
DEFINE_DESTRUCTOR(IndexFlat1D)
DEFINE_INDEX_DOWNCAST(IndexFlat1D)
int faiss_IndexFlat1D_new(FaissIndexFlat1D** p_index) {
try {
IndexFlat1D* index = new IndexFlat1D();
*p_index = reinterpret_cast<FaissIndexFlat1D*>(index);
return 0;
}
CATCH_AND_HANDLE
}
int faiss_IndexFlat1D_new_with(
FaissIndexFlat1D** p_index,
int continuous_update) {
try {
IndexFlat1D* index =
new IndexFlat1D(static_cast<bool>(continuous_update));
*p_index = reinterpret_cast<FaissIndexFlat1D*>(index);
return 0;
}
CATCH_AND_HANDLE
}
int faiss_IndexFlat1D_update_permutation(FaissIndexFlat1D* index) {
try {
reinterpret_cast<IndexFlat1D*>(index)->update_permutation();
return 0;
}
CATCH_AND_HANDLE
}
}

View File

@@ -0,0 +1,129 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c -*-
#ifndef FAISS_INDEX_FLAT_C_H
#define FAISS_INDEX_FLAT_C_H
#include "Index_c.h"
#include "faiss_c.h"
#ifdef __cplusplus
extern "C" {
#endif
// forward declaration
typedef enum FaissMetricType FaissMetricType;
/** Opaque type for IndexFlat */
FAISS_DECLARE_CLASS_INHERITED(IndexFlat, Index)
int faiss_IndexFlat_new(FaissIndexFlat** p_index);
int faiss_IndexFlat_new_with(
FaissIndexFlat** p_index,
idx_t d,
FaissMetricType metric);
/** get a pointer to the index's internal data (the `xb` field). The outputs
* become invalid after any data addition or removal operation.
*
* @param index opaque pointer to index object
* @param p_xb output, the pointer to the beginning of `xb`.
 * @param p_size output, the current size of `xb` in number of float values.
*/
void faiss_IndexFlat_xb(FaissIndexFlat* index, float** p_xb, size_t* p_size);
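/* Editor's sketch of reading the raw storage (assumes a populated
 * FaissIndexFlat* index of dimension d):
 *
 *   float* xb = NULL;
 *   size_t nfloats = 0;
 *   faiss_IndexFlat_xb(index, &xb, &nfloats);  // nfloats == ntotal * d
 *   float first_component = xb[0];
 *   // NOTE: xb is invalidated by any later add or remove on the index
 */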
/** attempt a dynamic cast to a flat index, thus checking
 * whether the underlying index type is `IndexFlat`.
*
* @param index opaque pointer to index object
* @return the same pointer if the index is a flat index, NULL otherwise
*/
FAISS_DECLARE_INDEX_DOWNCAST(IndexFlat)
FAISS_DECLARE_DESTRUCTOR(IndexFlat)
/** compute distance with a subset of vectors
*
* @param index opaque pointer to index object
* @param x query vectors, size n * d
* @param labels indices of the vectors that should be compared
* for each query vector, size n * k
* @param distances
* corresponding output distances, size n * k
*/
int faiss_IndexFlat_compute_distance_subset(
FaissIndex* index,
idx_t n,
const float* x,
idx_t k,
float* distances,
const idx_t* labels);
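/* Editor's sketch (assumes a flat index held as FaissIndex* and a query
 * buffer `x` of size n * d). For each of the n queries, pass k candidate ids
 * in `labels` and read back the corresponding distances:
 *
 *   idx_t labels[2] = {12, 34};  // n = 1 query, k = 2 candidates
 *   float distances[2];
 *   faiss_IndexFlat_compute_distance_subset(index, 1, x, 2, distances, labels);
 */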
/** Opaque type for IndexFlatIP */
FAISS_DECLARE_CLASS_INHERITED(IndexFlatIP, Index)
FAISS_DECLARE_INDEX_DOWNCAST(IndexFlatIP)
FAISS_DECLARE_DESTRUCTOR(IndexFlatIP)
int faiss_IndexFlatIP_new(FaissIndexFlatIP** p_index);
int faiss_IndexFlatIP_new_with(FaissIndexFlatIP** p_index, idx_t d);
/** Opaque type for IndexFlatL2 */
FAISS_DECLARE_CLASS_INHERITED(IndexFlatL2, Index)
FAISS_DECLARE_INDEX_DOWNCAST(IndexFlatL2)
FAISS_DECLARE_DESTRUCTOR(IndexFlatL2)
int faiss_IndexFlatL2_new(FaissIndexFlatL2** p_index);
int faiss_IndexFlatL2_new_with(FaissIndexFlatL2** p_index, idx_t d);
/** Opaque type for IndexRefineFlat
*
* Index that queries in a base_index (a fast one) and refines the
* results with an exact search, hopefully improving the results.
*/
FAISS_DECLARE_CLASS_INHERITED(IndexRefineFlat, Index)
int faiss_IndexRefineFlat_new(
FaissIndexRefineFlat** p_index,
FaissIndex* base_index);
FAISS_DECLARE_DESTRUCTOR(IndexRefineFlat)
FAISS_DECLARE_INDEX_DOWNCAST(IndexRefineFlat)
FAISS_DECLARE_GETTER_SETTER(IndexRefineFlat, int, own_fields)
/// factor between k requested in search and the k requested from
/// the base_index (should be >= 1)
FAISS_DECLARE_GETTER_SETTER(IndexRefineFlat, float, k_factor)
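/* Editor's sketch of the refinement wrapper (assumes `base_index` is an
 * already-trained FaissIndex*; the setter name follows the
 * FAISS_DECLARE_GETTER_SETTER convention and error codes are elided):
 *
 *   FaissIndexRefineFlat* refine = NULL;
 *   faiss_IndexRefineFlat_new(&refine, base_index);
 *   faiss_IndexRefineFlat_set_k_factor(refine, 4.0f);  // re-rank 4*k candidates
 *   // searching through (FaissIndex*)refine refines base results exactly
 */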
/** Opaque type for IndexFlat1D
*
* optimized version for 1D "vectors"
*/
FAISS_DECLARE_CLASS_INHERITED(IndexFlat1D, Index)
FAISS_DECLARE_INDEX_DOWNCAST(IndexFlat1D)
FAISS_DECLARE_DESTRUCTOR(IndexFlat1D)
int faiss_IndexFlat1D_new(FaissIndexFlat1D** p_index);
int faiss_IndexFlat1D_new_with(
FaissIndexFlat1D** p_index,
int continuous_update);
int faiss_IndexFlat1D_update_permutation(FaissIndexFlat1D* index);
#ifdef __cplusplus
}
#endif
#endif

View File

@@ -0,0 +1,100 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#include "IndexIVFFlat_c.h"
#include <faiss/IndexIVFFlat.h>
#include "Clustering_c.h"
#include "Index_c.h"
#include "macros_impl.h"
using faiss::Index;
using faiss::IndexIVFFlat;
using faiss::MetricType;
DEFINE_DESTRUCTOR(IndexIVFFlat)
DEFINE_INDEX_DOWNCAST(IndexIVFFlat)
/// number of possible key values
DEFINE_GETTER(IndexIVFFlat, size_t, nlist)
/// number of probes at query time
DEFINE_GETTER(IndexIVFFlat, size_t, nprobe)
DEFINE_SETTER(IndexIVFFlat, size_t, nprobe)
/// quantizer that maps vectors to inverted lists
DEFINE_GETTER_PERMISSIVE(IndexIVFFlat, FaissIndex*, quantizer)
/**
* = 0: use the quantizer as index in a kmeans training
* = 1: just pass on the training set to the train() of the quantizer
* = 2: kmeans training on a flat index + add the centroids to the quantizer
*/
DEFINE_GETTER(IndexIVFFlat, char, quantizer_trains_alone)
/// whether object owns the quantizer
DEFINE_GETTER(IndexIVFFlat, int, own_fields)
DEFINE_SETTER(IndexIVFFlat, int, own_fields)
int faiss_IndexIVFFlat_new(FaissIndexIVFFlat** p_index) {
try {
*p_index = reinterpret_cast<FaissIndexIVFFlat*>(new IndexIVFFlat());
}
CATCH_AND_HANDLE
}
int faiss_IndexIVFFlat_new_with(
FaissIndexIVFFlat** p_index,
FaissIndex* quantizer,
size_t d,
size_t nlist) {
try {
auto q = reinterpret_cast<Index*>(quantizer);
*p_index = reinterpret_cast<FaissIndexIVFFlat*>(
new IndexIVFFlat(q, d, nlist));
}
CATCH_AND_HANDLE
}
int faiss_IndexIVFFlat_new_with_metric(
FaissIndexIVFFlat** p_index,
FaissIndex* quantizer,
size_t d,
size_t nlist,
FaissMetricType metric) {
try {
auto q = reinterpret_cast<Index*>(quantizer);
auto m = static_cast<MetricType>(metric);
*p_index = reinterpret_cast<FaissIndexIVFFlat*>(
new IndexIVFFlat(q, d, nlist, m));
}
CATCH_AND_HANDLE
}
int faiss_IndexIVFFlat_add_core(
FaissIndexIVFFlat* index,
idx_t n,
const float* x,
const idx_t* xids,
const int64_t* precomputed_idx) {
try {
reinterpret_cast<IndexIVFFlat*>(index)->add_core(
n, x, xids, precomputed_idx);
}
CATCH_AND_HANDLE
}
int faiss_IndexIVFFlat_update_vectors(
FaissIndexIVFFlat* index,
int nv,
idx_t* idx,
const float* v) {
try {
reinterpret_cast<IndexIVFFlat*>(index)->update_vectors(nv, idx, v);
}
CATCH_AND_HANDLE
}

Some files were not shown because too many files have changed in this diff.