perf: much faster loading and embedding serving
Submodule packages/leann-backend-diskann/third_party/DiskANN updated: af2a26481e...25339b0341
@@ -421,9 +421,9 @@ class LeannSearcher:
         logger.info(f"  Top_k: {top_k}")
         logger.info(f"  Additional kwargs: {kwargs}")
 
-        start_time = time.time()
-
         zmq_port = None
+
+        start_time = time.time()
         if recompute_embeddings:
             zmq_port = self.backend_impl._ensure_server_running(
                 self.meta_path_str,
@@ -431,6 +431,10 @@ class LeannSearcher:
                 **kwargs,
             )
             del expected_zmq_port
+            zmq_time = time.time() - start_time
+            logger.info(f"  Launching server time: {zmq_time} seconds")
 
+        start_time = time.time()
+
         query_embedding = self.backend_impl.compute_query_embedding(
             query,
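The two hunks above split the timing in LeannSearcher.search into separate phases: launching the embedding server is measured on its own, and the query embedding gets a fresh timer afterwards. A minimal sketch of that pattern, with the leann backend calls replaced by hypothetical stand-ins (launch_server and embed_query are illustrative placeholders, not the library API):

import logging
import time

logger = logging.getLogger(__name__)

def launch_server() -> int:
    """Stand-in for backend_impl._ensure_server_running(); returns a fake ZMQ port."""
    time.sleep(0.1)
    return 5555

def embed_query(query: str) -> list[float]:
    """Stand-in for backend_impl.compute_query_embedding()."""
    return [0.0] * 768

def timed_search(query: str, recompute_embeddings: bool = True):
    zmq_port = None
    if recompute_embeddings:
        # Phase 1: server launch is timed by itself.
        start_time = time.time()
        zmq_port = launch_server()
        logger.info(f"  Launching server time: {time.time() - start_time} seconds")

    # Phase 2: the query embedding gets its own timer, so server startup
    # no longer inflates the reported embedding latency.
    start_time = time.time()
    query_embedding = embed_query(query)
    logger.info(f"  Query embedding time: {time.time() - start_time} seconds")
    return zmq_port, query_embedding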
@@ -25,6 +25,8 @@ def compute_embeddings(
     model_name: str,
     mode: str = "sentence-transformers",
     is_build: bool = False,
+    batch_size: int = 32,
+    adaptive_optimization: bool = True,
 ) -> np.ndarray:
     """
     Unified embedding computation entry point
@@ -33,13 +35,20 @@ def compute_embeddings(
         texts: List of texts to compute embeddings for
         model_name: Model name
         mode: Computation mode ('sentence-transformers', 'openai', 'mlx')
+        is_build: Whether this is a build operation (shows progress bar)
+        batch_size: Batch size for processing
+        adaptive_optimization: Whether to use adaptive optimization based on batch size
 
     Returns:
         Normalized embeddings array, shape: (len(texts), embedding_dim)
     """
     if mode == "sentence-transformers":
         return compute_embeddings_sentence_transformers(
-            texts, model_name, is_build=is_build
+            texts,
+            model_name,
+            is_build=is_build,
+            batch_size=batch_size,
+            adaptive_optimization=adaptive_optimization,
         )
     elif mode == "openai":
         return compute_embeddings_openai(texts, model_name)
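For reference, a hedged usage sketch of the extended entry point. The import path, model name, and texts here are illustrative assumptions, not taken from the commit; disabling adaptive_optimization keeps whatever batch_size the caller passes:

import numpy as np
from leann.embedding_compute import compute_embeddings  # module path is an assumption

texts = ["how does leann build its index?", "what is selective recomputation?"]

# Default behaviour: the adaptive path may override batch_size per device.
emb = compute_embeddings(texts, model_name="facebook/contriever", mode="sentence-transformers")

# Pin the batch size explicitly and opt out of the benchmark-derived overrides.
emb_fixed = compute_embeddings(
    texts,
    model_name="facebook/contriever",
    mode="sentence-transformers",
    batch_size=16,
    adaptive_optimization=False,
)
assert emb.shape[0] == len(texts)  # one row per input text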
@@ -56,9 +65,19 @@ def compute_embeddings_sentence_transformers(
     device: str = "auto",
     batch_size: int = 32,
     is_build: bool = False,
+    adaptive_optimization: bool = True,
 ) -> np.ndarray:
     """
-    Compute embeddings using SentenceTransformer with model caching
+    Compute embeddings using SentenceTransformer with model caching and adaptive optimization
+
+    Args:
+        texts: List of texts to compute embeddings for
+        model_name: Model name
+        use_fp16: Whether to use FP16 precision
+        device: Device to use ('auto', 'cuda', 'mps', 'cpu')
+        batch_size: Batch size for processing
+        is_build: Whether this is a build operation (shows progress bar)
+        adaptive_optimization: Whether to use adaptive optimization based on batch size
     """
     # Handle empty input
     if not texts:
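The docstring above refers to model caching: loaded SentenceTransformer instances are kept in a module-level dict keyed by model, device, and precision, so repeated calls skip the expensive load. A minimal sketch of that pattern, under the assumption that _model_cache is the module-level dict the diff mutates (the loader body is illustrative, not the commit's exact loading code):

from sentence_transformers import SentenceTransformer

_model_cache: dict[str, SentenceTransformer] = {}

def _get_cached_model(model_name: str, device: str, use_fp16: bool) -> SentenceTransformer:
    # Key mirrors the diff: one cached instance per (model, device, precision) combination.
    cache_key = f"sentence_transformers_{model_name}_{device}_{use_fp16}_optimized"
    if cache_key not in _model_cache:
        # First call pays the full load cost; later calls reuse the same instance.
        _model_cache[cache_key] = SentenceTransformer(model_name, device=device)
    return _model_cache[cache_key]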
@@ -76,28 +95,68 @@ def compute_embeddings_sentence_transformers(
         else:
             device = "cpu"
 
+    # Apply optimizations based on benchmark results
+    if adaptive_optimization:
+        # Use optimal batch_size constants for different devices based on benchmark results
+        if device == "mps":
+            batch_size = 128  # MPS optimal batch size from benchmark
+            if model_name == "Qwen/Qwen3-Embedding-0.6B":
+                batch_size = 64
+        elif device == "cuda":
+            batch_size = 256  # CUDA optimal batch size
+        # Keep original batch_size for CPU
+
     # Create cache key
-    cache_key = f"sentence_transformers_{model_name}_{device}_{use_fp16}"
+    cache_key = f"sentence_transformers_{model_name}_{device}_{use_fp16}_optimized"
 
     # Check if model is already cached
     if cache_key in _model_cache:
-        logger.info(f"Using cached model: {model_name}")
+        logger.info(f"Using cached optimized model: {model_name}")
         model = _model_cache[cache_key]
     else:
-        logger.info(f"Loading and caching SentenceTransformer model: {model_name}")
+        logger.info(
+            f"Loading and caching optimized SentenceTransformer model: {model_name}"
+        )
         from sentence_transformers import SentenceTransformer
 
         logger.info(f"Using device: {device}")
 
-        # Prepare model and tokenizer optimization parameters
+        # Apply hardware optimizations
+        if device == "cuda":
+            # TODO: Haven't tested this yet
+            torch.backends.cuda.matmul.allow_tf32 = True
+            torch.backends.cudnn.allow_tf32 = True
+            torch.backends.cudnn.benchmark = True
+            torch.backends.cudnn.deterministic = False
+            torch.cuda.set_per_process_memory_fraction(0.9)
+        elif device == "mps":
+            try:
+                if hasattr(torch.mps, "set_per_process_memory_fraction"):
+                    torch.mps.set_per_process_memory_fraction(0.9)
+            except AttributeError:
+                logger.warning(
+                    "Some MPS optimizations not available in this PyTorch version"
+                )
+        elif device == "cpu":
+            # TODO: Haven't tested this yet
+            torch.set_num_threads(min(8, os.cpu_count() or 4))
+            try:
+                torch.backends.mkldnn.enabled = True
+            except AttributeError:
+                pass
+
+        # Prepare optimized model and tokenizer parameters
         model_kwargs = {
             "torch_dtype": torch.float16 if use_fp16 else torch.float32,
             "low_cpu_mem_usage": True,
             "_fast_init": True,
+            "attn_implementation": "eager",  # Use eager attention for speed
         }
 
         tokenizer_kwargs = {
             "use_fast": True,
+            "padding": True,
+            "truncation": True,
        }
 
         try:
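The adaptive-optimization branch above reduces to a device-dependent batch-size table taken from the commit's benchmarks: 128 on MPS (64 for Qwen/Qwen3-Embedding-0.6B), 256 on CUDA, and the caller's value on CPU. As a standalone sketch of just that selection logic (the helper name is hypothetical):

def pick_batch_size(device: str, model_name: str, requested: int = 32, adaptive: bool = True) -> int:
    """Mirror of the batch-size overrides in the diff; the constants come from the commit's benchmarks."""
    if not adaptive:
        return requested
    if device == "mps":
        # Larger embedding models leave less headroom on Apple-silicon GPUs.
        return 64 if model_name == "Qwen/Qwen3-Embedding-0.6B" else 128
    if device == "cuda":
        return 256
    return requested  # CPU keeps the caller-provided batch size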
@@ -128,32 +187,44 @@ def compute_embeddings_sentence_transformers(
             )
             logger.info("Model loaded successfully! (network + optimized)")
 
-            # Apply additional optimizations (if supported)
+            # Apply additional optimizations based on mode
             if use_fp16 and device in ["cuda", "mps"]:
                 try:
                     model = model.half()
-                    model = torch.compile(model)
-                    logger.info(
-                        f"Using FP16 precision and compile optimization: {model_name}"
-                    )
+                    logger.info(f"Applied FP16 precision: {model_name}")
                 except Exception as e:
-                    logger.warning(f"FP16 or compile optimization failed: {e}")
+                    logger.warning(f"FP16 optimization failed: {e}")
 
+            # Apply torch.compile optimization
+            if device in ["cuda", "mps"]:
+                try:
+                    model = torch.compile(model, mode="reduce-overhead", dynamic=True)
+                    logger.info(f"Applied torch.compile optimization: {model_name}")
+                except Exception as e:
+                    logger.warning(f"torch.compile optimization failed: {e}")
+
+            # Set model to eval mode and disable gradients for inference
+            model.eval()
+            for param in model.parameters():
+                param.requires_grad_(False)
+
         # Cache the model
         _model_cache[cache_key] = model
         logger.info(f"Model cached: {cache_key}")
 
-    # Compute embeddings
-    logger.info("Starting embedding computation...")
+    # Compute embeddings with optimized inference mode
+    logger.info(f"Starting embedding computation... (batch_size: {batch_size})")
 
-    embeddings = model.encode(
-        texts,
-        batch_size=batch_size,
-        show_progress_bar=is_build,  # Don't show progress bar in server environment
-        convert_to_numpy=True,
-        normalize_embeddings=False,
-        device=device,
-    )
+    # Use torch.inference_mode for optimal performance
+    with torch.inference_mode():
+        embeddings = model.encode(
+            texts,
+            batch_size=batch_size,
+            show_progress_bar=is_build,  # Don't show progress bar in server environment
+            convert_to_numpy=True,
+            normalize_embeddings=False,
+            device=device,
+        )
 
     logger.info(
         f"Generated {len(embeddings)} embeddings, dimension: {embeddings.shape[1]}"
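The final hunk wraps SentenceTransformer.encode in torch.inference_mode(), which turns off autograd bookkeeping entirely and is slightly cheaper than torch.no_grad() for pure inference. A condensed, runnable sketch of that encode path (the model name is an arbitrary example, and model loading/caching is omitted):

import torch
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device="cpu")  # example model
model.eval()  # inference only, as in the diff

with torch.inference_mode():  # no autograd state is recorded inside this block
    embeddings = model.encode(
        ["hello world"],
        batch_size=32,
        convert_to_numpy=True,
        normalize_embeddings=False,
    )
print(embeddings.shape)  # (1, embedding_dim)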