From 9a5c197acdfbbf9fe5d5099345affe74f866c113 Mon Sep 17 00:00:00 2001
From: Andy Lee
Date: Sun, 27 Jul 2025 20:21:05 -0700
Subject: [PATCH] fix: auto-detect normalized embeddings and use cosine distance

- Add automatic detection for normalized embedding models (OpenAI, Voyage AI, Cohere)
- Automatically set distance_metric='cosine' for normalized embeddings
- Add warnings when using non-optimal distance metrics
- Implement manual L2 normalization in HNSW backend (custom Faiss build lacks normalize_L2)
- Fix DiskANN zmq_port compatibility with lazy loading strategy
- Add documentation for normalized embeddings feature

This fixes the low accuracy issue when using the OpenAI text-embedding-3-small model with the default MIPS metric.
---
 docs/normalized_embeddings.md                 | 75 +++++++++++++++++++
 examples/main_cli_example.py                  | 24 +++++-
 .../leann_backend_diskann/diskann_backend.py  | 59 +++++++++++----
 .../leann_backend_hnsw/hnsw_backend.py        | 10 ++-
 packages/leann-core/src/leann/api.py          | 71 ++++++++++++++++++
 uv.lock                                       | 10 +--
 6 files changed, 223 insertions(+), 26 deletions(-)
 create mode 100644 docs/normalized_embeddings.md

diff --git a/docs/normalized_embeddings.md b/docs/normalized_embeddings.md
new file mode 100644
index 0000000..d6f285e
--- /dev/null
+++ b/docs/normalized_embeddings.md
@@ -0,0 +1,75 @@
+# Normalized Embeddings Support in LEANN
+
+LEANN now automatically detects normalized embedding models and sets the appropriate distance metric for optimal performance.
+
+## What are Normalized Embeddings?
+
+Normalized embeddings are vectors with L2 norm = 1 (unit vectors). These embeddings are optimized for cosine similarity rather than Maximum Inner Product Search (MIPS).
+
+## Automatic Detection
+
+When you create a `LeannBuilder` instance with a normalized embedding model, LEANN will:
+
+1. **Automatically set `distance_metric="cosine"`** if not specified
+2. **Show a warning** if you manually specify a different distance metric
+3. **Provide optimal search performance** with the correct metric
+
+## Supported Normalized Embedding Models
+
+### OpenAI
+All OpenAI text embedding models are normalized:
+- `text-embedding-ada-002`
+- `text-embedding-3-small`
+- `text-embedding-3-large`
+
+### Voyage AI
+All Voyage AI embedding models are normalized:
+- `voyage-2`
+- `voyage-3`
+- `voyage-large-2`
+- `voyage-multilingual-2`
+- `voyage-code-2`
+
+### Cohere
+All Cohere embedding models are normalized:
+- `embed-english-v3.0`
+- `embed-multilingual-v3.0`
+- `embed-english-light-v3.0`
+- `embed-multilingual-light-v3.0`
+
+## Example Usage
+
+```python
+from leann.api import LeannBuilder
+
+# Automatic detection - will use cosine distance
+builder = LeannBuilder(
+    backend_name="hnsw",
+    embedding_model="text-embedding-3-small",
+    embedding_mode="openai"
+)
+# Warning: Detected normalized embeddings model 'text-embedding-3-small'...
+# Automatically setting distance_metric='cosine'
+
+# Manual override (not recommended)
+builder = LeannBuilder(
+    backend_name="hnsw",
+    embedding_model="text-embedding-3-small",
+    embedding_mode="openai",
+    distance_metric="mips"  # Will show warning
+)
+# Warning: Using 'mips' distance metric with normalized embeddings...
+```
+
+## Non-Normalized Embeddings
+
+Models like `facebook/contriever` and other sentence-transformers models that are not normalized will continue to use MIPS by default, which is optimal for them.
+
+## Why This Matters
+
+Using the wrong distance metric with normalized embeddings can lead to:
+- **Poor search quality** due to HNSW's early termination with narrow score ranges
+- **Incorrect ranking** of search results
+- **Suboptimal performance** compared to using the correct metric
+
+For more details on why this happens, see our analysis of [OpenAI embeddings with MIPS](../examples/main_cli_example.py).
\ No newline at end of file
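As a side note to the new doc: a minimal numpy-only sketch of the property it leans on, namely that for unit-norm vectors cosine similarity and the raw inner product give identical scores. The `embs` array here is synthetic and simply stands in for output from any of the models listed above.

```python
import numpy as np

rng = np.random.default_rng(0)
embs = rng.normal(size=(4, 8))                        # stand-in for model output
embs /= np.linalg.norm(embs, axis=1, keepdims=True)   # L2-normalize each row

query = embs[0]
cosine = (embs @ query) / (np.linalg.norm(embs, axis=1) * np.linalg.norm(query))
inner = embs @ query

print(np.allclose(np.linalg.norm(embs, axis=1), 1.0))  # True: unit vectors
print(np.allclose(cosine, inner))                       # True: identical scores
```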
diff --git a/examples/main_cli_example.py b/examples/main_cli_example.py
index adf0261..502821c 100644
--- a/examples/main_cli_example.py
+++ b/examples/main_cli_example.py
@@ -30,17 +30,22 @@ async def main(args):
         all_texts = []
         for doc in documents:
             nodes = node_parser.get_nodes_from_documents([doc])
-            for node in nodes:
-                all_texts.append(node.get_content())
+            if nodes:
+                all_texts.extend(node.get_content() for node in nodes)
 
         print("--- Index directory not found, building new index ---")
 
         print("\n[PHASE 1] Building Leann index...")
+        # LeannBuilder now automatically detects normalized embeddings and sets appropriate distance metric
+        print(f"Using {args.embedding_model} with {args.embedding_mode} mode")
+
+        # Use HNSW backend for better macOS compatibility
         builder = LeannBuilder(
             backend_name="hnsw",
-            embedding_model="facebook/contriever",
+            embedding_model=args.embedding_model,
+            embedding_mode=args.embedding_mode,
+            # distance_metric is automatically set based on embedding model
             graph_degree=32,
             complexity=64,
             is_compact=True,
@@ -89,6 +94,19 @@ if __name__ == "__main__":
         default="Qwen/Qwen3-0.6B",
         help="The model name to use (e.g., 'llama3:8b' for ollama, 'deepseek-ai/deepseek-llm-7b-chat' for hf, 'gpt-4o' for openai).",
     )
+    parser.add_argument(
+        "--embedding-model",
+        type=str,
+        default="facebook/contriever",
+        help="The embedding model to use (e.g., 'facebook/contriever', 'text-embedding-3-small').",
+    )
+    parser.add_argument(
+        "--embedding-mode",
+        type=str,
+        default="sentence-transformers",
+        choices=["sentence-transformers", "openai", "mlx"],
+        help="The embedding backend mode.",
+    )
     parser.add_argument(
         "--host",
         type=str,
diff --git a/packages/leann-backend-diskann/leann_backend_diskann/diskann_backend.py b/packages/leann-backend-diskann/leann_backend_diskann/diskann_backend.py
index 7ca2810..b73f36f 100644
--- a/packages/leann-backend-diskann/leann_backend_diskann/diskann_backend.py
+++ b/packages/leann-backend-diskann/leann_backend_diskann/diskann_backend.py
@@ -163,18 +163,44 @@ class DiskannSearcher(BaseSearcher):
 
         self.num_threads = kwargs.get("num_threads", 8)
 
-        fake_zmq_port = 6666
+        # For DiskANN, we need to reinitialize the index when zmq_port changes
+        # Store the initialization parameters for later use
         full_index_prefix = str(self.index_dir / self.index_path.stem)
-        self._index = diskannpy.StaticDiskFloatIndex(
-            metric_enum,
-            full_index_prefix,
-            self.num_threads,
-            kwargs.get("num_nodes_to_cache", 0),
-            1,
-            fake_zmq_port,  # Initial port, can be updated at runtime
-            "",
-            "",
-        )
+        self._init_params = {
+            "metric_enum": metric_enum,
+            "full_index_prefix": full_index_prefix,
+            "num_threads": self.num_threads,
+            "num_nodes_to_cache": kwargs.get("num_nodes_to_cache", 0),
+            "cache_mechanism": 1,
+            "pq_prefix": "",
+            "partition_prefix": "",
+        }
+        self._diskannpy = diskannpy
+        self._current_zmq_port = None
+        self._index = None
+        logger.debug("DiskANN searcher initialized (index will be loaded on first search)")
+
+    def _ensure_index_loaded(self, zmq_port: int):
+        """Ensure the index is loaded with the correct zmq_port."""
+        if self._index is None or self._current_zmq_port != zmq_port:
+            # Need to (re)load the index with the correct zmq_port
+            with suppress_cpp_output_if_needed():
+                if self._index is not None:
+                    logger.debug(f"Reloading DiskANN index with new zmq_port: {zmq_port}")
+                else:
+                    logger.debug(f"Loading DiskANN index with zmq_port: {zmq_port}")
+
+                self._index = self._diskannpy.StaticDiskFloatIndex(
+                    self._init_params["metric_enum"],
+                    self._init_params["full_index_prefix"],
+                    self._init_params["num_threads"],
+                    self._init_params["num_nodes_to_cache"],
+                    self._init_params["cache_mechanism"],
+                    zmq_port,
+                    self._init_params["pq_prefix"],
+                    self._init_params["partition_prefix"],
+                )
+                self._current_zmq_port = zmq_port
 
     def search(
         self,
@@ -212,14 +238,15 @@
         Returns:
             Dict with 'labels' (list of lists) and 'distances' (ndarray)
         """
-        # Handle zmq_port compatibility: DiskANN can now update port at runtime
+        # Handle zmq_port compatibility: Ensure index is loaded with correct port
         if recompute_embeddings:
             if zmq_port is None:
                 raise ValueError("zmq_port must be provided if recompute_embeddings is True")
-            current_port = self._index.get_zmq_port()
-            if zmq_port != current_port:
-                logger.debug(f"Updating DiskANN zmq_port from {current_port} to {zmq_port}")
-                self._index.set_zmq_port(zmq_port)
+            self._ensure_index_loaded(zmq_port)
+        else:
+            # If not recomputing, we still need an index, use a default port
+            if self._index is None:
+                self._ensure_index_loaded(6666)  # Default port when not recomputing
 
         # DiskANN doesn't support "proportional" strategy
         if pruning_strategy == "proportional":
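The DiskANN change above swaps an eager constructor call for a lazy one: the constructor arguments are stashed, and the heavy index object is only built (or rebuilt) when a search arrives with a different zmq_port. A toy sketch of that pattern, with `FakeIndex` standing in for `diskannpy.StaticDiskFloatIndex` and all names illustrative:

```python
class FakeIndex:
    """Stand-in for the expensive diskannpy.StaticDiskFloatIndex object."""

    def __init__(self, prefix: str, zmq_port: int):
        self.zmq_port = zmq_port
        print(f"loaded index {prefix!r} on port {zmq_port}")


class LazySearcher:
    def __init__(self, prefix: str):
        self._init_params = {"prefix": prefix}  # stored now, used later
        self._index = None
        self._current_zmq_port = None

    def _ensure_index_loaded(self, zmq_port: int) -> None:
        # (Re)load only when there is no index yet or the port changed.
        if self._index is None or self._current_zmq_port != zmq_port:
            self._index = FakeIndex(self._init_params["prefix"], zmq_port)
            self._current_zmq_port = zmq_port

    def search(self, zmq_port: int) -> int:
        self._ensure_index_loaded(zmq_port)
        return self._index.zmq_port


s = LazySearcher("demo_index")
s.search(6666)  # first call loads the index
s.search(6666)  # same port: reuses the loaded index
s.search(7777)  # new port: reloads with the new port
```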
diff --git a/packages/leann-backend-hnsw/leann_backend_hnsw/hnsw_backend.py b/packages/leann-backend-hnsw/leann_backend_hnsw/hnsw_backend.py
index a6bd852..e1afb36 100644
--- a/packages/leann-backend-hnsw/leann_backend_hnsw/hnsw_backend.py
+++ b/packages/leann-backend-hnsw/leann_backend_hnsw/hnsw_backend.py
@@ -28,6 +28,12 @@ def get_metric_map():
     }
 
 
+def normalize_l2(data: np.ndarray) -> np.ndarray:
+    norms = np.linalg.norm(data, axis=1, keepdims=True)
+    norms[norms == 0] = 1  # Avoid division by zero
+    return data / norms
+
+
 @register_backend("hnsw")
 class HNSWBackend(LeannBackendFactoryInterface):
     @staticmethod
@@ -76,7 +82,7 @@ class HNSWBuilder(LeannBackendBuilderInterface):
         index.hnsw.efConstruction = self.efConstruction
 
         if self.distance_metric.lower() == "cosine":
-            faiss.normalize_L2(data)
+            data = normalize_l2(data)
 
         index.add(data.shape[0], faiss.swig_ptr(data))
         index_file = index_dir / f"{index_prefix}.index"
@@ -186,7 +192,7 @@ class HNSWSearcher(BaseSearcher):
         if query.dtype != np.float32:
             query = query.astype(np.float32)
         if self.distance_metric == "cosine":
-            faiss.normalize_L2(query)
+            query = normalize_l2(query)
 
         params = faiss.SearchParametersHNSW()
         if zmq_port is not None:
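A quick behavior check for the manual helper above (per the commit message, the custom Faiss build lacks `faiss.normalize_L2`): rows come back unit-length, and an all-zero row passes through unchanged instead of turning into NaNs. The function body is copied from the hunk; the test array is made up.

```python
import numpy as np


def normalize_l2(data: np.ndarray) -> np.ndarray:
    norms = np.linalg.norm(data, axis=1, keepdims=True)
    norms[norms == 0] = 1  # avoid division by zero
    return data / norms


x = np.array([[3.0, 4.0], [0.0, 0.0]], dtype=np.float32)
out = normalize_l2(x)
print(out)                          # [[0.6 0.8] [0.  0. ]]
print(np.linalg.norm(out, axis=1))  # [1. 0.]
```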
"text-embedding-3-small"), + ("openai", "text-embedding-3-large"), + # Voyage AI models + ("voyage", "voyage-2"), + ("voyage", "voyage-3"), + ("voyage", "voyage-large-2"), + ("voyage", "voyage-multilingual-2"), + ("voyage", "voyage-code-2"), + # Cohere models + ("cohere", "embed-english-v3.0"), + ("cohere", "embed-multilingual-v3.0"), + ("cohere", "embed-english-light-v3.0"), + ("cohere", "embed-multilingual-light-v3.0"), + } + + # Also check for patterns in model names + is_normalized = False + current_model_lower = embedding_model.lower() + current_mode_lower = embedding_mode.lower() + + # Check exact matches + for mode, model in normalized_embeddings_models: + if (current_mode_lower == mode and current_model_lower == model) or ( + mode in current_mode_lower and model in current_model_lower + ): + is_normalized = True + break + + # Check patterns + if not is_normalized: + # OpenAI patterns + if "openai" in current_mode_lower or "openai" in current_model_lower: + if any( + pattern in current_model_lower + for pattern in ["text-embedding", "ada", "3-small", "3-large"] + ): + is_normalized = True + # Voyage patterns + elif "voyage" in current_mode_lower or "voyage" in current_model_lower: + is_normalized = True + # Cohere patterns + elif "cohere" in current_mode_lower or "cohere" in current_model_lower: + if "embed" in current_model_lower: + is_normalized = True + + # Handle distance metric + if is_normalized and "distance_metric" not in backend_kwargs: + backend_kwargs["distance_metric"] = "cosine" + warnings.warn( + f"Detected normalized embeddings model '{embedding_model}' with mode '{embedding_mode}'. " + f"Automatically setting distance_metric='cosine' for optimal performance. " + f"Normalized embeddings (L2 norm = 1) should use cosine similarity instead of MIPS.", + UserWarning, + stacklevel=2, + ) + elif is_normalized and backend_kwargs.get("distance_metric", "").lower() != "cosine": + current_metric = backend_kwargs.get("distance_metric", "mips") + warnings.warn( + f"Warning: Using '{current_metric}' distance metric with normalized embeddings model " + f"'{embedding_model}' may lead to suboptimal search results. 
" + f"Consider using 'cosine' distance metric for better performance.", + UserWarning, + stacklevel=2, + ) + self.backend_kwargs = backend_kwargs self.chunks: list[dict[str, Any]] = [] diff --git a/uv.lock b/uv.lock index a46abc5..0a32b65 100644 --- a/uv.lock +++ b/uv.lock @@ -1847,7 +1847,7 @@ wheels = [ [[package]] name = "leann-backend-diskann" -version = "0.1.13" +version = "0.1.14" source = { editable = "packages/leann-backend-diskann" } dependencies = [ { name = "leann-core" }, @@ -1858,14 +1858,14 @@ dependencies = [ [package.metadata] requires-dist = [ - { name = "leann-core", specifier = "==0.1.13" }, + { name = "leann-core", specifier = "==0.1.14" }, { name = "numpy" }, { name = "protobuf", specifier = ">=3.19.0" }, ] [[package]] name = "leann-backend-hnsw" -version = "0.1.13" +version = "0.1.14" source = { editable = "packages/leann-backend-hnsw" } dependencies = [ { name = "leann-core" }, @@ -1877,7 +1877,7 @@ dependencies = [ [package.metadata] requires-dist = [ - { name = "leann-core", specifier = "==0.1.13" }, + { name = "leann-core", specifier = "==0.1.14" }, { name = "msgpack", specifier = ">=1.0.0" }, { name = "numpy" }, { name = "pyzmq", specifier = ">=23.0.0" }, @@ -1885,7 +1885,7 @@ requires-dist = [ [[package]] name = "leann-core" -version = "0.1.13" +version = "0.1.14" source = { editable = "packages/leann-core" } dependencies = [ { name = "accelerate" },