From d505dcc5e3e65b0caceb2e8244e924eb11975011 Mon Sep 17 00:00:00 2001 From: Andy Lee Date: Mon, 28 Jul 2025 14:35:49 -0700 Subject: [PATCH] Fix/OpenAI embeddings cosine distance (#10) * fix: auto-detect normalized embeddings and use cosine distance - Add automatic detection for normalized embedding models (OpenAI, Voyage AI, Cohere) - Automatically set distance_metric='cosine' for normalized embeddings - Add warnings when using non-optimal distance metrics - Implement manual L2 normalization in HNSW backend (custom Faiss build lacks normalize_L2) - Fix DiskANN zmq_port compatibility with lazy loading strategy - Add documentation for normalized embeddings feature This fixes the low accuracy issue when using OpenAI text-embedding-3-small model with default MIPS metric. * style: format * feat: add OpenAI embeddings support to google_history_reader_leann.py - Add --embedding-model and --embedding-mode arguments - Support automatic detection of normalized embeddings - Works correctly with cosine distance for OpenAI embeddings * feat: add --use-existing-index option to google_history_reader_leann.py - Allow using existing index without rebuilding - Useful for testing pre-built indices * fix: Improve OpenAI embeddings handling in HNSW backend --- examples/google_history_reader_leann.py | 76 ++++++++++++++----- .../leann_backend_hnsw/hnsw_backend.py | 14 +++- .../src/leann/embedding_server_manager.py | 2 + .../leann-core/src/leann/searcher_base.py | 9 ++- 4 files changed, 80 insertions(+), 21 deletions(-) diff --git a/examples/google_history_reader_leann.py b/examples/google_history_reader_leann.py index 82d78b1..62781a4 100644 --- a/examples/google_history_reader_leann.py +++ b/examples/google_history_reader_leann.py @@ -24,6 +24,8 @@ def create_leann_index_from_multiple_chrome_profiles( profile_dirs: list[Path], index_path: str = "chrome_history_index.leann", max_count: int = -1, + embedding_model: str = "facebook/contriever", + embedding_mode: str = "sentence-transformers", ): """ Create LEANN index from multiple Chrome profile data sources. @@ -32,6 +34,8 @@ def create_leann_index_from_multiple_chrome_profiles( profile_dirs: List of Path objects pointing to Chrome profile directories index_path: Path to save the LEANN index max_count: Maximum number of history entries to process per profile + embedding_model: The embedding model to use + embedding_mode: The embedding backend mode """ print("Creating LEANN index from multiple Chrome profile data sources...") @@ -106,9 +110,11 @@ def create_leann_index_from_multiple_chrome_profiles( print("\n[PHASE 1] Building Leann index...") # Use HNSW backend for better macOS compatibility + # LeannBuilder will automatically detect normalized embeddings and set appropriate distance metric builder = LeannBuilder( backend_name="hnsw", - embedding_model="facebook/contriever", + embedding_model=embedding_model, + embedding_mode=embedding_mode, graph_degree=32, complexity=64, is_compact=True, @@ -132,6 +138,8 @@ def create_leann_index( profile_path: str | None = None, index_path: str = "chrome_history_index.leann", max_count: int = 1000, + embedding_model: str = "facebook/contriever", + embedding_mode: str = "sentence-transformers", ): """ Create LEANN index from Chrome history data. @@ -140,6 +148,8 @@ def create_leann_index( profile_path: Path to the Chrome profile directory (optional, uses default if None) index_path: Path to save the LEANN index max_count: Maximum number of history entries to process + embedding_model: The embedding model to use + embedding_mode: The embedding backend mode """ print("Creating LEANN index from Chrome history data...") INDEX_DIR = Path(index_path).parent @@ -187,9 +197,11 @@ def create_leann_index( print("\n[PHASE 1] Building Leann index...") # Use HNSW backend for better macOS compatibility + # LeannBuilder will automatically detect normalized embeddings and set appropriate distance metric builder = LeannBuilder( backend_name="hnsw", - embedding_model="facebook/contriever", + embedding_model=embedding_model, + embedding_mode=embedding_mode, graph_degree=32, complexity=64, is_compact=True, @@ -273,6 +285,24 @@ async def main(): default=True, help="Automatically find all Chrome profiles (default: True)", ) + parser.add_argument( + "--embedding-model", + type=str, + default="facebook/contriever", + help="The embedding model to use (e.g., 'facebook/contriever', 'text-embedding-3-small')", + ) + parser.add_argument( + "--embedding-mode", + type=str, + default="sentence-transformers", + choices=["sentence-transformers", "openai", "mlx"], + help="The embedding backend mode", + ) + parser.add_argument( + "--use-existing-index", + action="store_true", + help="Use existing index without rebuilding", + ) args = parser.parse_args() @@ -283,26 +313,34 @@ async def main(): print(f"Index directory: {INDEX_DIR}") print(f"Max entries: {args.max_entries}") - # Find Chrome profile directories - from history_data.history import ChromeHistoryReader - - if args.auto_find_profiles: - profile_dirs = ChromeHistoryReader.find_chrome_profiles() - if not profile_dirs: - print("No Chrome profiles found automatically. Exiting.") + if args.use_existing_index: + # Use existing index without rebuilding + if not Path(INDEX_PATH).exists(): + print(f"Error: Index file not found at {INDEX_PATH}") return + print(f"Using existing index at {INDEX_PATH}") + index_path = INDEX_PATH else: - # Use single specified profile - profile_path = Path(args.chrome_profile) - if not profile_path.exists(): - print(f"Chrome profile not found: {profile_path}") - return - profile_dirs = [profile_path] + # Find Chrome profile directories + from history_data.history import ChromeHistoryReader - # Create or load the LEANN index from all sources - index_path = create_leann_index_from_multiple_chrome_profiles( - profile_dirs, INDEX_PATH, args.max_entries - ) + if args.auto_find_profiles: + profile_dirs = ChromeHistoryReader.find_chrome_profiles() + if not profile_dirs: + print("No Chrome profiles found automatically. Exiting.") + return + else: + # Use single specified profile + profile_path = Path(args.chrome_profile) + if not profile_path.exists(): + print(f"Chrome profile not found: {profile_path}") + return + profile_dirs = [profile_path] + + # Create or load the LEANN index from all sources + index_path = create_leann_index_from_multiple_chrome_profiles( + profile_dirs, INDEX_PATH, args.max_entries, args.embedding_model, args.embedding_mode + ) if index_path: if args.query: diff --git a/packages/leann-backend-hnsw/leann_backend_hnsw/hnsw_backend.py b/packages/leann-backend-hnsw/leann_backend_hnsw/hnsw_backend.py index e1afb36..d6b87f9 100644 --- a/packages/leann-backend-hnsw/leann_backend_hnsw/hnsw_backend.py +++ b/packages/leann-backend-hnsw/leann_backend_hnsw/hnsw_backend.py @@ -124,7 +124,9 @@ class HNSWSearcher(BaseSearcher): ) from . import faiss # type: ignore - self.distance_metric = self.meta.get("distance_metric", "mips").lower() + self.distance_metric = ( + self.meta.get("backend_kwargs", {}).get("distance_metric", "mips").lower() + ) metric_enum = get_metric_map().get(self.distance_metric) if metric_enum is None: raise ValueError(f"Unsupported distance_metric '{self.distance_metric}'.") @@ -200,6 +202,16 @@ class HNSWSearcher(BaseSearcher): params.efSearch = complexity params.beam_size = beam_width + # For OpenAI embeddings with cosine distance, disable relative distance check + # This prevents early termination when all scores are in a narrow range + embedding_model = self.meta.get("embedding_model", "").lower() + if self.distance_metric == "cosine" and any( + openai_model in embedding_model for openai_model in ["text-embedding", "openai"] + ): + params.check_relative_distance = False + else: + params.check_relative_distance = True + # PQ pruning: direct mapping to HNSW's pq_pruning_ratio params.pq_pruning_ratio = prune_ratio diff --git a/packages/leann-core/src/leann/embedding_server_manager.py b/packages/leann-core/src/leann/embedding_server_manager.py index 3c8a028..5a75ac7 100644 --- a/packages/leann-core/src/leann/embedding_server_manager.py +++ b/packages/leann-core/src/leann/embedding_server_manager.py @@ -293,6 +293,8 @@ class EmbeddingServerManager: command.extend(["--passages-file", str(passages_file)]) if embedding_mode != "sentence-transformers": command.extend(["--embedding-mode", embedding_mode]) + if kwargs.get("distance_metric"): + command.extend(["--distance-metric", kwargs["distance_metric"]]) return command diff --git a/packages/leann-core/src/leann/searcher_base.py b/packages/leann-core/src/leann/searcher_base.py index cc02fb2..02ec430 100644 --- a/packages/leann-core/src/leann/searcher_base.py +++ b/packages/leann-core/src/leann/searcher_base.py @@ -63,12 +63,19 @@ class BaseSearcher(LeannBackendSearcherInterface, ABC): if not self.embedding_model: raise ValueError("Cannot use recompute mode without 'embedding_model' in meta.json.") + # Get distance_metric from meta if not provided in kwargs + distance_metric = ( + kwargs.get("distance_metric") + or self.meta.get("backend_kwargs", {}).get("distance_metric") + or "mips" + ) + server_started, actual_port = self.embedding_server_manager.start_server( port=port, model_name=self.embedding_model, embedding_mode=self.embedding_mode, passages_file=passages_source_file, - distance_metric=kwargs.get("distance_metric"), + distance_metric=distance_metric, enable_warmup=kwargs.get("enable_warmup", False), ) if not server_started: