Fix/OpenAI embeddings cosine distance (#10)

* fix: auto-detect normalized embeddings and use cosine distance

- Add automatic detection for normalized embedding models (OpenAI, Voyage AI, Cohere)
- Automatically set distance_metric='cosine' for normalized embeddings
- Add warnings when using non-optimal distance metrics
- Implement manual L2 normalization in HNSW backend (custom Faiss build lacks normalize_L2)
- Fix DiskANN zmq_port compatibility with lazy loading strategy
- Add documentation for normalized embeddings feature

This fixes the low accuracy issue when using OpenAI text-embedding-3-small model with default MIPS metric.

* style: format

* feat: add OpenAI embeddings support to google_history_reader_leann.py

- Add --embedding-model and --embedding-mode arguments
- Support automatic detection of normalized embeddings
- Works correctly with cosine distance for OpenAI embeddings

* feat: add --use-existing-index option to google_history_reader_leann.py

- Allow using existing index without rebuilding
- Useful for testing pre-built indices

* fix: Improve OpenAI embeddings handling in HNSW backend
This commit is contained in:
Andy Lee
2025-07-28 14:35:49 -07:00
committed by GitHub
parent 261006c36a
commit d505dcc5e3
4 changed files with 80 additions and 21 deletions

View File

@@ -124,7 +124,9 @@ class HNSWSearcher(BaseSearcher):
)
from . import faiss # type: ignore
self.distance_metric = self.meta.get("distance_metric", "mips").lower()
self.distance_metric = (
self.meta.get("backend_kwargs", {}).get("distance_metric", "mips").lower()
)
metric_enum = get_metric_map().get(self.distance_metric)
if metric_enum is None:
raise ValueError(f"Unsupported distance_metric '{self.distance_metric}'.")
@@ -200,6 +202,16 @@ class HNSWSearcher(BaseSearcher):
params.efSearch = complexity
params.beam_size = beam_width
# For OpenAI embeddings with cosine distance, disable relative distance check
# This prevents early termination when all scores are in a narrow range
embedding_model = self.meta.get("embedding_model", "").lower()
if self.distance_metric == "cosine" and any(
openai_model in embedding_model for openai_model in ["text-embedding", "openai"]
):
params.check_relative_distance = False
else:
params.check_relative_distance = True
# PQ pruning: direct mapping to HNSW's pq_pruning_ratio
params.pq_pruning_ratio = prune_ratio

View File

@@ -293,6 +293,8 @@ class EmbeddingServerManager:
command.extend(["--passages-file", str(passages_file)])
if embedding_mode != "sentence-transformers":
command.extend(["--embedding-mode", embedding_mode])
if kwargs.get("distance_metric"):
command.extend(["--distance-metric", kwargs["distance_metric"]])
return command

View File

@@ -63,12 +63,19 @@ class BaseSearcher(LeannBackendSearcherInterface, ABC):
if not self.embedding_model:
raise ValueError("Cannot use recompute mode without 'embedding_model' in meta.json.")
# Get distance_metric from meta if not provided in kwargs
distance_metric = (
kwargs.get("distance_metric")
or self.meta.get("backend_kwargs", {}).get("distance_metric")
or "mips"
)
server_started, actual_port = self.embedding_server_manager.start_server(
port=port,
model_name=self.embedding_model,
embedding_mode=self.embedding_mode,
passages_file=passages_source_file,
distance_metric=kwargs.get("distance_metric"),
distance_metric=distance_metric,
enable_warmup=kwargs.get("enable_warmup", False),
)
if not server_started: