fix: auto-detect normalized embeddings and use cosine distance

- Add automatic detection for normalized embedding models (OpenAI, Voyage AI, Cohere)
- Automatically set distance_metric='cosine' for normalized embeddings
- Add warnings when using non-optimal distance metrics
- Implement manual L2 normalization in HNSW backend (custom Faiss build lacks normalize_L2)
- Fix DiskANN zmq_port compatibility with lazy loading strategy
- Add documentation for normalized embeddings feature

This fixes the low-accuracy issue when using the OpenAI text-embedding-3-small model with the default MIPS metric.
This commit is contained in:
Andy Lee
2025-07-27 20:21:05 -07:00
parent 48207c3b69
commit 9a5c197acd
6 changed files with 223 additions and 26 deletions

10
uv.lock generated
View File

@@ -1847,7 +1847,7 @@ wheels = [
[[package]]
name = "leann-backend-diskann"
version = "0.1.13"
version = "0.1.14"
source = { editable = "packages/leann-backend-diskann" }
dependencies = [
{ name = "leann-core" },
@@ -1858,14 +1858,14 @@ dependencies = [
[package.metadata]
requires-dist = [
{ name = "leann-core", specifier = "==0.1.13" },
{ name = "leann-core", specifier = "==0.1.14" },
{ name = "numpy" },
{ name = "protobuf", specifier = ">=3.19.0" },
]
[[package]]
name = "leann-backend-hnsw"
version = "0.1.13"
version = "0.1.14"
source = { editable = "packages/leann-backend-hnsw" }
dependencies = [
{ name = "leann-core" },
@@ -1877,7 +1877,7 @@ dependencies = [
[package.metadata]
requires-dist = [
{ name = "leann-core", specifier = "==0.1.13" },
{ name = "leann-core", specifier = "==0.1.14" },
{ name = "msgpack", specifier = ">=1.0.0" },
{ name = "numpy" },
{ name = "pyzmq", specifier = ">=23.0.0" },
@@ -1885,7 +1885,7 @@ requires-dist = [
[[package]]
name = "leann-core"
version = "0.1.13"
version = "0.1.14"
source = { editable = "packages/leann-core" }
dependencies = [
{ name = "accelerate" },