fix: auto-detect normalized embeddings and use cosine distance (#8)

* fix: auto-detect normalized embeddings and use cosine distance

- Add automatic detection for normalized embedding models (OpenAI, Voyage AI, Cohere)
- Automatically set distance_metric='cosine' for normalized embeddings
- Add warnings when using non-optimal distance metrics
- Implement manual L2 normalization in HNSW backend (custom Faiss build lacks normalize_L2)
- Fix DiskANN zmq_port compatibility with lazy loading strategy
- Add documentation for normalized embeddings feature

This fixes the low-accuracy issue when using the OpenAI text-embedding-3-small model with the default MIPS metric.

* style: format
This commit is contained in:
Andy Lee
2025-07-27 21:19:29 -07:00
committed by GitHub
parent e9d2d420bd
commit 5c8921673a
6 changed files with 223 additions and 26 deletions

10
uv.lock generated
View File

@@ -1847,7 +1847,7 @@ wheels = [
[[package]]
name = "leann-backend-diskann"
version = "0.1.13"
version = "0.1.14"
source = { editable = "packages/leann-backend-diskann" }
dependencies = [
{ name = "leann-core" },
@@ -1858,14 +1858,14 @@ dependencies = [
[package.metadata]
requires-dist = [
{ name = "leann-core", specifier = "==0.1.13" },
{ name = "leann-core", specifier = "==0.1.14" },
{ name = "numpy" },
{ name = "protobuf", specifier = ">=3.19.0" },
]
[[package]]
name = "leann-backend-hnsw"
version = "0.1.13"
version = "0.1.14"
source = { editable = "packages/leann-backend-hnsw" }
dependencies = [
{ name = "leann-core" },
@@ -1877,7 +1877,7 @@ dependencies = [
[package.metadata]
requires-dist = [
{ name = "leann-core", specifier = "==0.1.13" },
{ name = "leann-core", specifier = "==0.1.14" },
{ name = "msgpack", specifier = ">=1.0.0" },
{ name = "numpy" },
{ name = "pyzmq", specifier = ">=23.0.0" },
@@ -1885,7 +1885,7 @@ requires-dist = [
[[package]]
name = "leann-core"
version = "0.1.13"
version = "0.1.14"
source = { editable = "packages/leann-core" }
dependencies = [
{ name = "accelerate" },