benchmarks: unify HNSW & DiskANN into one clean script; isolate groups, fixed ports, warm-up, param complexity
This commit is contained in:
@@ -1,162 +1,172 @@
|
||||
import argparse
|
||||
import os
|
||||
import socket
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
from leann import LeannBuilder, LeannSearcher
|
||||
|
||||
|
||||
def ensure_index(
|
||||
index_path: str, num_docs: int = 5000, is_recompute: bool = True, is_compact: bool = True
|
||||
):
|
||||
path = Path(index_path)
|
||||
if (path.parent / f"{path.stem}.meta.json").exists():
|
||||
return
|
||||
def _free_port() -> int:
|
||||
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
|
||||
sock.bind(("127.0.0.1", 0))
|
||||
return sock.getsockname()[1]
|
||||
|
||||
|
||||
def _meta_exists(index_path: str) -> bool:
|
||||
p = Path(index_path)
|
||||
return (p.parent / f"{p.stem}.meta.json").exists()
|
||||
|
||||
|
||||
def ensure_index_hnsw(index_path: str, num_docs: int, is_recompute: bool) -> None:
    """Build a synthetic HNSW index at ``index_path`` unless one already exists.

    The build is skipped when ``_meta_exists(index_path)`` is true. Otherwise
    ``num_docs`` generated sentences are added to a ``LeannBuilder`` and the
    index is written to ``index_path``.

    Args:
        index_path: Destination path of the index.
        num_docs: Number of synthetic documents to add.
        is_recompute: Forwarded to the builder as ``is_recompute``; the same
            value is used for ``is_compact``.

    Environment:
        ``LEANN_EMBED_MODEL`` and ``LEANN_EMBED_MODE`` override the embedding
        model (default ``facebook/contriever``) and embedding mode (default
        ``sentence-transformers``).
    """
    if _meta_exists(index_path):
        return

    builder = LeannBuilder(
        backend_name="hnsw",
        embedding_model=os.getenv("LEANN_EMBED_MODEL", "facebook/contriever"),
        embedding_mode=os.getenv("LEANN_EMBED_MODE", "sentence-transformers"),
        graph_degree=32,
        complexity=64,
        # Passed exactly once: a repeated ``is_compact`` keyword is a
        # SyntaxError, and this function has no ``is_compact`` parameter.
        is_compact=is_recompute,  # HNSW: compact only when recompute
        is_recompute=is_recompute,
        num_threads=4,
    )

    for i in range(num_docs):
        builder.add_text(
            f"This is a test document number {i}. It contains some repeated text for benchmarking."
        )

    builder.build_index(index_path)
|
||||
|
||||
|
||||
def bench_once(index_path: str, recompute: bool, top_k: int = 10) -> float:
|
||||
searcher = LeannSearcher(index_path=index_path)
|
||||
t0 = time.time()
|
||||
_ = searcher.search(
|
||||
"test document number 42",
|
||||
top_k=top_k,
|
||||
def ensure_index_diskann(index_path: str, num_docs: int, is_recompute: bool) -> None:
    """Build a synthetic DiskANN index at ``index_path`` unless one already exists.

    The build is skipped when ``_meta_exists(index_path)`` is true. Otherwise
    ``num_docs`` generated sentences are added to a ``LeannBuilder`` and the
    index is written to ``index_path``.

    Args:
        index_path: Destination path of the index.
        num_docs: Number of synthetic documents to add.
        is_recompute: Forwarded to the builder as ``is_recompute``; also
            selects the ``R`` / ``NR`` tag embedded in each document.

    Environment:
        ``LEANN_EMBED_MODEL`` and ``LEANN_EMBED_MODE`` override the embedding
        model (default ``facebook/contriever``) and embedding mode (default
        ``sentence-transformers``).
    """
    if _meta_exists(index_path):
        return

    # Only build-time options are passed here. Search-time arguments and a
    # premature ``return`` that referenced undefined names (``recompute``,
    # ``t0``) do not belong in this function and would keep the index from
    # ever being built.
    builder = LeannBuilder(
        backend_name="diskann",
        embedding_model=os.getenv("LEANN_EMBED_MODEL", "facebook/contriever"),
        embedding_mode=os.getenv("LEANN_EMBED_MODE", "sentence-transformers"),
        graph_degree=32,
        complexity=64,
        is_recompute=is_recompute,
        num_threads=4,
    )

    # The tag does not depend on the loop variable, so compute it once.
    label = "R" if is_recompute else "NR"
    for i in range(num_docs):
        builder.add_text(f"DiskANN {label} test doc {i} for quick benchmark.")

    builder.build_index(index_path)
|
||||
|
||||
|
||||
def _bench_group(
    index_path: str,
    recompute: bool,
    query: str,
    repeats: int,
    complexity: int = 32,
    top_k: int = 10,
) -> float:
    """Time ``query`` against one index and return a representative latency.

    A dedicated ``LeannSearcher`` is created for the group. One untimed
    warm-up search is issued, then the search is timed ``repeats`` times (at
    least once) and the middle element of the sorted timings is returned.
    ``searcher.cleanup()`` is always called, even if a search raises.

    Args:
        index_path: Path of the index to query.
        recompute: Forwarded as ``recompute_embeddings``. When true a port
            from ``_free_port()`` is passed as ``expected_zmq_port``;
            otherwise ``0`` is passed.
        query: Query text.
        repeats: Number of timed runs; values below 1 are treated as 1.
        complexity: Forwarded to ``search`` as ``complexity``.
        top_k: Forwarded to ``search`` as ``top_k``.

    Returns:
        Elapsed seconds of the median run (the upper of the two middle runs
        when the number of runs is even).
    """
    # Independent searcher per group; fixed port when recompute
    port = _free_port() if recompute else 0
    searcher = LeannSearcher(index_path=index_path)

    def _search() -> None:
        searcher.search(
            query,
            top_k=top_k,
            complexity=complexity,
            recompute_embeddings=recompute,
            expected_zmq_port=port,
        )

    def _once() -> float:
        # perf_counter is monotonic and high-resolution, so the measurement
        # is not disturbed by wall-clock adjustments.
        start = time.perf_counter()
        _search()
        return time.perf_counter() - start

    try:
        # Warm-up once
        _search()
        timings = sorted(_once() for _ in range(max(1, repeats)))
    finally:
        # Release the searcher even when a search fails.
        searcher.cleanup()

    return timings[len(timings) // 2]
|
||||
|
||||
|
||||
def main():
|
||||
base = Path.cwd() / ".leann" / "indexes" / "bench"
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--num-docs", type=int, default=5000)
|
||||
parser.add_argument("--repeats", type=int, default=3)
|
||||
parser.add_argument("--complexity", type=int, default=32)
|
||||
args = parser.parse_args()
|
||||
|
||||
base = Path.cwd() / ".leann" / "indexes" / f"bench_n{args.num_docs}"
|
||||
base.parent.mkdir(parents=True, exist_ok=True)
|
||||
index_path_recompute = str(base / "recompute.leann")
|
||||
index_path_norecompute = str(base / "norecompute.leann")
|
||||
# ---------- Build HNSW variants ----------
|
||||
hnsw_r = str(base / f"hnsw_recompute_n{args.num_docs}.leann")
|
||||
hnsw_nr = str(base / f"hnsw_norecompute_n{args.num_docs}.leann")
|
||||
ensure_index_hnsw(hnsw_r, num_docs=args.num_docs, is_recompute=True)
|
||||
ensure_index_hnsw(hnsw_nr, num_docs=args.num_docs, is_recompute=False)
|
||||
|
||||
# Build two variants: pruned (recompute) and non-compact (no-recompute)
|
||||
ensure_index(index_path_recompute, is_recompute=True, is_compact=True)
|
||||
ensure_index(index_path_norecompute, is_recompute=False, is_compact=False)
|
||||
# ---------- Build DiskANN variants ----------
|
||||
diskann_r = str(base / "diskann_r.leann")
|
||||
diskann_nr = str(base / "diskann_nr.leann")
|
||||
ensure_index_diskann(diskann_r, num_docs=args.num_docs, is_recompute=True)
|
||||
ensure_index_diskann(diskann_nr, num_docs=args.num_docs, is_recompute=False)
|
||||
|
||||
# Warm up
|
||||
bench_once(index_path_recompute, recompute=True)
|
||||
bench_once(index_path_norecompute, recompute=False)
|
||||
|
||||
t_recompute = bench_once(index_path_recompute, recompute=True)
|
||||
t_norecompute = bench_once(index_path_norecompute, recompute=False)
|
||||
|
||||
# Compute sizes only for files belonging to each index prefix
|
||||
# ---------- Helpers ----------
|
||||
def _size_for(prefix: str) -> int:
|
||||
p = Path(prefix)
|
||||
base = p.parent
|
||||
stem = p.stem # e.g., 'recompute.leann'
|
||||
base_dir = p.parent
|
||||
stem = p.stem
|
||||
total = 0
|
||||
for f in base.iterdir():
|
||||
for f in base_dir.iterdir():
|
||||
if f.is_file() and f.name.startswith(stem):
|
||||
total += f.stat().st_size
|
||||
return total
|
||||
|
||||
size_recompute = _size_for(index_path_recompute)
|
||||
size_norecompute = _size_for(index_path_norecompute)
|
||||
# ---------- HNSW benchmark ----------
|
||||
t_hnsw_r = _bench_group(
|
||||
hnsw_r, True, "test document number 42", repeats=args.repeats, complexity=args.complexity
|
||||
)
|
||||
t_hnsw_nr = _bench_group(
|
||||
hnsw_nr, False, "test document number 42", repeats=args.repeats, complexity=args.complexity
|
||||
)
|
||||
size_hnsw_r = _size_for(hnsw_r)
|
||||
size_hnsw_nr = _size_for(hnsw_nr)
|
||||
|
||||
print("Benchmark results (HNSW):")
|
||||
print(f" recompute=True: search_time={t_hnsw_r:.3f}s, size={size_hnsw_r / 1024 / 1024:.1f}MB")
|
||||
print(
|
||||
f" recompute=True: search_time={t_recompute:.3f}s, size={size_recompute / 1024 / 1024:.1f}MB"
|
||||
f" recompute=False: search_time={t_hnsw_nr:.3f}s, size={size_hnsw_nr / 1024 / 1024:.1f}MB"
|
||||
)
|
||||
print(
|
||||
f" recompute=False: search_time={t_norecompute:.3f}s, size={size_norecompute / 1024 / 1024:.1f}MB"
|
||||
print(" Expectation: no-recompute should be faster but larger on disk.")
|
||||
|
||||
# ---------- DiskANN benchmark ----------
|
||||
t_diskann_r = _bench_group(
|
||||
diskann_r, True, "DiskANN R test doc 123", repeats=args.repeats, complexity=args.complexity
|
||||
)
|
||||
print("Expectation: no-recompute should be faster but larger on disk.")
|
||||
t_diskann_nr = _bench_group(
|
||||
diskann_nr,
|
||||
False,
|
||||
"DiskANN NR test doc 123",
|
||||
repeats=args.repeats,
|
||||
complexity=args.complexity,
|
||||
)
|
||||
size_diskann_r = _size_for(diskann_r)
|
||||
size_diskann_nr = _size_for(diskann_nr)
|
||||
|
||||
# DiskANN quick benchmark (final rerank vs no-recompute)
|
||||
try:
|
||||
index_path_diskann_nr = str(base / "diskann_nr.leann")
|
||||
index_path_diskann_r = str(base / "diskann_r.leann")
|
||||
|
||||
# Build DiskANN no-recompute (keeps full disk index)
|
||||
if not (
|
||||
Path(index_path_diskann_nr).parent / f"{Path(index_path_diskann_nr).stem}.meta.json"
|
||||
).exists():
|
||||
b = LeannBuilder(
|
||||
backend_name="diskann",
|
||||
embedding_model=os.getenv("LEANN_EMBED_MODEL", "facebook/contriever"),
|
||||
embedding_mode=os.getenv("LEANN_EMBED_MODE", "sentence-transformers"),
|
||||
graph_degree=32,
|
||||
complexity=64,
|
||||
num_threads=4,
|
||||
is_recompute=False,
|
||||
)
|
||||
for i in range(5000):
|
||||
b.add_text(f"DiskANN NR test doc {i} for quick benchmark.")
|
||||
b.build_index(index_path_diskann_nr)
|
||||
|
||||
# Build DiskANN recompute (enables partition; prunes redundant files)
|
||||
if not (
|
||||
Path(index_path_diskann_r).parent / f"{Path(index_path_diskann_r).stem}.meta.json"
|
||||
).exists():
|
||||
b = LeannBuilder(
|
||||
backend_name="diskann",
|
||||
embedding_model=os.getenv("LEANN_EMBED_MODEL", "facebook/contriever"),
|
||||
embedding_mode=os.getenv("LEANN_EMBED_MODE", "sentence-transformers"),
|
||||
graph_degree=32,
|
||||
complexity=64,
|
||||
num_threads=4,
|
||||
is_recompute=True,
|
||||
)
|
||||
for i in range(5000):
|
||||
b.add_text(f"DiskANN R test doc {i} for quick benchmark.")
|
||||
b.build_index(index_path_diskann_r)
|
||||
|
||||
# Measure size per build prefix
|
||||
def _size_for(prefix: str) -> int:
|
||||
p = Path(prefix)
|
||||
base_dir = p.parent
|
||||
stem = p.stem
|
||||
total = 0
|
||||
for f in base_dir.iterdir():
|
||||
if f.is_file() and f.name.startswith(stem):
|
||||
total += f.stat().st_size
|
||||
return total
|
||||
|
||||
size_diskann_nr = _size_for(index_path_diskann_nr)
|
||||
size_diskann_r = _size_for(index_path_diskann_r)
|
||||
|
||||
# Speed on recompute-build (final rerank vs no-recompute)
|
||||
s = LeannSearcher(index_path_diskann_r)
|
||||
_ = s.search("DiskANN R test doc 123", top_k=10, complexity=64, recompute_embeddings=False)
|
||||
_ = s.search("DiskANN R test doc 123", top_k=10, complexity=64, recompute_embeddings=True)
|
||||
|
||||
t0 = time.time()
|
||||
_ = s.search("DiskANN R test doc 123", top_k=10, complexity=64, recompute_embeddings=False)
|
||||
t_diskann_nr = time.time() - t0
|
||||
|
||||
t0 = time.time()
|
||||
_ = s.search("DiskANN R test doc 123", top_k=10, complexity=64, recompute_embeddings=True)
|
||||
t_diskann_r = time.time() - t0
|
||||
|
||||
print("\nBenchmark results (DiskANN):")
|
||||
print(f" build(recompute=False): size={size_diskann_nr / 1024 / 1024:.1f}MB")
|
||||
print(f" build(recompute=True, partition): size={size_diskann_r / 1024 / 1024:.1f}MB")
|
||||
print(f" search recompute=False: {t_diskann_nr:.3f}s (on recompute-build)")
|
||||
print(f" search recompute=True (final rerank): {t_diskann_r:.3f}s (on recompute-build)")
|
||||
except Exception as e:
|
||||
print(f"DiskANN quick benchmark skipped due to: {e}")
|
||||
print("\nBenchmark results (DiskANN):")
|
||||
print(f" build(recompute=True, partition): size={size_diskann_r / 1024 / 1024:.1f}MB")
|
||||
print(f" build(recompute=False): size={size_diskann_nr / 1024 / 1024:.1f}MB")
|
||||
print(f" search recompute=True (final rerank): {t_diskann_r:.3f}s")
|
||||
print(f" search recompute=False (PQ only): {t_diskann_nr:.3f}s")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -360,25 +360,31 @@ Trade-offs:
|
||||
- Significantly higher storage (10–100× vs selective recomputation)
|
||||
- Slightly larger memory footprint during build and search
|
||||
|
||||
Real-world quick benchmark (HNSW, 5k texts; script `benchmarks/benchmark_no_recompute.py`):
|
||||
Real-world quick benchmark (`benchmarks/benchmark_no_recompute.py`, 5k texts):
|
||||
|
||||
```text
|
||||
recompute=True: ~7.55s; size ~1.1MB
|
||||
recompute=False: ~0.11s; size ~16.6MB
|
||||
- HNSW
|
||||
|
||||
Conclusion: no-recompute is much faster but uses more storage; recompute is smaller but has higher first-hop latency.
|
||||
```
|
||||
```text
|
||||
recompute=True: ~7.55s; size ~1.1MB
|
||||
recompute=False: ~0.11s; size ~16.6MB
|
||||
```
|
||||
|
||||
DiskANN (5k texts; same script, final rerank strategy):
|
||||
- DiskANN
|
||||
|
||||
```text
|
||||
build(recompute=False): size ~24.8MB
|
||||
build(recompute=True, partition): size ~5.7MB
|
||||
search recompute=False: ~0.250s (on recompute-build)
|
||||
search recompute=True (final rerank): ~0.120s (on recompute-build)
|
||||
```text
|
||||
Build sizes (5k):
|
||||
- recompute=True (partition): ~5.7MB
|
||||
- recompute=False: ~24.8MB
|
||||
Search latency (on recompute-build, median of 5 runs; macOS, complexity=32):
|
||||
- recompute=False (PQ traversal only): ~0.013–0.014s
|
||||
- recompute=True (final rerank): ~0.033–0.046s
|
||||
On 20k texts (same settings):
|
||||
- recompute=False: ~0.013–0.014s
|
||||
- recompute=True: ~0.033–0.036s
|
||||
```
|
||||
|
||||
Conclusion: for HNSW, no-recompute is faster but larger; for DiskANN, no-recompute (PQ traversal only) is fastest at the cost of potentially lower accuracy, while recompute (final rerank) adds ~20–30ms for higher accuracy. DiskANN recompute-build also enables partitioning, reducing storage.
|
||||
|
||||
Conclusion: DiskANN's recompute-build enables partitioning to reduce storage; enabling final rerank further improves latency while keeping traversal PQ-fast.
|
||||
```
|
||||
|
||||
|
||||
## Further Reading
|
||||
|
||||
@@ -441,14 +441,13 @@ class DiskannSearcher(BaseSearcher):
|
||||
else: # "global"
|
||||
use_global_pruning = True
|
||||
|
||||
# Perform search with suppressed C++ output based on log level
|
||||
# Strategy:
|
||||
# - Traversal always uses PQ distances
|
||||
# - If recompute_embeddings=True, do a single final rerank via deferred fetch
|
||||
# (fetch embeddings for the final candidate set only)
|
||||
# - Do not recompute neighbor distances along the path
|
||||
use_deferred_fetch = True if recompute_embeddings else False
|
||||
recompute_neighors = False
|
||||
recompute_neighors = False # Expected typo. For backward compatibility.
|
||||
|
||||
with suppress_cpp_output_if_needed():
|
||||
labels, distances = self._index.batch_search(
|
||||
|
||||
10
uv.lock
generated
10
uv.lock
generated
@@ -2223,7 +2223,7 @@ wheels = [
|
||||
|
||||
[[package]]
|
||||
name = "leann-backend-diskann"
|
||||
version = "0.2.8"
|
||||
version = "0.2.9"
|
||||
source = { editable = "packages/leann-backend-diskann" }
|
||||
dependencies = [
|
||||
{ name = "leann-core" },
|
||||
@@ -2235,14 +2235,14 @@ dependencies = [
|
||||
|
||||
[package.metadata]
|
||||
requires-dist = [
|
||||
{ name = "leann-core", specifier = "==0.2.8" },
|
||||
{ name = "leann-core", specifier = "==0.2.9" },
|
||||
{ name = "numpy" },
|
||||
{ name = "protobuf", specifier = ">=3.19.0" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "leann-backend-hnsw"
|
||||
version = "0.2.8"
|
||||
version = "0.2.9"
|
||||
source = { editable = "packages/leann-backend-hnsw" }
|
||||
dependencies = [
|
||||
{ name = "leann-core" },
|
||||
@@ -2255,7 +2255,7 @@ dependencies = [
|
||||
|
||||
[package.metadata]
|
||||
requires-dist = [
|
||||
{ name = "leann-core", specifier = "==0.2.8" },
|
||||
{ name = "leann-core", specifier = "==0.2.9" },
|
||||
{ name = "msgpack", specifier = ">=1.0.0" },
|
||||
{ name = "numpy" },
|
||||
{ name = "pyzmq", specifier = ">=23.0.0" },
|
||||
@@ -2263,7 +2263,7 @@ requires-dist = [
|
||||
|
||||
[[package]]
|
||||
name = "leann-core"
|
||||
version = "0.2.8"
|
||||
version = "0.2.9"
|
||||
source = { editable = "packages/leann-core" }
|
||||
dependencies = [
|
||||
{ name = "accelerate" },
|
||||
|
||||
Reference in New Issue
Block a user