benchmarks: unify HNSW & DiskANN into one clean script; isolate groups, fixed ports, warm-up, parameterized complexity

Andy Lee
2025-08-14 13:47:53 -07:00
parent b13b52e78c
commit d5f6ca61ed
4 changed files with 150 additions and 135 deletions

View File

@@ -1,162 +1,172 @@
import argparse
import os
import socket
import time
from pathlib import Path

from leann import LeannBuilder, LeannSearcher


def _free_port() -> int:
    # Ask the OS for an ephemeral port so each recompute group gets its own.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
        sock.bind(("127.0.0.1", 0))
        return sock.getsockname()[1]


def _meta_exists(index_path: str) -> bool:
    p = Path(index_path)
    return (p.parent / f"{p.stem}.meta.json").exists()


def ensure_index_hnsw(index_path: str, num_docs: int, is_recompute: bool) -> None:
    if _meta_exists(index_path):
        return
    builder = LeannBuilder(
        backend_name="hnsw",
        embedding_model=os.getenv("LEANN_EMBED_MODEL", "facebook/contriever"),
        embedding_mode=os.getenv("LEANN_EMBED_MODE", "sentence-transformers"),
        graph_degree=32,
        complexity=64,
        is_compact=is_recompute,  # HNSW: compact only when recompute
        is_recompute=is_recompute,
        num_threads=4,
    )
    for i in range(num_docs):
        builder.add_text(
            f"This is a test document number {i}. It contains some repeated text for benchmarking."
        )
    builder.build_index(index_path)


def ensure_index_diskann(index_path: str, num_docs: int, is_recompute: bool) -> None:
    if _meta_exists(index_path):
        return
    builder = LeannBuilder(
        backend_name="diskann",
        embedding_model=os.getenv("LEANN_EMBED_MODEL", "facebook/contriever"),
        embedding_mode=os.getenv("LEANN_EMBED_MODE", "sentence-transformers"),
        graph_degree=32,
        complexity=64,
        is_recompute=is_recompute,
        num_threads=4,
    )
    label = "R" if is_recompute else "NR"
    for i in range(num_docs):
        builder.add_text(f"DiskANN {label} test doc {i} for quick benchmark.")
    builder.build_index(index_path)


def _bench_group(
    index_path: str,
    recompute: bool,
    query: str,
    repeats: int,
    complexity: int = 32,
    top_k: int = 10,
) -> float:
    # Independent searcher per group; fixed port when recompute
    searcher = LeannSearcher(index_path=index_path)
    port = _free_port() if recompute else 0

    # Warm-up once so first-hit costs don't skew the timing
    _ = searcher.search(
        query,
        top_k=top_k,
        complexity=complexity,
        recompute_embeddings=recompute,
        expected_zmq_port=port,
    )

    def _once() -> float:
        t0 = time.time()
        _ = searcher.search(
            query,
            top_k=top_k,
            complexity=complexity,
            recompute_embeddings=recompute,
            expected_zmq_port=port,
        )
        return time.time() - t0

    if repeats <= 1:
        t = _once()
    else:
        vals = [_once() for _ in range(repeats)]
        vals.sort()
        t = vals[len(vals) // 2]  # median of repeats
    searcher.cleanup()
    return t


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--num-docs", type=int, default=5000)
    parser.add_argument("--repeats", type=int, default=3)
    parser.add_argument("--complexity", type=int, default=32)
    args = parser.parse_args()

    base = Path.cwd() / ".leann" / "indexes" / f"bench_n{args.num_docs}"
    base.parent.mkdir(parents=True, exist_ok=True)

    # ---------- Build HNSW variants ----------
    hnsw_r = str(base / f"hnsw_recompute_n{args.num_docs}.leann")
    hnsw_nr = str(base / f"hnsw_norecompute_n{args.num_docs}.leann")
    ensure_index_hnsw(hnsw_r, num_docs=args.num_docs, is_recompute=True)
    ensure_index_hnsw(hnsw_nr, num_docs=args.num_docs, is_recompute=False)

    # ---------- Build DiskANN variants ----------
    diskann_r = str(base / "diskann_r.leann")
    diskann_nr = str(base / "diskann_nr.leann")
    ensure_index_diskann(diskann_r, num_docs=args.num_docs, is_recompute=True)
    ensure_index_diskann(diskann_nr, num_docs=args.num_docs, is_recompute=False)

    # ---------- Helpers ----------
    # Compute sizes only for files belonging to each index prefix
    def _size_for(prefix: str) -> int:
        p = Path(prefix)
        base_dir = p.parent
        stem = p.stem
        total = 0
        for f in base_dir.iterdir():
            if f.is_file() and f.name.startswith(stem):
                total += f.stat().st_size
        return total

    # ---------- HNSW benchmark ----------
    t_hnsw_r = _bench_group(
        hnsw_r, True, "test document number 42", repeats=args.repeats, complexity=args.complexity
    )
    t_hnsw_nr = _bench_group(
        hnsw_nr, False, "test document number 42", repeats=args.repeats, complexity=args.complexity
    )
    size_hnsw_r = _size_for(hnsw_r)
    size_hnsw_nr = _size_for(hnsw_nr)

    print("Benchmark results (HNSW):")
    print(f"  recompute=True: search_time={t_hnsw_r:.3f}s, size={size_hnsw_r / 1024 / 1024:.1f}MB")
    print(
        f"  recompute=False: search_time={t_hnsw_nr:.3f}s, size={size_hnsw_nr / 1024 / 1024:.1f}MB"
    )
    print("  Expectation: no-recompute should be faster but larger on disk.")

    # ---------- DiskANN benchmark ----------
    t_diskann_r = _bench_group(
        diskann_r, True, "DiskANN R test doc 123", repeats=args.repeats, complexity=args.complexity
    )
    t_diskann_nr = _bench_group(
        diskann_nr,
        False,
        "DiskANN NR test doc 123",
        repeats=args.repeats,
        complexity=args.complexity,
    )
    size_diskann_r = _size_for(diskann_r)
    size_diskann_nr = _size_for(diskann_nr)

    print("\nBenchmark results (DiskANN):")
    print(f"  build(recompute=True, partition): size={size_diskann_r / 1024 / 1024:.1f}MB")
    print(f"  build(recompute=False): size={size_diskann_nr / 1024 / 1024:.1f}MB")
    print(f"  search recompute=True (final rerank): {t_diskann_r:.3f}s")
    print(f"  search recompute=False (PQ only): {t_diskann_nr:.3f}s")


if __name__ == "__main__":
    main()
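Usage note: with the argparse flags defined above, a typical invocation is `python benchmarks/benchmark_no_recompute.py --num-docs 20000 --repeats 5` (the script path is the one referenced in the docs change below); `--repeats` reports the median of the timed runs.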

View File

@@ -360,25 +360,31 @@ Trade-offs:
- Significantly higher storage (10–100× vs selective recomputation)
- Slightly larger memory footprint during build and search

Real-world quick benchmark (`benchmarks/benchmark_no_recompute.py`, 5k texts):

- HNSW

```text
recompute=True: ~7.55s; size ~1.1MB
recompute=False: ~0.11s; size ~16.6MB
```

- DiskANN

```text
Build sizes (5k):
- recompute=True (partition): ~5.7MB
- recompute=False: ~24.8MB

Search latency (on recompute-build, median of 5 runs; macOS, complexity=32):
- recompute=False (PQ traversal only): ~0.013–0.014s
- recompute=True (final rerank): ~0.033–0.046s

On 20k texts (same settings):
- recompute=False: ~0.013–0.014s
- recompute=True: ~0.033–0.036s
```

Conclusion: for HNSW, no-recompute is faster but larger; for DiskANN, no-recompute (PQ traversal only) is fastest at the cost of potentially lower accuracy, while recompute (final rerank) adds ~20–30ms for higher accuracy. DiskANN recompute-build also enables partitioning, reducing storage.
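To make the trade-off concrete, here is a minimal sketch that times both modes against one of the benchmark indexes. The index path is a placeholder; the `LeannSearcher.search` arguments follow the benchmark script above.

```python
import time

from leann import LeannSearcher

# Placeholder: any index produced by benchmarks/benchmark_no_recompute.py.
searcher = LeannSearcher(index_path=".leann/indexes/bench_n5000/diskann_r.leann")

for recompute in (False, True):
    # False: PQ-only traversal (fastest, approximate distances).
    # True: same traversal plus one exact rerank of the final candidates.
    t0 = time.time()
    searcher.search(
        "test document number 42",
        top_k=10,
        complexity=32,
        recompute_embeddings=recompute,
    )
    print(f"recompute={recompute}: {time.time() - t0:.3f}s")

searcher.cleanup()
```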
## Further Reading

View File

@@ -441,14 +441,13 @@ class DiskannSearcher(BaseSearcher):
         else:  # "global"
             use_global_pruning = True
 
-        # Perform search with suppressed C++ output based on log level
+        # Strategy:
+        # - Traversal always uses PQ distances
+        # - If recompute_embeddings=True, do a single final rerank via deferred fetch
+        #   (fetch embeddings for the final candidate set only)
+        # - Do not recompute neighbor distances along the path
         use_deferred_fetch = True if recompute_embeddings else False
-        recompute_neighors = False
+        recompute_neighors = False  # intentionally misspelled parameter name, kept for backward compatibility
         with suppress_cpp_output_if_needed():
             labels, distances = self._index.batch_search(
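As a conceptual illustration of this strategy (not the library's actual implementation, which lives inside the C++ `batch_search`): approximate PQ distances choose the candidates, and only the final pool is rescored exactly. `final_rerank` and `fetch_embeddings` below are hypothetical names for sketching.

```python
import numpy as np


def final_rerank(query_emb, candidate_ids, pq_dists, fetch_embeddings, top_k):
    """Illustrative sketch: PQ distances from traversal pick a candidate pool;
    one deferred fetch rescores only that pool with exact distances."""
    # Rank candidates by approximate (PQ) distances and keep a small pool.
    pool_idx = np.argsort(pq_dists)[: top_k * 2]
    pool_ids = [candidate_ids[i] for i in pool_idx]
    # Single deferred fetch: exact embeddings for the pool only, never for
    # every neighbor visited along the search path.
    pool_embs = fetch_embeddings(pool_ids)  # hypothetical helper -> (len(pool), dim)
    exact = np.linalg.norm(pool_embs - np.asarray(query_emb), axis=1)
    keep = np.argsort(exact)[:top_k]
    return [pool_ids[i] for i in keep], exact[keep]
```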

uv.lock generated
View File

@@ -2223,7 +2223,7 @@ wheels = [
 [[package]]
 name = "leann-backend-diskann"
-version = "0.2.8"
+version = "0.2.9"
 source = { editable = "packages/leann-backend-diskann" }
 dependencies = [
     { name = "leann-core" },
@@ -2235,14 +2235,14 @@ dependencies = [
 [package.metadata]
 requires-dist = [
-    { name = "leann-core", specifier = "==0.2.8" },
+    { name = "leann-core", specifier = "==0.2.9" },
     { name = "numpy" },
     { name = "protobuf", specifier = ">=3.19.0" },
 ]
 
 [[package]]
 name = "leann-backend-hnsw"
-version = "0.2.8"
+version = "0.2.9"
 source = { editable = "packages/leann-backend-hnsw" }
 dependencies = [
     { name = "leann-core" },
@@ -2255,7 +2255,7 @@ dependencies = [
 [package.metadata]
 requires-dist = [
-    { name = "leann-core", specifier = "==0.2.8" },
+    { name = "leann-core", specifier = "==0.2.9" },
     { name = "msgpack", specifier = ">=1.0.0" },
     { name = "numpy" },
     { name = "pyzmq", specifier = ">=23.0.0" },
@@ -2263,7 +2263,7 @@ requires-dist = [
 [[package]]
 name = "leann-core"
-version = "0.2.8"
+version = "0.2.9"
 source = { editable = "packages/leann-core" }
 dependencies = [
     { name = "accelerate" },