From e3518a31ed5001d273375242114bd3ebae600b34 Mon Sep 17 00:00:00 2001
From: Andy Lee
Date: Thu, 14 Aug 2025 14:25:50 -0700
Subject: [PATCH] docs: diskann recompute

---
 benchmarks/benchmark_no_recompute.py | 49 +++++++---------------
 docs/configuration-guide.md          | 43 +++++++++---------------
 2 files changed, 28 insertions(+), 64 deletions(-)

diff --git a/benchmarks/benchmark_no_recompute.py b/benchmarks/benchmark_no_recompute.py
index aaeb1e8..1c402c0 100644
--- a/benchmarks/benchmark_no_recompute.py
+++ b/benchmarks/benchmark_no_recompute.py
@@ -1,35 +1,31 @@
 import argparse
 import os
-import socket
 import time
 from pathlib import Path
 
 from leann import LeannBuilder, LeannSearcher
 
 
-def _free_port() -> int:
-    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
-        sock.bind(("127.0.0.1", 0))
-        return sock.getsockname()[1]
-
-
 def _meta_exists(index_path: str) -> bool:
     p = Path(index_path)
     return (p.parent / f"{p.stem}.meta.json").exists()
 
 
-def ensure_index_hnsw(index_path: str, num_docs: int, is_recompute: bool) -> None:
-    if _meta_exists(index_path):
-        return
+def ensure_index(index_path: str, backend_name: str, num_docs: int, is_recompute: bool) -> None:
+    if _meta_exists(index_path):
+        return
+    kwargs = {}
+    if backend_name == "hnsw":
+        kwargs["is_compact"] = is_recompute  # HNSW: compact storage only when recompute
     builder = LeannBuilder(
-        backend_name="hnsw",
+        backend_name=backend_name,
         embedding_model=os.getenv("LEANN_EMBED_MODEL", "facebook/contriever"),
         embedding_mode=os.getenv("LEANN_EMBED_MODE", "sentence-transformers"),
         graph_degree=32,
         complexity=64,
-        is_compact=is_recompute,  # HNSW: compact only when recompute
         is_recompute=is_recompute,
         num_threads=4,
+        **kwargs,
     )
     for i in range(num_docs):
         builder.add_text(
@@ -38,24 +34,6 @@ def ensure_index_hnsw(index_path: str, num_docs: int, is_recompute: bool) -> None:
     builder.build_index(index_path)
 
 
-def ensure_index_diskann(index_path: str, num_docs: int, is_recompute: bool) -> None:
-    if _meta_exists(index_path):
-        return
-    builder = LeannBuilder(
-        backend_name="diskann",
-        embedding_model=os.getenv("LEANN_EMBED_MODEL", "facebook/contriever"),
-        embedding_mode=os.getenv("LEANN_EMBED_MODE", "sentence-transformers"),
-        graph_degree=32,
-        complexity=64,
-        is_recompute=is_recompute,
-        num_threads=4,
-    )
-    for i in range(num_docs):
-        label = "R" if is_recompute else "NR"
-        builder.add_text(f"DiskANN {label} test doc {i} for quick benchmark.")
-    builder.build_index(index_path)
-
-
 def _bench_group(
     index_path: str,
     recompute: bool,
@@ -66,7 +44,6 @@ def _bench_group(
 ) -> float:
-    # Independent searcher per group; fixed port when recompute
+    # Independent searcher per group
     searcher = LeannSearcher(index_path=index_path)
-    port = _free_port() if recompute else 0
 
     # Warm-up once
     _ = searcher.search(
@@ -74,7 +51,6 @@
         top_k=top_k,
         complexity=complexity,
         recompute_embeddings=recompute,
-        expected_zmq_port=port,
     )
 
     def _once() -> float:
@@ -84,7 +60,6 @@ def _bench_group(
             top_k=top_k,
             complexity=complexity,
             recompute_embeddings=recompute,
-            expected_zmq_port=port,
         )
         return time.time() - t0
 
@@ -111,14 +86,14 @@ def main():
     # ---------- Build HNSW variants ----------
     hnsw_r = str(base / f"hnsw_recompute_n{args.num_docs}.leann")
     hnsw_nr = str(base / f"hnsw_norecompute_n{args.num_docs}.leann")
-    ensure_index_hnsw(hnsw_r, num_docs=args.num_docs, is_recompute=True)
-    ensure_index_hnsw(hnsw_nr, num_docs=args.num_docs, is_recompute=False)
+    ensure_index(hnsw_r, "hnsw", num_docs=args.num_docs, is_recompute=True)
+    ensure_index(hnsw_nr, "hnsw", num_docs=args.num_docs, is_recompute=False)
 
     # ---------- Build DiskANN variants ----------
     diskann_r = str(base / "diskann_r.leann")
     diskann_nr = str(base / "diskann_nr.leann")
-    ensure_index_diskann(diskann_r, num_docs=args.num_docs, is_recompute=True)
-    ensure_index_diskann(diskann_nr, num_docs=args.num_docs, is_recompute=False)
+    ensure_index(diskann_r, "diskann", num_docs=args.num_docs, is_recompute=True)
+    ensure_index(diskann_nr, "diskann", num_docs=args.num_docs, is_recompute=False)
 
     # ---------- Helpers ----------
     def _size_for(prefix: str) -> int:
diff --git a/docs/configuration-guide.md b/docs/configuration-guide.md
index f03935c..036c1c2 100644
--- a/docs/configuration-guide.md
+++ b/docs/configuration-guide.md
@@ -97,26 +97,20 @@ ollama pull nomic-embed-text
 ```
 
 ### DiskANN
-**Best for**: Performance-critical applications and large datasets - **Production-ready with automatic graph partitioning**
+**Best for**: Large datasets, especially when you want `recompute=True`.
 
-**How it works:**
-- **Product Quantization (PQ) + Real-time Reranking**: Uses compressed PQ codes for fast graph traversal, then recomputes exact embeddings for final candidates
-- **Automatic Graph Partitioning**: When `is_recompute=True`, automatically partitions large indices and safely removes redundant files to save storage
-- **Superior Speed-Accuracy Trade-off**: Faster search than HNSW while maintaining high accuracy
+**Key advantages:**
+- **Faster search** on large datasets (3x+ speedup vs HNSW in many cases)
+- **Smart storage**: `recompute=True` enables automatic graph partitioning for smaller indexes
+- **Better scaling**: Designed for 100k+ documents
 
-**Trade-offs compared to HNSW:**
-- ✅ **Faster search latency** (typically 2-8x speedup)
-- ✅ **Better scaling** for large datasets
-- ✅ **Smart storage management** with automatic partitioning
-- ✅ **Better graph locality** with `--ldg-times` parameter for SSD optimization
-- ⚠️ **Slightly larger index size** due to PQ tables and graph metadata
+**Recompute behavior:**
+- `recompute=True` (recommended): pure PQ traversal plus a final exact reranking step; traversal stays fast and build-time partitioning keeps the index small
+- `recompute=False`: PQ plus partial real distances during traversal; traversal is slower but more accurate, at the cost of a larger on-disk index
 
 ```bash
 # Recommended for most use cases
 --backend-name diskann --graph-degree 32 --build-complexity 64
-
-# For large-scale deployments
---backend-name diskann --graph-degree 64 --build-complexity 128
 ```
 
 **Performance Benchmark**: Run `python benchmarks/diskann_vs_hnsw_speed_comparison.py` to compare DiskANN and HNSW on your system.
@@ -360,30 +354,25 @@ Trade-offs:
 - Significantly higher storage (10–100× vs selective recomputation)
 - Slightly larger memory footprint during build and search
 
-Real-world quick benchmark (`benchmarks/benchmark_no_recompute.py`, 5k texts):
+Quick benchmark results (`benchmarks/benchmark_no_recompute.py` with 5k texts, complexity=32):
 
 - HNSW
   ```text
-  recompute=True: ~7.55s; size ~1.1MB
-  recompute=False: ~0.11s; size ~16.6MB
+  recompute=True: search_time=0.818s, size=1.1MB
+  recompute=False: search_time=0.012s, size=16.6MB
   ```
 - DiskANN
   ```text
-  Build sizes (5k):
-  - recompute=True (partition): ~5.7MB
-  - recompute=False: ~24.8MB
-  Search latency (on recompute-build, median of 5 runs; macOS, complexity=32):
-  - recompute=False (PQ traversal only): ~0.013–0.014s
-  - recompute=True (final rerank): ~0.033–0.046s
-  On 20k texts (same settings):
-  - recompute=False: ~0.013–0.014s
-  - recompute=True: ~0.033–0.036s
+  recompute=True: search_time=0.041s, size=5.9MB
+  recompute=False: search_time=0.013s, size=24.6MB
   ```
 
-Conclusion: for HNSW, no-recompute is faster but larger; for DiskANN, no-recompute (PQ traversal only) is fastest at the cost of potentially lower accuracy, while recompute (final rerank) adds ~20–30ms for higher accuracy. DiskANN recompute-build also enables partitioning, reducing storage.
+Conclusion:
+- **HNSW**: `no-recompute` is significantly faster because no embeddings are recomputed at query time, but it requires much more storage because all embeddings are stored in the index
+- **DiskANN**: `no-recompute` uses PQ plus partial real distances during traversal (slower traversal, higher accuracy, larger index), while `recompute=True` uses pure PQ traversal plus a final exact reranking step (faster traversal, and build-time partitioning yields much smaller storage)
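
A minimal end-to-end sketch of the two DiskANN recompute modes described above, using only the `LeannBuilder`/`LeannSearcher` calls exercised by `benchmarks/benchmark_no_recompute.py`; the output directory, document texts, query string, and `top_k` value are illustrative placeholders, not part of the patch:

```python
import time
from pathlib import Path

from leann import LeannBuilder, LeannSearcher

BASE = Path("./diskann_demo_indexes")  # illustrative output directory
BASE.mkdir(parents=True, exist_ok=True)


def build_diskann(index_path: str, is_recompute: bool) -> None:
    # Builder parameters mirror the benchmark; is_recompute toggles between
    # pure PQ traversal + final rerank (True) and PQ + partial real distances (False).
    builder = LeannBuilder(
        backend_name="diskann",
        embedding_model="facebook/contriever",
        embedding_mode="sentence-transformers",
        graph_degree=32,
        complexity=64,
        is_recompute=is_recompute,
        num_threads=4,
    )
    for i in range(1000):
        builder.add_text(f"Demo document {i} about DiskANN recompute modes.")
    builder.build_index(index_path)


for recompute in (True, False):
    index_path = str(BASE / f"diskann_{'r' if recompute else 'nr'}.leann")
    build_diskann(index_path, is_recompute=recompute)

    searcher = LeannSearcher(index_path=index_path)
    t0 = time.time()
    _ = searcher.search(
        "quick latency test",
        top_k=10,
        complexity=32,
        recompute_embeddings=recompute,  # mirror the build-time setting
    )
    print(f"recompute={recompute}: search took {time.time() - t0:.3f}s")
```

Absolute sizes and latencies will differ from the table above depending on hardware, embedding model, and document count; the relative trade-off (smaller index with `recompute=True`, lower end-to-end latency with `recompute=False`) is what the benchmark measures.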