docs: diskann recompute

Andy Lee
2025-08-14 14:25:50 -07:00
parent d5f6ca61ed
commit e3518a31ed
2 changed files with 28 additions and 64 deletions

View File

@@ -1,35 +1,31 @@
 import argparse
 import os
-import socket
 import time
 from pathlib import Path
 from leann import LeannBuilder, LeannSearcher
-def _free_port() -> int:
-    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
-        sock.bind(("127.0.0.1", 0))
-        return sock.getsockname()[1]
 def _meta_exists(index_path: str) -> bool:
     p = Path(index_path)
     return (p.parent / f"{p.stem}.meta.json").exists()
-def ensure_index_hnsw(index_path: str, num_docs: int, is_recompute: bool) -> None:
-    if _meta_exists(index_path):
-        return
+def ensure_index(index_path: str, backend_name: str, num_docs: int, is_recompute: bool) -> None:
+    # if _meta_exists(index_path):
+    #     return
+    kwargs = {}
+    if backend_name == "hnsw":
+        kwargs["is_compact"] = is_recompute
     builder = LeannBuilder(
-        backend_name="hnsw",
+        backend_name=backend_name,
         embedding_model=os.getenv("LEANN_EMBED_MODEL", "facebook/contriever"),
         embedding_mode=os.getenv("LEANN_EMBED_MODE", "sentence-transformers"),
         graph_degree=32,
         complexity=64,
-        is_compact=is_recompute,  # HNSW: compact only when recompute
         is_recompute=is_recompute,
         num_threads=4,
+        **kwargs,
     )
     for i in range(num_docs):
         builder.add_text(
@@ -38,24 +34,6 @@ def ensure_index_hnsw(index_path: str, num_docs: int, is_recompute: bool) -> None:
     builder.build_index(index_path)
-def ensure_index_diskann(index_path: str, num_docs: int, is_recompute: bool) -> None:
-    if _meta_exists(index_path):
-        return
-    builder = LeannBuilder(
-        backend_name="diskann",
-        embedding_model=os.getenv("LEANN_EMBED_MODEL", "facebook/contriever"),
-        embedding_mode=os.getenv("LEANN_EMBED_MODE", "sentence-transformers"),
-        graph_degree=32,
-        complexity=64,
-        is_recompute=is_recompute,
-        num_threads=4,
-    )
-    for i in range(num_docs):
-        label = "R" if is_recompute else "NR"
-        builder.add_text(f"DiskANN {label} test doc {i} for quick benchmark.")
-    builder.build_index(index_path)
 def _bench_group(
     index_path: str,
     recompute: bool,
@@ -66,7 +44,6 @@ def _bench_group(
 ) -> float:
     # Independent searcher per group; fixed port when recompute
     searcher = LeannSearcher(index_path=index_path)
-    port = _free_port() if recompute else 0
     # Warm-up once
     _ = searcher.search(
@@ -74,7 +51,6 @@ def _bench_group(
         top_k=top_k,
         complexity=complexity,
         recompute_embeddings=recompute,
-        expected_zmq_port=port,
     )
     def _once() -> float:
@@ -84,7 +60,6 @@ def _bench_group(
             top_k=top_k,
             complexity=complexity,
             recompute_embeddings=recompute,
-            expected_zmq_port=port,
         )
         return time.time() - t0
@@ -111,14 +86,14 @@ def main():
     # ---------- Build HNSW variants ----------
     hnsw_r = str(base / f"hnsw_recompute_n{args.num_docs}.leann")
     hnsw_nr = str(base / f"hnsw_norecompute_n{args.num_docs}.leann")
-    ensure_index_hnsw(hnsw_r, num_docs=args.num_docs, is_recompute=True)
-    ensure_index_hnsw(hnsw_nr, num_docs=args.num_docs, is_recompute=False)
+    ensure_index(hnsw_r, "hnsw", args.num_docs, True)
+    ensure_index(hnsw_nr, "hnsw", args.num_docs, False)
     # ---------- Build DiskANN variants ----------
     diskann_r = str(base / "diskann_r.leann")
     diskann_nr = str(base / "diskann_nr.leann")
-    ensure_index_diskann(diskann_r, num_docs=args.num_docs, is_recompute=True)
-    ensure_index_diskann(diskann_nr, num_docs=args.num_docs, is_recompute=False)
+    ensure_index(diskann_r, "diskann", args.num_docs, True)
+    ensure_index(diskann_nr, "diskann", args.num_docs, False)
     # ---------- Helpers ----------
     def _size_for(prefix: str) -> int:

View File

@@ -97,26 +97,20 @@ ollama pull nomic-embed-text
 ```
 ### DiskANN
-**Best for**: Performance-critical applications and large datasets - **Production-ready with automatic graph partitioning**
-**How it works:**
-- **Product Quantization (PQ) + Real-time Reranking**: Uses compressed PQ codes for fast graph traversal, then recomputes exact embeddings for final candidates
-- **Automatic Graph Partitioning**: When `is_recompute=True`, automatically partitions large indices and safely removes redundant files to save storage
-- **Superior Speed-Accuracy Trade-off**: Faster search than HNSW while maintaining high accuracy
-**Trade-offs compared to HNSW:**
-- **Faster search latency** (typically 2-8x speedup)
-- **Better scaling** for large datasets
-- **Smart storage management** with automatic partitioning
-- **Better graph locality** with `--ldg-times` parameter for SSD optimization
-- ⚠️ **Slightly larger index size** due to PQ tables and graph metadata
+**Best for**: Large datasets, especially when you want `recompute=True`.
+**Key advantages:**
+- **Faster search** on large datasets (3x+ speedup vs HNSW in many cases)
+- **Smart storage**: `recompute=True` enables automatic graph partitioning for smaller indexes
+- **Better scaling**: Designed for 100k+ documents
+**Recompute behavior:**
+- `recompute=True` (recommended): Pure PQ traversal + final reranking - faster and enables partitioning
+- `recompute=False`: PQ + partial real distances during traversal - slower but higher accuracy
 ```bash
 # Recommended for most use cases
 --backend-name diskann --graph-degree 32 --build-complexity 64
-# For large-scale deployments
---backend-name diskann --graph-degree 64 --build-complexity 128
 ```
 **Performance Benchmark**: Run `python benchmarks/diskann_vs_hnsw_speed_comparison.py` to compare DiskANN and HNSW on your system.
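
The two recompute modes above map directly onto the builder and searcher options exercised by `benchmarks/benchmark_no_recompute.py` in this commit. Below is a minimal sketch of the DiskANN recompute path using the same `LeannBuilder`/`LeannSearcher` calls; the index path, document texts, and query string are illustrative placeholders, and the positional query argument to `search()` is an assumption.

```python
# Minimal sketch of the DiskANN recompute flow, following the builder/searcher
# calls in benchmarks/benchmark_no_recompute.py. Index path, documents, and
# query below are placeholders, not values from the repository.
import os

from leann import LeannBuilder, LeannSearcher

index_path = "indexes/diskann_demo.leann"  # hypothetical location

builder = LeannBuilder(
    backend_name="diskann",
    embedding_model=os.getenv("LEANN_EMBED_MODEL", "facebook/contriever"),
    embedding_mode=os.getenv("LEANN_EMBED_MODE", "sentence-transformers"),
    graph_degree=32,
    complexity=64,
    is_recompute=True,  # enables build-time graph partitioning for smaller storage
    num_threads=4,
)
for i in range(1000):
    builder.add_text(f"DiskANN demo doc {i} for a quick smoke test.")
builder.build_index(index_path)

searcher = LeannSearcher(index_path=index_path)
results = searcher.search(
    "quick smoke test",          # assumed positional query argument
    top_k=10,
    complexity=32,
    recompute_embeddings=True,   # PQ traversal + exact rerank of final candidates
)
```

Passing `recompute_embeddings=False` to the same `search()` call switches to the PQ plus partial-real-distance traversal described above.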
@@ -360,30 +354,25 @@ Trade-offs:
 - Significantly higher storage (10–100× vs selective recomputation)
 - Slightly larger memory footprint during build and search
-Real-world quick benchmark (`benchmarks/benchmark_no_recompute.py`, 5k texts):
+Quick benchmark results (`benchmarks/benchmark_no_recompute.py` with 5k texts, complexity=32):
 - HNSW
 ```text
-recompute=True: ~7.55s; size ~1.1MB
-recompute=False: ~0.11s; size ~16.6MB
+recompute=True: search_time=0.818s, size=1.1MB
+recompute=False: search_time=0.012s, size=16.6MB
 ```
 - DiskANN
 ```text
-Build sizes (5k):
-- recompute=True (partition): ~5.7MB
-- recompute=False: ~24.8MB
-Search latency (on recompute-build, median of 5 runs; macOS, complexity=32):
-- recompute=False (PQ traversal only): ~0.013–0.014s
-- recompute=True (final rerank): ~0.033–0.046s
-On 20k texts (same settings):
-- recompute=False: ~0.013–0.014s
-- recompute=True: ~0.033–0.036s
+recompute=True: search_time=0.041s, size=5.9MB
+recompute=False: search_time=0.013s, size=24.6MB
 ```
-Conclusion: for HNSW, no-recompute is faster but larger; for DiskANN, no-recompute (PQ traversal only) is fastest at the cost of potentially lower accuracy, while recompute (final rerank) adds ~20–30ms for higher accuracy. DiskANN recompute-build also enables partitioning, reducing storage.
+Conclusion:
+- **HNSW**: `no-recompute` is significantly faster (no embedding recomputation) but requires much more storage (stores all embeddings)
+- **DiskANN**: `no-recompute` uses PQ + partial real distances during traversal (slower but higher accuracy), while `recompute=True` uses pure PQ traversal + final reranking (faster traversal, enables build-time partitioning for smaller storage)
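
The per-query numbers above come from a warm-up-then-time loop. A hedged sketch of that measurement, mirroring the `_bench_group` helper in the benchmark script; the query list and parameter defaults below are placeholders, not the benchmark's exact values.

```python
# Sketch of per-query latency measurement for a given recompute mode:
# one warm-up search (so model/index loading is excluded), then timed searches.
import statistics
import time

from leann import LeannSearcher


def median_search_latency(
    index_path: str,
    recompute: bool,
    queries: list[str],
    top_k: int = 10,
    complexity: int = 32,
) -> float:
    searcher = LeannSearcher(index_path=index_path)
    # Warm-up once; not counted in the timings
    searcher.search(
        queries[0],
        top_k=top_k,
        complexity=complexity,
        recompute_embeddings=recompute,
    )
    samples = []
    for q in queries:
        t0 = time.time()
        searcher.search(
            q,
            top_k=top_k,
            complexity=complexity,
            recompute_embeddings=recompute,
        )
        samples.append(time.time() - t0)
    return statistics.median(samples)
```

Running this once with `recompute=True` and once with `recompute=False` against the same index reproduces the kind of comparison summarized in the table above.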