docs: diskann recompute
This commit is contained in:
@@ -1,35 +1,31 @@
|
|||||||
import argparse
|
import argparse
|
||||||
import os
|
import os
|
||||||
import socket
|
|
||||||
import time
|
import time
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from leann import LeannBuilder, LeannSearcher
|
from leann import LeannBuilder, LeannSearcher
|
||||||
|
|
||||||
|
|
||||||
def _free_port() -> int:
|
|
||||||
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
|
|
||||||
sock.bind(("127.0.0.1", 0))
|
|
||||||
return sock.getsockname()[1]
|
|
||||||
|
|
||||||
|
|
||||||
def _meta_exists(index_path: str) -> bool:
|
def _meta_exists(index_path: str) -> bool:
|
||||||
p = Path(index_path)
|
p = Path(index_path)
|
||||||
return (p.parent / f"{p.stem}.meta.json").exists()
|
return (p.parent / f"{p.stem}.meta.json").exists()
|
||||||
|
|
||||||
|
|
||||||
def ensure_index_hnsw(index_path: str, num_docs: int, is_recompute: bool) -> None:
|
def ensure_index(index_path: str, backend_name: str, num_docs: int, is_recompute: bool) -> None:
|
||||||
if _meta_exists(index_path):
|
# if _meta_exists(index_path):
|
||||||
return
|
# return
|
||||||
|
kwargs = {}
|
||||||
|
if backend_name == "hnsw":
|
||||||
|
kwargs["is_compact"] = is_recompute
|
||||||
builder = LeannBuilder(
|
builder = LeannBuilder(
|
||||||
backend_name="hnsw",
|
backend_name=backend_name,
|
||||||
embedding_model=os.getenv("LEANN_EMBED_MODEL", "facebook/contriever"),
|
embedding_model=os.getenv("LEANN_EMBED_MODEL", "facebook/contriever"),
|
||||||
embedding_mode=os.getenv("LEANN_EMBED_MODE", "sentence-transformers"),
|
embedding_mode=os.getenv("LEANN_EMBED_MODE", "sentence-transformers"),
|
||||||
graph_degree=32,
|
graph_degree=32,
|
||||||
complexity=64,
|
complexity=64,
|
||||||
is_compact=is_recompute, # HNSW: compact only when recompute
|
|
||||||
is_recompute=is_recompute,
|
is_recompute=is_recompute,
|
||||||
num_threads=4,
|
num_threads=4,
|
||||||
|
**kwargs,
|
||||||
)
|
)
|
||||||
for i in range(num_docs):
|
for i in range(num_docs):
|
||||||
builder.add_text(
|
builder.add_text(
|
||||||
@@ -38,24 +34,6 @@ def ensure_index_hnsw(index_path: str, num_docs: int, is_recompute: bool) -> Non
|
|||||||
builder.build_index(index_path)
|
builder.build_index(index_path)
|
||||||
|
|
||||||
|
|
||||||
def ensure_index_diskann(index_path: str, num_docs: int, is_recompute: bool) -> None:
|
|
||||||
if _meta_exists(index_path):
|
|
||||||
return
|
|
||||||
builder = LeannBuilder(
|
|
||||||
backend_name="diskann",
|
|
||||||
embedding_model=os.getenv("LEANN_EMBED_MODEL", "facebook/contriever"),
|
|
||||||
embedding_mode=os.getenv("LEANN_EMBED_MODE", "sentence-transformers"),
|
|
||||||
graph_degree=32,
|
|
||||||
complexity=64,
|
|
||||||
is_recompute=is_recompute,
|
|
||||||
num_threads=4,
|
|
||||||
)
|
|
||||||
for i in range(num_docs):
|
|
||||||
label = "R" if is_recompute else "NR"
|
|
||||||
builder.add_text(f"DiskANN {label} test doc {i} for quick benchmark.")
|
|
||||||
builder.build_index(index_path)
|
|
||||||
|
|
||||||
|
|
||||||
def _bench_group(
|
def _bench_group(
|
||||||
index_path: str,
|
index_path: str,
|
||||||
recompute: bool,
|
recompute: bool,
|
||||||
@@ -66,7 +44,6 @@ def _bench_group(
|
|||||||
) -> float:
|
) -> float:
|
||||||
# Independent searcher per group; fixed port when recompute
|
# Independent searcher per group
|
||||||
searcher = LeannSearcher(index_path=index_path)
|
searcher = LeannSearcher(index_path=index_path)
|
||||||
port = _free_port() if recompute else 0
|
|
||||||
|
|
||||||
# Warm-up once
|
# Warm-up once
|
||||||
_ = searcher.search(
|
_ = searcher.search(
|
||||||
@@ -74,7 +51,6 @@ def _bench_group(
|
|||||||
top_k=top_k,
|
top_k=top_k,
|
||||||
complexity=complexity,
|
complexity=complexity,
|
||||||
recompute_embeddings=recompute,
|
recompute_embeddings=recompute,
|
||||||
expected_zmq_port=port,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
def _once() -> float:
|
def _once() -> float:
|
||||||
@@ -84,7 +60,6 @@ def _bench_group(
|
|||||||
top_k=top_k,
|
top_k=top_k,
|
||||||
complexity=complexity,
|
complexity=complexity,
|
||||||
recompute_embeddings=recompute,
|
recompute_embeddings=recompute,
|
||||||
expected_zmq_port=port,
|
|
||||||
)
|
)
|
||||||
return time.time() - t0
|
return time.time() - t0
|
||||||
|
|
||||||
@@ -111,14 +86,14 @@ def main():
|
|||||||
# ---------- Build HNSW variants ----------
|
# ---------- Build HNSW variants ----------
|
||||||
hnsw_r = str(base / f"hnsw_recompute_n{args.num_docs}.leann")
|
hnsw_r = str(base / f"hnsw_recompute_n{args.num_docs}.leann")
|
||||||
hnsw_nr = str(base / f"hnsw_norecompute_n{args.num_docs}.leann")
|
hnsw_nr = str(base / f"hnsw_norecompute_n{args.num_docs}.leann")
|
||||||
ensure_index_hnsw(hnsw_r, num_docs=args.num_docs, is_recompute=True)
|
ensure_index(hnsw_r, "hnsw", args.num_docs, True)
|
||||||
ensure_index_hnsw(hnsw_nr, num_docs=args.num_docs, is_recompute=False)
|
ensure_index(hnsw_nr, "hnsw", args.num_docs, False)
|
||||||
|
|
||||||
# ---------- Build DiskANN variants ----------
|
# ---------- Build DiskANN variants ----------
|
||||||
diskann_r = str(base / "diskann_r.leann")
|
diskann_r = str(base / "diskann_r.leann")
|
||||||
diskann_nr = str(base / "diskann_nr.leann")
|
diskann_nr = str(base / "diskann_nr.leann")
|
||||||
ensure_index_diskann(diskann_r, num_docs=args.num_docs, is_recompute=True)
|
ensure_index(diskann_r, "diskann", args.num_docs, True)
|
||||||
ensure_index_diskann(diskann_nr, num_docs=args.num_docs, is_recompute=False)
|
ensure_index(diskann_nr, "diskann", args.num_docs, False)
|
||||||
|
|
||||||
# ---------- Helpers ----------
|
# ---------- Helpers ----------
|
||||||
def _size_for(prefix: str) -> int:
|
def _size_for(prefix: str) -> int:
|
||||||
|
|||||||
@@ -97,26 +97,20 @@ ollama pull nomic-embed-text
|
|||||||
```
|
```
|
||||||
|
|
||||||
### DiskANN
|
### DiskANN
|
||||||
**Best for**: Performance-critical applications and large datasets - **Production-ready with automatic graph partitioning**
|
**Best for**: Large datasets, especially when you want `recompute=True`.
|
||||||
|
|
||||||
**How it works:**
|
**Key advantages:**
|
||||||
- **Product Quantization (PQ) + Real-time Reranking**: Uses compressed PQ codes for fast graph traversal, then recomputes exact embeddings for final candidates
|
- **Faster search** on large datasets (3x+ speedup vs HNSW in many cases)
|
||||||
- **Automatic Graph Partitioning**: When `is_recompute=True`, automatically partitions large indices and safely removes redundant files to save storage
|
- **Smart storage**: `recompute=True` enables automatic graph partitioning for smaller indexes
|
||||||
- **Superior Speed-Accuracy Trade-off**: Faster search than HNSW while maintaining high accuracy
|
- **Better scaling**: Designed for 100k+ documents
|
||||||
|
|
||||||
**Trade-offs compared to HNSW:**
|
**Recompute behavior:**
|
||||||
- ✅ **Faster search latency** (typically 2-8x speedup)
|
- `recompute=True` (recommended): Pure PQ traversal + final reranking — faster traversal, and enables partitioning
|
||||||
- ✅ **Better scaling** for large datasets
|
- `recompute=False`: PQ + partial real distances during traversal - slower but higher accuracy
|
||||||
- ✅ **Smart storage management** with automatic partitioning
|
|
||||||
- ✅ **Better graph locality** with `--ldg-times` parameter for SSD optimization
|
|
||||||
- ⚠️ **Slightly larger index size** due to PQ tables and graph metadata
|
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# Recommended for most use cases
|
# Recommended for most use cases
|
||||||
--backend-name diskann --graph-degree 32 --build-complexity 64
|
--backend-name diskann --graph-degree 32 --build-complexity 64
|
||||||
|
|
||||||
# For large-scale deployments
|
|
||||||
--backend-name diskann --graph-degree 64 --build-complexity 128
|
|
||||||
```
|
```
|
||||||
|
|
||||||
**Performance Benchmark**: Run `python benchmarks/diskann_vs_hnsw_speed_comparison.py` to compare DiskANN and HNSW on your system.
|
**Performance Benchmark**: Run `python benchmarks/diskann_vs_hnsw_speed_comparison.py` to compare DiskANN and HNSW on your system.
|
||||||
@@ -360,30 +354,25 @@ Trade-offs:
|
|||||||
- Significantly higher storage (10–100× vs selective recomputation)
|
- Significantly higher storage (10–100× vs selective recomputation)
|
||||||
- Slightly larger memory footprint during build and search
|
- Slightly larger memory footprint during build and search
|
||||||
|
|
||||||
Real-world quick benchmark (`benchmarks/benchmark_no_recompute.py`, 5k texts):
|
Quick benchmark results (`benchmarks/benchmark_no_recompute.py` with 5k texts, complexity=32):
|
||||||
|
|
||||||
- HNSW
|
- HNSW
|
||||||
|
|
||||||
```text
|
```text
|
||||||
recompute=True: ~7.55s; size ~1.1MB
|
recompute=True: search_time=0.818s, size=1.1MB
|
||||||
recompute=False: ~0.11s; size ~16.6MB
|
recompute=False: search_time=0.012s, size=16.6MB
|
||||||
```
|
```
|
||||||
|
|
||||||
- DiskANN
|
- DiskANN
|
||||||
|
|
||||||
```text
|
```text
|
||||||
Build sizes (5k):
|
recompute=True: search_time=0.041s, size=5.9MB
|
||||||
- recompute=True (partition): ~5.7MB
|
recompute=False: search_time=0.013s, size=24.6MB
|
||||||
- recompute=False: ~24.8MB
|
|
||||||
Search latency (on recompute-build, median of 5 runs; macOS, complexity=32):
|
|
||||||
- recompute=False (PQ traversal only): ~0.013–0.014s
|
|
||||||
- recompute=True (final rerank): ~0.033–0.046s
|
|
||||||
On 20k texts (same settings):
|
|
||||||
- recompute=False: ~0.013–0.014s
|
|
||||||
- recompute=True: ~0.033–0.036s
|
|
||||||
```
|
```
|
||||||
|
|
||||||
Conclusion: for HNSW, no-recompute is faster but larger; for DiskANN, no-recompute (PQ traversal only) is fastest at the cost of potentially lower accuracy, while recompute (final rerank) adds ~20–30ms for higher accuracy. DiskANN recompute-build also enables partitioning, reducing storage.
|
Conclusion:
|
||||||
|
- **HNSW**: `no-recompute` (`recompute=False`) is significantly faster because no embeddings are recomputed at search time, but it requires much more storage because all embeddings are stored
|
||||||
|
- **DiskANN**: `no-recompute` (`recompute=False`) uses PQ plus partial real distances during traversal (slower but higher accuracy), while `recompute=True` uses pure PQ traversal plus final reranking (faster traversal, and enables build-time partitioning for smaller storage)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user