From e3518a31ed5001d273375242114bd3ebae600b34 Mon Sep 17 00:00:00 2001
From: Andy Lee
Date: Thu, 14 Aug 2025 14:25:50 -0700
Subject: [PATCH] docs: diskann recompute

---
 benchmarks/benchmark_no_recompute.py | 49 +++++++---------------
 docs/configuration-guide.md          | 43 +++++++++---------------
 2 files changed, 28 insertions(+), 64 deletions(-)

diff --git a/benchmarks/benchmark_no_recompute.py b/benchmarks/benchmark_no_recompute.py
index aaeb1e8..1c402c0 100644
--- a/benchmarks/benchmark_no_recompute.py
+++ b/benchmarks/benchmark_no_recompute.py
@@ -1,35 +1,31 @@
 import argparse
 import os
-import socket
 import time
 from pathlib import Path
 
 from leann import LeannBuilder, LeannSearcher
 
 
-def _free_port() -> int:
-    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
-        sock.bind(("127.0.0.1", 0))
-        return sock.getsockname()[1]
-
-
 def _meta_exists(index_path: str) -> bool:
     p = Path(index_path)
     return (p.parent / f"{p.stem}.meta.json").exists()
 
 
-def ensure_index_hnsw(index_path: str, num_docs: int, is_recompute: bool) -> None:
-    if _meta_exists(index_path):
-        return
+def ensure_index(index_path: str, backend_name: str, num_docs: int, is_recompute: bool) -> None:
+    if _meta_exists(index_path):
+        return
+    kwargs = {}
+    if backend_name == "hnsw":
+        kwargs["is_compact"] = is_recompute  # HNSW: compact storage only when recompute
     builder = LeannBuilder(
-        backend_name="hnsw",
+        backend_name=backend_name,
         embedding_model=os.getenv("LEANN_EMBED_MODEL", "facebook/contriever"),
         embedding_mode=os.getenv("LEANN_EMBED_MODE", "sentence-transformers"),
         graph_degree=32,
         complexity=64,
-        is_compact=is_recompute,  # HNSW: compact only when recompute
         is_recompute=is_recompute,
         num_threads=4,
+        **kwargs,
     )
     for i in range(num_docs):
         builder.add_text(
@@ -38,24 +34,6 @@ def ensure_index_hnsw(index_path: str, num_docs: int, is_recompute: bool) -> None:
     builder.build_index(index_path)
 
 
-def ensure_index_diskann(index_path: str, num_docs: int, is_recompute: bool) -> None:
-    if _meta_exists(index_path):
-        return
-    builder = LeannBuilder(
-        backend_name="diskann",
-        embedding_model=os.getenv("LEANN_EMBED_MODEL", "facebook/contriever"),
-        embedding_mode=os.getenv("LEANN_EMBED_MODE", "sentence-transformers"),
-        graph_degree=32,
-        complexity=64,
-        is_recompute=is_recompute,
-        num_threads=4,
-    )
-    for i in range(num_docs):
-        label = "R" if is_recompute else "NR"
-        builder.add_text(f"DiskANN {label} test doc {i} for quick benchmark.")
-    builder.build_index(index_path)
-
-
 def _bench_group(
     index_path: str,
     recompute: bool,
@@ -66,7 +44,6 @@ def _bench_group(
 ) -> float:
-    # Independent searcher per group; fixed port when recompute
+    # Independent searcher per group
     searcher = LeannSearcher(index_path=index_path)
-    port = _free_port() if recompute else 0
 
     # Warm-up once
     _ = searcher.search(
@@ -74,7 +51,6 @@
         top_k=top_k,
         complexity=complexity,
         recompute_embeddings=recompute,
-        expected_zmq_port=port,
     )
 
     def _once() -> float:
@@ -84,7 +60,6 @@ def _bench_group(
             top_k=top_k,
             complexity=complexity,
             recompute_embeddings=recompute,
-            expected_zmq_port=port,
         )
         return time.time() - t0
 
@@ -111,14 +86,14 @@ def main():
     # ---------- Build HNSW variants ----------
     hnsw_r = str(base / f"hnsw_recompute_n{args.num_docs}.leann")
     hnsw_nr = str(base / f"hnsw_norecompute_n{args.num_docs}.leann")
-    ensure_index_hnsw(hnsw_r, num_docs=args.num_docs, is_recompute=True)
-    ensure_index_hnsw(hnsw_nr, num_docs=args.num_docs, is_recompute=False)
+    ensure_index(hnsw_r, "hnsw", num_docs=args.num_docs, is_recompute=True)
+    ensure_index(hnsw_nr, "hnsw", num_docs=args.num_docs, is_recompute=False)
 
     # ---------- Build DiskANN variants ----------
     diskann_r = str(base / "diskann_r.leann")
     diskann_nr = str(base / "diskann_nr.leann")
-    ensure_index_diskann(diskann_r, num_docs=args.num_docs, is_recompute=True)
-    ensure_index_diskann(diskann_nr, num_docs=args.num_docs, is_recompute=False)
+    ensure_index(diskann_r, "diskann", num_docs=args.num_docs, is_recompute=True)
+    ensure_index(diskann_nr, "diskann", num_docs=args.num_docs, is_recompute=False)
 
     # ---------- Helpers ----------
     def _size_for(prefix: str) -> int:
diff --git a/docs/configuration-guide.md b/docs/configuration-guide.md
index f03935c..036c1c2 100644
--- a/docs/configuration-guide.md
+++ b/docs/configuration-guide.md
@@ -97,26 +97,20 @@ ollama pull nomic-embed-text
 ```
 
 ### DiskANN
-**Best for**: Performance-critical applications and large datasets - **Production-ready with automatic graph partitioning**
+**Best for**: Large datasets, especially when you want `recompute=True`.
 
-**How it works:**
-- **Product Quantization (PQ) + Real-time Reranking**: Uses compressed PQ codes for fast graph traversal, then recomputes exact embeddings for final candidates
-- **Automatic Graph Partitioning**: When `is_recompute=True`, automatically partitions large indices and safely removes redundant files to save storage
-- **Superior Speed-Accuracy Trade-off**: Faster search than HNSW while maintaining high accuracy
+**Key advantages:**
+- **Faster search** on large datasets (3x+ speedup vs HNSW in many cases)
+- **Smart storage**: `recompute=True` enables automatic graph partitioning for smaller indexes
+- **Better scaling**: Designed for 100k+ documents
 
-**Trade-offs compared to HNSW:**
-- ✅ **Faster search latency** (typically 2-8x speedup)
-- ✅ **Better scaling** for large datasets
-- ✅ **Smart storage management** with automatic partitioning
-- ✅ **Better graph locality** with `--ldg-times` parameter for SSD optimization
-- ⚠️ **Slightly larger index size** due to PQ tables and graph metadata
+**Recompute behavior:**
+- `recompute=True` (recommended): pure PQ traversal plus a final exact reranking step; traversal stays fast and build-time partitioning keeps the index small
+- `recompute=False`: PQ plus partial real distances during traversal; traversal is slower but more accurate, at the cost of a larger on-disk index
 
 ```bash
 # Recommended for most use cases
 --backend-name diskann --graph-degree 32 --build-complexity 64
-
-# For large-scale deployments
---backend-name diskann --graph-degree 64 --build-complexity 128
 ```
 
 **Performance Benchmark**: Run `python benchmarks/diskann_vs_hnsw_speed_comparison.py` to compare DiskANN and HNSW on your system.
@@ -360,30 +354,25 @@ Trade-offs:
 - Significantly higher storage (10–100× vs selective recomputation)
 - Slightly larger memory footprint during build and search
 
-Real-world quick benchmark (`benchmarks/benchmark_no_recompute.py`, 5k texts):
+Quick benchmark results (`benchmarks/benchmark_no_recompute.py` with 5k texts, complexity=32):
 
 - HNSW
   ```text
-  recompute=True: ~7.55s; size ~1.1MB
-  recompute=False: ~0.11s; size ~16.6MB
+  recompute=True: search_time=0.818s, size=1.1MB
+  recompute=False: search_time=0.012s, size=16.6MB
   ```
 - DiskANN
   ```text
-  Build sizes (5k):
-  - recompute=True (partition): ~5.7MB
-  - recompute=False: ~24.8MB
-  Search latency (on recompute-build, median of 5 runs; macOS, complexity=32):
-  - recompute=False (PQ traversal only): ~0.013–0.014s
-  - recompute=True (final rerank): ~0.033–0.046s
-  On 20k texts (same settings):
-  - recompute=False: ~0.013–0.014s
-  - recompute=True: ~0.033–0.036s
+  recompute=True: search_time=0.041s, size=5.9MB
+  recompute=False: search_time=0.013s, size=24.6MB
   ```
 
-Conclusion: for HNSW, no-recompute is faster but larger; for DiskANN, no-recompute (PQ traversal only) is fastest at the cost of potentially lower accuracy, while recompute (final rerank) adds ~20–30ms for higher accuracy. DiskANN recompute-build also enables partitioning, reducing storage.
+Conclusion:
+- **HNSW**: `no-recompute` is significantly faster because no embeddings are recomputed at query time, but it requires much more storage because all embeddings are stored in the index
+- **DiskANN**: `no-recompute` uses PQ plus partial real distances during traversal (slower traversal, higher accuracy, larger index), while `recompute=True` uses pure PQ traversal plus a final exact reranking step (faster traversal, and build-time partitioning yields much smaller storage)
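
A minimal end-to-end sketch of the two DiskANN recompute modes described above, using only the `LeannBuilder`/`LeannSearcher` calls exercised by `benchmarks/benchmark_no_recompute.py`; the output directory, document texts, query string, and `top_k` value are illustrative placeholders, not part of the patch:

```python
import time
from pathlib import Path

from leann import LeannBuilder, LeannSearcher

BASE = Path("./diskann_demo_indexes")  # illustrative output directory
BASE.mkdir(parents=True, exist_ok=True)


def build_diskann(index_path: str, is_recompute: bool) -> None:
    # Builder parameters mirror the benchmark; is_recompute toggles between
    # pure PQ traversal + final rerank (True) and PQ + partial real distances (False).
    builder = LeannBuilder(
        backend_name="diskann",
        embedding_model="facebook/contriever",
        embedding_mode="sentence-transformers",
        graph_degree=32,
        complexity=64,
        is_recompute=is_recompute,
        num_threads=4,
    )
    for i in range(1000):
        builder.add_text(f"Demo document {i} about DiskANN recompute modes.")
    builder.build_index(index_path)


for recompute in (True, False):
    index_path = str(BASE / f"diskann_{'r' if recompute else 'nr'}.leann")
    build_diskann(index_path, is_recompute=recompute)

    searcher = LeannSearcher(index_path=index_path)
    t0 = time.time()
    _ = searcher.search(
        "quick latency test",
        top_k=10,
        complexity=32,
        recompute_embeddings=recompute,  # mirror the build-time setting
    )
    print(f"recompute={recompute}: search took {time.time() - t0:.3f}s")
```

Absolute sizes and latencies will differ from the table above depending on hardware, embedding model, and document count; the relative trade-off (smaller index with `recompute=True`, lower end-to-end latency with `recompute=False`) is what the benchmark measures.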