"""Quick benchmark: LEANN search latency and on-disk index size, with vs. without recompute."""
import os
import time
from pathlib import Path

from leann import LeannBuilder, LeannSearcher
def ensure_index(
    index_path: str, num_docs: int = 5000, is_recompute: bool = True, is_compact: bool = True
):
    """Build a small HNSW test index at *index_path* unless one already exists.

    An existing build is detected via the companion ``<stem>.meta.json`` file,
    so a previously built index is reused even when the requested parameters
    differ. Embedding model/mode are overridable through the
    ``LEANN_EMBED_MODEL`` / ``LEANN_EMBED_MODE`` environment variables.
    """
    target = Path(index_path)
    meta_file = target.parent / f"{target.stem}.meta.json"
    if meta_file.exists():
        # Already built — reuse it rather than paying the build cost again.
        return

    index_builder = LeannBuilder(
        backend_name="hnsw",
        embedding_model=os.getenv("LEANN_EMBED_MODEL", "facebook/contriever"),
        embedding_mode=os.getenv("LEANN_EMBED_MODE", "sentence-transformers"),
        graph_degree=32,
        complexity=64,
        is_compact=is_compact,
        is_recompute=is_recompute,
        num_threads=4,
    )

    # Synthetic corpus: identical-shaped documents are fine for a speed/size benchmark.
    for doc_id in range(num_docs):
        index_builder.add_text(
            f"This is a test document number {doc_id}. It contains some repeated text for benchmarking."
        )

    index_builder.build_index(index_path)
def bench_once(index_path: str, recompute: bool, top_k: int = 10) -> float:
    """Run one timed search against the index at *index_path* and return the
    elapsed wall-clock time in seconds.

    Parameters:
        index_path: path prefix of a previously built LEANN index.
        recompute: whether the searcher recomputes embeddings at query time.
        top_k: number of results to request.

    Searcher construction is deliberately outside the timed region so only
    the search itself is measured. Uses ``time.perf_counter`` instead of
    ``time.time``: perf_counter is monotonic and has the highest available
    resolution, whereas time.time can jump with system clock adjustments,
    which would corrupt a sub-second benchmark.
    """
    searcher = LeannSearcher(index_path=index_path)
    start = time.perf_counter()
    _ = searcher.search(
        "test document number 42",
        top_k=top_k,
        complexity=64,
        prune_ratio=0.0,
        recompute_embeddings=recompute,
    )
    return time.perf_counter() - start
def main():
    """Benchmark LEANN search latency and on-disk size across build variants.

    Builds two HNSW indexes — pruned/compact (recompute) and non-compact
    (no-recompute) — times a warm search on each, and reports the trade-off.
    Then, best-effort, repeats the comparison for the DiskANN backend
    (skipped with a message if that backend is unavailable). Results go to
    stdout.
    """
    base = Path.cwd() / ".leann" / "indexes" / "bench"
    # Create the benchmark directory itself. (Previously this was
    # `base.parent.mkdir(...)`, which created `.leann/indexes` but never
    # `bench/`, so index files had no directory to land in.)
    base.mkdir(parents=True, exist_ok=True)

    # Compute sizes only for files belonging to each index prefix.
    # Defined once and shared by both the HNSW and DiskANN sections
    # (the original duplicated this helper verbatim).
    def _size_for(prefix: str) -> int:
        """Total bytes of all sibling files whose name starts with *prefix*'s stem."""
        p = Path(prefix)
        stem = p.stem  # e.g. 'recompute' for 'recompute.leann'
        total = 0
        for f in p.parent.iterdir():
            if f.is_file() and f.name.startswith(stem):
                total += f.stat().st_size
        return total

    index_path_recompute = str(base / "recompute.leann")
    index_path_norecompute = str(base / "norecompute.leann")

    # Build two variants: pruned (recompute) and non-compact (no-recompute)
    ensure_index(index_path_recompute, is_recompute=True, is_compact=True)
    ensure_index(index_path_norecompute, is_recompute=False, is_compact=False)

    # Warm up so one-time costs (model load, caches) don't pollute the timed runs.
    bench_once(index_path_recompute, recompute=True)
    bench_once(index_path_norecompute, recompute=False)

    t_recompute = bench_once(index_path_recompute, recompute=True)
    t_norecompute = bench_once(index_path_norecompute, recompute=False)

    size_recompute = _size_for(index_path_recompute)
    size_norecompute = _size_for(index_path_norecompute)

    print("Benchmark results (HNSW):")
    print(
        f" recompute=True: search_time={t_recompute:.3f}s, size={size_recompute / 1024 / 1024:.1f}MB"
    )
    print(
        f" recompute=False: search_time={t_norecompute:.3f}s, size={size_norecompute / 1024 / 1024:.1f}MB"
    )
    print("Expectation: no-recompute should be faster but larger on disk.")

    # DiskANN quick benchmark (final rerank vs no-recompute)
    try:
        index_path_diskann_nr = str(base / "diskann_nr.leann")
        index_path_diskann_r = str(base / "diskann_r.leann")

        def _build_diskann(path: str, is_recompute: bool, doc_prefix: str) -> None:
            """Build a 5000-doc DiskANN index at *path* unless its meta file exists."""
            p = Path(path)
            if (p.parent / f"{p.stem}.meta.json").exists():
                return
            b = LeannBuilder(
                backend_name="diskann",
                embedding_model=os.getenv("LEANN_EMBED_MODEL", "facebook/contriever"),
                embedding_mode=os.getenv("LEANN_EMBED_MODE", "sentence-transformers"),
                graph_degree=32,
                complexity=64,
                num_threads=4,
                is_recompute=is_recompute,
            )
            for i in range(5000):
                b.add_text(f"{doc_prefix} {i} for quick benchmark.")
            b.build_index(path)

        # Build DiskANN no-recompute (keeps full disk index)
        _build_diskann(index_path_diskann_nr, False, "DiskANN NR test doc")
        # Build DiskANN recompute (enables partition; prunes redundant files)
        _build_diskann(index_path_diskann_r, True, "DiskANN R test doc")

        # Measure size per build prefix (shared helper defined above).
        size_diskann_nr = _size_for(index_path_diskann_nr)
        size_diskann_r = _size_for(index_path_diskann_r)

        # Speed on recompute-build (final rerank vs no-recompute).
        s = LeannSearcher(index_path_diskann_r)
        # Warm-up for both search modes before timing.
        _ = s.search("DiskANN R test doc 123", top_k=10, complexity=64, recompute_embeddings=False)
        _ = s.search("DiskANN R test doc 123", top_k=10, complexity=64, recompute_embeddings=True)

        t0 = time.perf_counter()
        _ = s.search("DiskANN R test doc 123", top_k=10, complexity=64, recompute_embeddings=False)
        t_diskann_nr = time.perf_counter() - t0

        t0 = time.perf_counter()
        _ = s.search("DiskANN R test doc 123", top_k=10, complexity=64, recompute_embeddings=True)
        t_diskann_r = time.perf_counter() - t0

        print("\nBenchmark results (DiskANN):")
        print(f" build(recompute=False): size={size_diskann_nr / 1024 / 1024:.1f}MB")
        print(f" build(recompute=True, partition): size={size_diskann_r / 1024 / 1024:.1f}MB")
        print(f" search recompute=False: {t_diskann_nr:.3f}s (on recompute-build)")
        print(f" search recompute=True (final rerank): {t_diskann_r:.3f}s (on recompute-build)")
    except Exception as e:
        # Best-effort section: the DiskANN backend may not be installed;
        # report and continue rather than failing the whole benchmark.
        print(f"DiskANN quick benchmark skipped due to: {e}")
# Script entry point: run the benchmarks only when executed directly.
if __name__ == "__main__":
    main()