From ef4c69d1283980c08061e53246efc90558ac993f Mon Sep 17 00:00:00 2001 From: Andy Lee Date: Mon, 25 Aug 2025 16:08:16 -0700 Subject: [PATCH] chore(ci): remove paru-bin submodule and config to fix checkout --recurse-submodules --- .gitmodules | 2 ++ benchmarks/bm25_diskann_baselines/README.md | 4 ++-- .../bm25_diskann_baselines/run_diskann.py | 23 +++++-------------- paru-bin | 1 - 4 files changed, 10 insertions(+), 20 deletions(-) delete mode 160000 paru-bin diff --git a/.gitmodules b/.gitmodules index c1cd540..813256e 100644 --- a/.gitmodules +++ b/.gitmodules @@ -14,3 +14,5 @@ [submodule "packages/leann-backend-hnsw/third_party/libzmq"] path = packages/leann-backend-hnsw/third_party/libzmq url = https://github.com/zeromq/libzmq.git + +# Ensure CI can update this submodule; used only for Arch packaging and not required for builds. diff --git a/benchmarks/bm25_diskann_baselines/README.md b/benchmarks/bm25_diskann_baselines/README.md index adf4736..297a067 100644 --- a/benchmarks/bm25_diskann_baselines/README.md +++ b/benchmarks/bm25_diskann_baselines/README.md @@ -9,12 +9,12 @@ aws s3 sync s3://powerrag-diskann-rpj-wiki-20250824-224037-194d640c/diskann_rpj_ - Machine-specific; results measured locally with the current repo. DiskANN (NQ queries, search-only) -- Command: `uv run benchmarks/bm25_diskann_baselines/run_diskann.py` +- Command: `uv run --script benchmarks/bm25_diskann_baselines/run_diskann.py` - Settings: `recompute_embeddings=False`, embeddings precomputed (excluded from timing), batching off, caching off (`cache_mechanism=2`, `num_nodes_to_cache=0`) - Result: avg 0.019339 s/query, QPS 51.71 (p50 ~0.018936 s, p95 ~0.023573 s) BM25 -- Command: `uv run --script ./benchmarks/run_bm25.py` +- Command: `uv run --script benchmarks/bm25_diskann_baselines/run_bm25.py` - Settings: `k=10`, `k1=0.9`, `b=0.4`, queries=100 - Result: avg 0.026976 s/query, QPS 37.07 (p50 0.024729 s, p90 0.042158 s, p95 0.047099 s, p99 0.053520 s) diff --git a/benchmarks/bm25_diskann_baselines/run_diskann.py b/benchmarks/bm25_diskann_baselines/run_diskann.py index 2173af3..a4dd558 100644 --- a/benchmarks/bm25_diskann_baselines/run_diskann.py +++ b/benchmarks/bm25_diskann_baselines/run_diskann.py @@ -1,19 +1,8 @@ -#!/usr/bin/env python3 -""" -Run DiskANN with real NQ queries (search-only timing). - -Steps: -- Load queries from nq_open.jsonl -- Compute embeddings (facebook/contriever-msmarco) once upfront -- Search via DiskANN (no recompute, no batching), measure per-query latency - -Example: - python benchmarks/bm25_diskann_baselines/run_diskann_nq.py \ - --index-dir benchmarks/data/indices/diskann_rpj_wiki \ - --index-prefix ann \ - --queries-file benchmarks/data/queries/nq_open.jsonl \ - --num-queries 200 --top-k 10 --complexity 120 --threads 1 --beam-width 1 -""" +# /// script +# dependencies = [ +# "leann-backend-diskann" +# ] +# /// import argparse import json @@ -47,7 +36,7 @@ def main() -> None: ap.add_argument("--queries-file", default="benchmarks/data/queries/nq_open.jsonl") ap.add_argument("--num-queries", type=int, default=200) ap.add_argument("--top-k", type=int, default=10) - ap.add_argument("--complexity", type=int, default=120) + ap.add_argument("--complexity", type=int, default=62) ap.add_argument("--threads", type=int, default=1) ap.add_argument("--beam-width", type=int, default=1) ap.add_argument("--cache-mechanism", type=int, default=2) diff --git a/paru-bin b/paru-bin deleted file mode 160000 index 92a5542..0000000 --- a/paru-bin +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 92a55429afbec4fceeb2cef843245105307444d2