Initial commit

yichuan520030910320
2025-06-30 09:05:05 +00:00
commit 46f6cc100b
1231 changed files with 278432 additions and 0 deletions

View File

@@ -0,0 +1,20 @@
# Benchmark of IVF variants
This is a benchmark of IVF index variants, comparing the trade-offs between compression, speed and accuracy.
The results are reported in [this wiki chapter](https://github.com/facebookresearch/faiss/wiki/Indexing-1G-vectors).
The code is organized as follows:
- `datasets.py`: code to access the datafiles, compute the ground-truth and report accuracies
- `bench_all_ivf.py`: evaluate one type of inverted file
- `run_on_cluster_generic.bash`: call `bench_all_ivf.py` for all tested types of indices.
Since the number of experiments is quite large, the script is structured so that the benchmark can be run on a cluster.
- `parse_bench_all_ivf.py`: make nice tradeoff plots from all the results.
The code depends on Faiss and can use 1 to 8 GPUs to do the k-means clustering for large vocabularies.
It was run in October 2018 for the results in the wiki.
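
A single experiment can also be launched directly. The invocation below is an illustrative sketch based on the options defined in `bench_all_ivf.py` (the dataset and index key are examples, not prescriptions); `run_on_cluster_generic.bash` wraps calls of this form in Slurm jobs:

```bash
# evaluate one IVF variant on deep1M with 32 search threads
python -u bench_all_ivf.py \
    --db deep1M \
    --indexkey "OPQ16_64,IVF1024,PQ16x4fs" \
    --maxtrain 0 \
    --searchthreads 32 \
    --min_test_duration 3 \
    --indexfile deep1M_example.faissindex
```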

View File

@@ -0,0 +1,567 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import argparse
import os
import sys
import time
import json
import faiss
import numpy as np
try:
import datasets_fb as datasets
except ModuleNotFoundError:
import datasets_oss as datasets
sanitize = datasets.sanitize
def unwind_index_ivf(index):
if isinstance(index, faiss.IndexPreTransform):
assert index.chain.size() == 1
vt = index.chain.at(0)
index_ivf, vt2 = unwind_index_ivf(faiss.downcast_index(index.index))
assert vt2 is None
if vt is None:
vt = lambda x: x
else:
vt = faiss.downcast_VectorTransform(vt)
return index_ivf, vt
if hasattr(faiss, "IndexRefine") and isinstance(index, faiss.IndexRefine):
return unwind_index_ivf(faiss.downcast_index(index.base_index))
if isinstance(index, faiss.IndexIVF):
return index, None
else:
return None, None
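# Usage sketch (not executed here): for an index built with
# faiss.index_factory(d, "OPQ16_64,IVF1024,PQ16x4fs"), unwind_index_ivf returns
# the inner IndexIVF together with the (downcast) OPQ transform; for an
# IndexRefine it recurses into the base index; for a non-IVF index such as
# "HNSW32" it returns (None, None).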
def apply_AQ_options(index, args):
# if not(
# isinstance(index, faiss.IndexAdditiveQuantize) or
# isinstance(index, faiss.IndexIVFAdditiveQuantizer)):
# return
if args.RQ_train_default:
print("set default training for RQ")
index.rq.train_type  # check that the attribute exists
index.rq.train_type = faiss.ResidualQuantizer.Train_default
if args.RQ_beam_size != -1:
print("set RQ beam size to", args.RQ_beam_size)
index.rq.max_beam_size  # check that the attribute exists
index.rq.max_beam_size = args.RQ_beam_size
if args.LSQ_encode_ils_iters != -1:
print("set LSQ ils iterations to", args.LSQ_encode_ils_iters)
index.lsq.encode_ils_iters  # check that the attribute exists
index.lsq.encode_ils_iters = args.LSQ_encode_ils_iters
if args.RQ_use_beam_LUT != -1:
print("set RQ beam LUT to", args.RQ_use_beam_LUT)
index.rq.use_beam_LUT  # check that the attribute exists
index.rq.use_beam_LUT = args.RQ_use_beam_LUT
def eval_setting(index, xq, gt, k, inter, min_time):
""" evaluate searching in terms of precision vs. speed """
nq = xq.shape[0]
ivf_stats = faiss.cvar.indexIVF_stats
ivf_stats.reset()
nrun = 0
t0 = time.time()
while True:
D, I = index.search(xq, k)
nrun += 1
t1 = time.time()
if t1 - t0 > min_time:
break
ms_per_query = ((t1 - t0) * 1000.0 / nq / nrun)
res = {
"ms_per_query": ms_per_query,
"nrun": nrun
}
res["n"] = ms_per_query
if inter:
rank = k
inter_measure = faiss.eval_intersection(gt[:, :rank], I[:, :rank]) / (nq * rank)
print("%.4f" % inter_measure, end=' ')
res["inter_measure"] = inter_measure
else:
res["recalls"] = {}
for rank in 1, 10, 100:
recall = (I[:, :rank] == gt[:, :1]).sum() / float(nq)
print("%.4f" % recall, end=' ')
res["recalls"][rank] = recall
print(" %9.5f " % ms_per_query, end=' ')
print("%12d " % (ivf_stats.ndis / nrun), end=' ')
print(nrun)
res["ndis"] = ivf_stats.ndis / nrun
return res
######################################################
# Training
######################################################
def run_train(args, ds, res):
nq, d = ds.nq, ds.d
nb, d = ds.nb, ds.d
print("build index, key=", args.indexkey)
index = faiss.index_factory(
d, args.indexkey, faiss.METRIC_L2 if ds.metric == "L2" else
faiss.METRIC_INNER_PRODUCT
)
index_ivf, vec_transform = unwind_index_ivf(index)
if args.by_residual != -1:
by_residual = args.by_residual == 1
print("setting by_residual = ", by_residual)
index_ivf.by_residual # check if field exists
index_ivf.by_residual = by_residual
if index_ivf:
print("Update add-time parameters")
# adjust default parameters used at add time for quantizers
# because otherwise the assignment is inaccurate
quantizer = faiss.downcast_index(index_ivf.quantizer)
if isinstance(quantizer, faiss.IndexRefine):
print(" update quantizer k_factor=", quantizer.k_factor, end=" -> ")
quantizer.k_factor = 32 if index_ivf.nlist < 1e6 else 64
print(quantizer.k_factor)
base_index = faiss.downcast_index(quantizer.base_index)
if isinstance(base_index, faiss.IndexIVF):
print(" update quantizer nprobe=", base_index.nprobe, end=" -> ")
base_index.nprobe = (
16 if base_index.nlist < 1e5 else
32 if base_index.nlist < 4e6 else
64)
print(base_index.nprobe)
elif isinstance(quantizer, faiss.IndexHNSW):
hnsw = quantizer.hnsw
print(
f" update HNSW quantizer options, before: "
f"{hnsw.efSearch=:} {hnsw.efConstruction=:}"
)
hnsw.efSearch = 40 if index_ivf.nlist < 4e6 else 64
hnsw.efConstruction = 200
print(f" after: {hnsw.efSearch=:} {hnsw.efConstruction=:}")
apply_AQ_options(index_ivf or index, args)
if index_ivf:
index_ivf.verbose = True
index_ivf.quantizer.verbose = True
index_ivf.cp.verbose = True
else:
index.verbose = True
maxtrain = args.maxtrain
if maxtrain == 0:
if 'IMI' in args.indexkey:
maxtrain = int(256 * 2 ** (np.log2(index_ivf.nlist) / 2))  # i.e. 256 * sqrt(nlist)
elif index_ivf:
maxtrain = 50 * index_ivf.nlist
else:
# just guess...
maxtrain = 256 * 100
maxtrain = max(maxtrain, 256 * 100)
print("setting maxtrain to %d" % maxtrain)
try:
xt2 = ds.get_train(maxtrain=maxtrain)
except NotImplementedError:
print("No training set: training on database")
xt2 = ds.get_database()[:maxtrain]
print("train, size", xt2.shape)
assert np.all(np.isfinite(xt2))
if (isinstance(vec_transform, faiss.OPQMatrix) and
isinstance(index_ivf, faiss.IndexIVFPQFastScan)):
print(" Forcing OPQ training PQ to PQ4")
ref_pq = index_ivf.pq
training_pq = faiss.ProductQuantizer(
ref_pq.d, ref_pq.M, ref_pq.nbits
)
vec_transform.pq  # check that the attribute exists
vec_transform.pq = training_pq
if args.get_centroids_from == '':
if args.clustering_niter >= 0:
print(("setting nb of clustering iterations to %d" %
args.clustering_niter))
index_ivf.cp.niter = args.clustering_niter
if args.train_on_gpu:
print("add a training index on GPU")
train_index = faiss.index_cpu_to_all_gpus(
faiss.IndexFlatL2(index_ivf.d))
index_ivf.clustering_index = train_index
else:
print("Getting centroids from", args.get_centroids_from)
src_index = faiss.read_index(args.get_centroids_from)
src_quant = faiss.downcast_index(src_index.quantizer)
centroids = src_quant.reconstruct_n()
print(" centroid table shape", centroids.shape)
if isinstance(vec_transform, faiss.VectorTransform):
print(" training vector transform")
vec_transform.train(xt2)
print(" transform centroids")
centroids = vec_transform.apply_py(centroids)
if not index_ivf.quantizer.is_trained:
print(" training quantizer")
index_ivf.quantizer.train(centroids)
print(" add centroids to quantizer")
index_ivf.quantizer.add(centroids)
del src_index
t0 = time.time()
index.train(xt2)
res.train_time = time.time() - t0
print(" train in %.3f s" % res.train_time)
return index
######################################################
# Populating index
######################################################
def run_add(args, ds, index, res):
print("adding")
t0 = time.time()
if args.add_bs == -1:
assert args.split == [1, 0], "split not supported with full batch add"
index.add(sanitize(ds.get_database()))
else:
totn = ds.nb // args.split[0] # approximate
i0 = 0
print(f"Adding in block sizes {args.add_bs} with split {args.split}")
for xblock in ds.database_iterator(bs=args.add_bs, split=args.split):
i1 = i0 + len(xblock)
print(" adding %d:%d / %d [%.3f s, RSS %d kiB] " % (
i0, i1, totn, time.time() - t0,
faiss.get_mem_usage_kb()))
index.add(xblock)
i0 = i1
res.t_add = time.time() - t0
print(f" add in {res.t_add:.3f} s index size {index.ntotal}")
######################################################
# Search
######################################################
def run_search(args, ds, index, res):
index_ivf, vec_transform = unwind_index_ivf(index)
if args.no_precomputed_tables:
if isinstance(index_ivf, faiss.IndexIVFPQ):
print("disabling precomputed table")
index_ivf.use_precomputed_table = -1
index_ivf.precomputed_table.clear()
if args.indexfile:
print("index size on disk: ", os.stat(args.indexfile).st_size)
if hasattr(index, "code_size"):
print("vector code_size", index.code_size)
if hasattr(index_ivf, "code_size"):
print("vector code_size (IVF)", index_ivf.code_size)
print("current RSS:", faiss.get_mem_usage_kb() * 1024)
precomputed_table_size = 0
if hasattr(index_ivf, 'precomputed_table'):
precomputed_table_size = index_ivf.precomputed_table.size() * 4
print("precomputed tables size:", precomputed_table_size)
# Index is ready
xq = sanitize(ds.get_queries())
nq, d = xq.shape
gt = ds.get_groundtruth(k=args.k)
if not args.accept_short_gt: # Deep1B has only a single NN per query
assert gt.shape[1] == args.k
if args.searchthreads != -1:
print("Setting nb of threads to", args.searchthreads)
faiss.omp_set_num_threads(args.searchthreads)
else:
print("nb search threads: ", faiss.omp_get_max_threads())
ps = faiss.ParameterSpace()
ps.initialize(index)
parametersets = args.searchparams
if args.inter:
header = (
'%-40s inter@%3d time(ms/q) nb distances #runs' %
("parameters", args.k)
)
else:
header = (
'%-40s R@1 R@10 R@100 time(ms/q) nb distances #runs' %
"parameters"
)
res.search_results = {}
if parametersets == ['autotune']:
ps.n_experiments = args.n_autotune
ps.min_test_duration = args.min_test_duration
for kv in args.autotune_max:
k, vmax = kv.split(':')
vmax = float(vmax)
print("limiting %s to %g" % (k, vmax))
pr = ps.add_range(k)
values = faiss.vector_to_array(pr.values)
values = np.array([v for v in values if v < vmax])
faiss.copy_array_to_vector(values, pr.values)
for kv in args.autotune_range:
k, vals = kv.split(':')
vals = np.fromstring(vals, sep=',')
print("setting %s to %s" % (k, vals))
pr = ps.add_range(k)
faiss.copy_array_to_vector(vals, pr.values)
# setup the Criterion object
if args.inter:
print("Optimize for intersection @ ", args.k)
crit = faiss.IntersectionCriterion(nq, args.k)
else:
print("Optimize for 1-recall @ 1")
crit = faiss.OneRecallAtRCriterion(nq, 1)
# by default, the criterion will request only 1 NN
crit.nnn = args.k
crit.set_groundtruth(None, gt.astype('int64'))
# then we let Faiss find the optimal parameters by itself
print("exploring operating points, %d threads" % faiss.omp_get_max_threads());
ps.display()
t0 = time.time()
op = ps.explore(index, xq, crit)
res.t_explore = time.time() - t0
print("Done in %.3f s, available OPs:" % res.t_explore)
op.display()
print("Re-running evaluation on selected OPs")
print(header)
opv = op.optimal_pts
maxw = max(max(len(opv.at(i).key) for i in range(opv.size())), 40)
for i in range(opv.size()):
opt = opv.at(i)
ps.set_index_parameters(index, opt.key)
print(opt.key.ljust(maxw), end=' ')
sys.stdout.flush()
res_i = eval_setting(index, xq, gt, args.k, args.inter, args.min_test_duration)
res.search_results[opt.key] = res_i
else:
print(header)
for param in parametersets:
print("%-40s " % param, end=' ')
sys.stdout.flush()
ps.set_index_parameters(index, param)
res_i = eval_setting(index, xq, gt, args.k, args.inter, args.min_test_duration)
res.search_results[param] = res_i
######################################################
# Driver function
######################################################
def main():
parser = argparse.ArgumentParser()
def aa(*args, **kwargs):
group.add_argument(*args, **kwargs)
group = parser.add_argument_group('general options')
aa('--nthreads', default=-1, type=int,
help='nb of threads to use at train and add time')
aa('--json', default=False, action="store_true",
help="output stats in JSON format at the end")
aa('--todo', default=["check_files"],
choices=["train", "add", "search", "check_files"],
nargs="+", help='what to do (check_files means decide depending on which index files exist)')
group = parser.add_argument_group('dataset options')
aa('--db', default='deep1M', help='dataset')
aa('--compute_gt', default=False, action='store_true',
help='compute and store the groundtruth')
aa('--force_IP', default=False, action="store_true",
help='force IP search instead of L2')
aa('--accept_short_gt', default=False, action='store_true',
help='work around a problem with Deep1B GT')
group = parser.add_argument_group('index construction')
aa('--indexkey', default='HNSW32', help='index_factory type')
aa('--trained_indexfile', default='',
help='file to read or write a trained index from')
aa('--maxtrain', default=256 * 256, type=int,
help='maximum number of training points (0 to set automatically)')
aa('--indexfile', default='', help='file to read or write index from')
aa('--split', default=[1, 0], type=int, nargs=2, help="database split")
aa('--add_bs', default=-1, type=int,
help='add elements index by batches of this size')
group = parser.add_argument_group('IVF options')
aa('--by_residual', default=-1, type=int,
help="set if index should use residuals (default=unchanged)")
aa('--no_precomputed_tables', action='store_true', default=False,
help='disable precomputed tables (uses less memory)')
aa('--get_centroids_from', default='',
help='get the centroids from this index (to speed up training)')
aa('--clustering_niter', default=-1, type=int,
help='number of clustering iterations (-1 = leave default)')
aa('--train_on_gpu', default=False, action='store_true',
help='do training on GPU')
group = parser.add_argument_group('index-specific options')
aa('--M0', default=-1, type=int, help='size of base level for HNSW')
aa('--RQ_train_default', default=False, action="store_true",
help='disable progressive dim training for RQ')
aa('--RQ_beam_size', default=-1, type=int,
help='set beam size at add time')
aa('--LSQ_encode_ils_iters', default=-1, type=int,
help='ILS iterations for LSQ')
aa('--RQ_use_beam_LUT', default=-1, type=int,
help='use beam LUT at add time')
group = parser.add_argument_group('searching')
aa('--k', default=100, type=int, help='nb of nearest neighbors')
aa('--inter', default=False, action='store_true',
help='use intersection measure instead of 1-recall as metric')
aa('--searchthreads', default=-1, type=int,
help='nb of threads to use at search time')
aa('--searchparams', nargs='+', default=['autotune'],
help="search parameters to use (can be autotune or a list of params)")
aa('--n_autotune', default=500, type=int,
help="max nb of autotune experiments")
aa('--autotune_max', default=[], nargs='*',
help='set max value for autotune variables format "var:val" (exclusive)')
aa('--autotune_range', default=[], nargs='*',
help='set complete autotune range, format "var:val1,val2,..."')
aa('--min_test_duration', default=3.0, type=float,
help='run test at least for so long to avoid jitter')
aa('--indexes_to_merge', default=[], nargs="*",
help="load these indexes to search and merge them before searching")
args = parser.parse_args()
if args.todo == ["check_files"]:
if os.path.exists(args.indexfile):
args.todo = ["search"]
elif os.path.exists(args.trained_indexfile):
args.todo = ["add", "search"]
else:
args.todo = ["train", "add", "search"]
print("setting todo to", args.todo)
print("args:", args)
os.system('echo -n "nb processors "; '
'cat /proc/cpuinfo | grep ^processor | wc -l; '
'cat /proc/cpuinfo | grep ^"model name" | tail -1')
# object to collect results
res = argparse.Namespace()
res.args = args.__dict__
res.cpu_model = [
l for l in open("/proc/cpuinfo", "r")
if "model name" in l][0]
print("Load dataset")
ds = datasets.load_dataset(
dataset=args.db, compute_gt=args.compute_gt)
if args.force_IP:
ds.metric = "IP"
print(ds)
if args.nthreads != -1:
print("Set nb of threads to", args.nthreads)
faiss.omp_set_num_threads(args.nthreads)
else:
print("nb threads: ", faiss.omp_get_max_threads())
index = None
if "train" in args.todo:
print("================== Training index")
index = run_train(args, ds, res)
if args.trained_indexfile:
print("storing trained index", args.trained_indexfile)
faiss.write_index(index, args.trained_indexfile)
if "add" in args.todo:
if not index:
assert args.trained_indexfile
print("reading trained index", args.trained_indexfile)
index = faiss.read_index(args.trained_indexfile)
print("================== Adding vectors to index")
run_add(args, ds, index, res)
if args.indexfile:
print("storing", args.indexfile)
faiss.write_index(index, args.indexfile)
if "search" in args.todo:
if not index:
if args.indexfile:
print("reading index", args.indexfile)
index = faiss.read_index(args.indexfile)
elif args.indexes_to_merge:
print(f"Merging {len(args.indexes_to_merge)} indexes")
sz = 0
for fname in args.indexes_to_merge:
print(f" reading {fname} (current size {sz})")
index_i = faiss.read_index(fname)
if index is None:
index = index_i
else:
index.merge_from(index_i, index.ntotal)
sz = index.ntotal
else:
assert False, "provide --indexfile"
print("================== Searching")
run_search(args, ds, index, res)
if args.json:
print("JSON results:", json.dumps(res.__dict__))
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,116 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import os
import numpy as np
import faiss
import argparse
import datasets
from datasets import sanitize
######################################################
# Command-line parsing
######################################################
parser = argparse.ArgumentParser()
def aa(*args, **kwargs):
group.add_argument(*args, **kwargs)
group = parser.add_argument_group('dataset options')
aa('--db', default='deep1M', help='dataset')
aa('--nt', default=65536, type=int)
aa('--nb', default=100000, type=int)
aa('--nt_sample', default=0, type=int)
group = parser.add_argument_group('kmeans options')
aa('--k', default=256, type=int)
aa('--seed', default=12345, type=int)
aa('--pcadim', default=-1, type=int, help='PCA to this dimension')
aa('--niter', default=25, type=int)
aa('--eval_freq', default=100, type=int)
args = parser.parse_args()
print("args:", args)
os.system('echo -n "nb processors "; '
'cat /proc/cpuinfo | grep ^processor | wc -l; '
'cat /proc/cpuinfo | grep ^"model name" | tail -1')
ngpu = faiss.get_num_gpus()
print("nb GPUs:", ngpu)
######################################################
# Load dataset
######################################################
xt, xb, xq, gt = datasets.load_data(dataset=args.db)
if args.nt_sample == 0:
xt_pca = xt[args.nt:args.nt + 10000]
xt = xt[:args.nt]
else:
xt_pca = xt[args.nt_sample:args.nt_sample + 10000]
rs = np.random.RandomState(args.seed)
idx = rs.choice(args.nt_sample, size=args.nt, replace=False)
xt = xt[idx]
xb = xb[:args.nb]
d = xb.shape[1]
if args.pcadim != -1:
print("training PCA: %d -> %d" % (d, args.pcadim))
pca = faiss.PCAMatrix(d, args.pcadim)
pca.train(sanitize(xt_pca))
xt = pca.apply_py(sanitize(xt))
xb = pca.apply_py(sanitize(xb))
d = xb.shape[1]
######################################################
# Run clustering
######################################################
index = faiss.IndexFlatL2(d)
if ngpu > 0:
print("moving index to GPU")
index = faiss.index_cpu_to_all_gpus(index)
clustering = faiss.Clustering(d, args.k)
clustering.verbose = True
clustering.seed = args.seed
clustering.max_points_per_centroid = 10**6
clustering.min_points_per_centroid = 1
centroids = None
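# Run k-means in chunks of eval_freq iterations: each chunk warm-starts from
# the centroids of the previous one (copied back into the Clustering object),
# and after each chunk the current centroids are used to quantize xb and
# report the squared quantization error.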
for iter0 in range(0, args.niter, args.eval_freq):
iter1 = min(args.niter, iter0 + args.eval_freq)
clustering.niter = iter1 - iter0
if iter0 > 0:
faiss.copy_array_to_vector(centroids.ravel(), clustering.centroids)
clustering.train(sanitize(xt), index)
index.reset()
centroids = faiss.vector_to_array(clustering.centroids).reshape(args.k, d)
index.add(centroids)
_, I = index.search(sanitize(xb), 1)
error = ((xb - centroids[I.ravel()]) ** 2).sum()
print("iter1=%d quantization error on test: %.4f" % (iter1, error))

View File

@@ -0,0 +1,307 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import time
import sys
import os
import argparse
import numpy as np
def eval_recalls(name, I, gt, times):
k = I.shape[1]
s = "%-40s recall" % name
nq = len(gt)
for rank in 1, 10, 100, 1000:
if rank > k:
break
recall = (I[:, :rank] == gt[:, :1]).sum() / nq
s += "@%d: %.4f " % (rank, recall)
s += "time: %.4f s (± %.4f)" % (np.mean(times), np.std(times))
print(s)
def eval_inters(name, I, gt, times):
k = I.shape[1]
s = "%-40s inter" % name
nq = len(gt)
for rank in 1, 10, 100, 1000:
if rank > k:
break
ninter = 0
for i in range(nq):
ninter += np.intersect1d(I[i, :rank], gt[i, :rank]).size
inter = ninter / (nq * rank)
s += "@%d: %.4f " % (rank, inter)
s += "time: %.4f s (± %.4f)" % (np.mean(times), np.std(times))
print(s)
def main():
parser = argparse.ArgumentParser()
def aa(*args, **kwargs):
group.add_argument(*args, **kwargs)
group = parser.add_argument_group('dataset options')
aa('--db', default='deep1M', help='dataset')
aa('--measure', default="1-recall",
help="perf measure to use: 1-recall or inter")
aa('--download', default=False, action="store_true")
aa('--lib', default='faiss', help='library to use (faiss or scann)')
aa('--thenscann', default=False, action="store_true")
aa('--base_dir', default='/checkpoint/matthijs/faiss_improvements/cmp_ivf_scan_2')
group = parser.add_argument_group('searching')
aa('--k', default=10, type=int, help='nb of nearest neighbors')
aa('--pre_reorder_k', default="0,10,100,1000", help='values for reorder_k')
aa('--nprobe', default="1,2,5,10,20,50,100,200", help='values for nprobe')
aa('--nrun', default=5, type=int, help='nb of runs to perform')
args = parser.parse_args()
print("args:", args)
pre_reorder_k_tab = [int(x) for x in args.pre_reorder_k.split(',')]
nprobe_tab = [int(x) for x in args.nprobe.split(',')]
os.system('echo -n "nb processors "; '
'cat /proc/cpuinfo | grep ^processor | wc -l; '
'cat /proc/cpuinfo | grep ^"model name" | tail -1')
cache_dir = args.base_dir + "/" + args.db + "/"
k = args.k
nrun = args.nrun
if not os.path.exists(cache_dir + "xb.npy"):
# prepare cache
from datasets import load_dataset
ds = load_dataset(args.db, download=args.download)
print(ds)
# store for SCANN
os.system(f"rm -rf {cache_dir}; mkdir -p {cache_dir}")
tosave = dict(
xb = ds.get_database(),
xq = ds.get_queries(),
gt = ds.get_groundtruth()
)
for name, v in tosave.items():
fname = cache_dir + "/" + name + ".npy"
print("save", fname)
np.save(fname, v)
open(cache_dir + "metric", "w").write(ds.metric)
dataset = {}
for kn in "xb xq gt".split():
fname = cache_dir + "/" + kn + ".npy"
print("load", fname)
dataset[kn] = np.load(fname)
xb = dataset["xb"]
xq = dataset["xq"]
gt = dataset["gt"]
distance_measure = open(cache_dir + "metric").read()
if args.lib == "faiss":
import faiss
name1_to_metric = {
"IP": faiss.METRIC_INNER_PRODUCT,
"L2": faiss.METRIC_L2
}
index_fname = cache_dir + "index.faiss"
if not os.path.exists(index_fname):
index = faiss_make_index(
xb, name1_to_metric[distance_measure], index_fname)
else:
index = faiss.read_index(index_fname)
faiss_eval_search(
index, xq, xb, nprobe_tab, pre_reorder_k_tab, k, gt,
nrun, args.measure
)
if args.lib == "scann":
from scann.scann_ops.py import scann_ops_pybind
name1_to_name2 = {
"IP": "dot_product",
"L2": "squared_l2"
}
scann_dir = cache_dir + "/scann1.1.1_serialized"
if os.path.exists(scann_dir + "/scann_config.pb"):
searcher = scann_ops_pybind.load_searcher(scann_dir)
else:
searcher = scann_make_index(xb, name1_to_name2[distance_measure], scann_dir, 0)
scann_dir = cache_dir + "/scann1.1.1_serialized_reorder"
if os.path.exists(scann_dir + "/scann_config.pb"):
searcher_reo = scann_ops_pybind.load_searcher(scann_dir)
else:
searcher_reo = scann_make_index(xb, name1_to_name2[distance_measure], scann_dir, 100)
scann_eval_search(
searcher, searcher_reo,
xq, xb, nprobe_tab, pre_reorder_k_tab, k, gt,
nrun, args.measure
)
if args.lib != "scann" and args.thenscann:
# just append --lib scann, that will override the previous cmdline
# options
cmdline = " ".join(sys.argv) + " --lib scann"
cmdline = (
". ~/anaconda3/etc/profile.d/conda.sh ; " +
"conda activate scann_1.1.1; "
"python -u " + cmdline)
print("running", cmdline)
os.system(cmdline)
###############################################################
# SCANN
###############################################################
def scann_make_index(xb, distance_measure, scann_dir, reorder_k):
import scann
print("build index")
if distance_measure == "dot_product":
thr = 0.2
else:
thr = 0
k = 10
sb = scann.scann_ops_pybind.builder(xb, k, distance_measure)
sb = sb.tree(num_leaves=2000, num_leaves_to_search=100, training_sample_size=250000)
sb = sb.score_ah(2, anisotropic_quantization_threshold=thr)
if reorder_k > 0:
sb = sb.reorder(reorder_k)
searcher = sb.build()
print("done")
print("write index to", scann_dir)
os.system(f"rm -rf {scann_dir}; mkdir -p {scann_dir}")
# os.mkdir(scann_dir)
searcher.serialize(scann_dir)
return searcher
def scann_eval_search(
searcher, searcher_reo,
xq, xb, nprobe_tab, pre_reorder_k_tab, k, gt,
nrun, measure):
# warmup
for _run in range(5):
searcher.search_batched(xq)
for nprobe in nprobe_tab:
for pre_reorder_k in pre_reorder_k_tab:
times = []
for _run in range(nrun):
if pre_reorder_k == 0:
t0 = time.time()
I, D = searcher.search_batched(
xq, leaves_to_search=nprobe, final_num_neighbors=k
)
t1 = time.time()
else:
t0 = time.time()
I, D = searcher_reo.search_batched(
xq, leaves_to_search=nprobe, final_num_neighbors=k,
pre_reorder_num_neighbors=pre_reorder_k
)
t1 = time.time()
times.append(t1 - t0)
header = "SCANN nprobe=%4d reo=%4d" % (nprobe, pre_reorder_k)
if measure == "1-recall":
eval_recalls(header, I, gt, times)
else:
eval_inters(header, I, gt, times)
###############################################################
# Faiss
###############################################################
def faiss_make_index(xb, metric_type, fname):
import faiss
d = xb.shape[1]
M = d // 2
index = faiss.index_factory(d, f"IVF2000,PQ{M}x4fs", metric_type)
# if not by_residual:
# print("setting no residual")
# index.by_residual = False
print("train")
index.train(xb[:250000])
print("add")
index.add(xb)
print("write index", fname)
faiss.write_index(index, fname)
return index
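# Note: "IVF2000,PQ{M}x4fs" with M = d // 2 builds an IVF index with 2000
# inverted lists and a 4-bit "fast scan" PQ using M sub-quantizers of 2
# dimensions each, which roughly mirrors the SCANN setup below
# (2000 leaves, score_ah with 2 dims per block).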
def faiss_eval_search(
index, xq, xb, nprobe_tab, pre_reorder_k_tab,
k, gt, nrun, measure
):
import faiss
print("use precomputed table=", index.use_precomputed_table,
"by residual=", index.by_residual)
print("adding a refine index")
index_refine = faiss.IndexRefineFlat(index, faiss.swig_ptr(xb))
print("set single thread")
faiss.omp_set_num_threads(1)
print("warmup")
for _run in range(5):
index.search(xq, k)
print("run timing")
for nprobe in nprobe_tab:
for pre_reorder_k in pre_reorder_k_tab:
index.nprobe = nprobe
times = []
for _run in range(nrun):
if pre_reorder_k == 0:
t0 = time.time()
D, I = index.search(xq, k)
t1 = time.time()
else:
index_refine.k_factor = pre_reorder_k / k
t0 = time.time()
D, I = index_refine.search(xq, k)
t1 = time.time()
times.append(t1 - t0)
header = "Faiss nprobe=%4d reo=%4d" % (nprobe, pre_reorder_k)
if measure == "1-recall":
eval_recalls(header, I, gt, times)
else:
eval_inters(header, I, gt, times)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,136 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
"""
Common functions to load datasets and compute their ground-truth
"""
import time
import numpy as np
import faiss
from faiss.contrib import datasets as faiss_datasets
print("path:", faiss_datasets.__file__)
faiss_datasets.dataset_basedir = '/checkpoint/matthijs/simsearch/'
def sanitize(x):
return np.ascontiguousarray(x, dtype='float32')
#################################################################
# Dataset
#################################################################
class DatasetCentroids(faiss_datasets.Dataset):
def __init__(self, ds, indexfile):
self.d = ds.d
self.metric = ds.metric
self.nq = ds.nq
self.xq = ds.get_queries()
# get the xb set
src_index = faiss.read_index(indexfile)
src_quant = faiss.downcast_index(src_index.quantizer)
centroids = faiss.vector_to_array(src_quant.xb)
self.xb = centroids.reshape(-1, self.d)
self.nb = self.nt = len(self.xb)
def get_queries(self):
return self.xq
def get_database(self):
return self.xb
def get_train(self, maxtrain=None):
return self.xb
def get_groundtruth(self, k=100):
return faiss.knn(
self.xq, self.xb, k,
faiss.METRIC_L2 if self.metric == 'L2' else faiss.METRIC_INNER_PRODUCT
)[1]
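# Note: DatasetCentroids is what the "deep_centroids_<ncent>" datasets below
# resolve to: the database (and training set) is the set of coarse-quantizer
# centroids reconstructed from a pre-trained IVF index, so that indexes over
# the centroids themselves can be benchmarked (see the coarse-quantizer
# experiments in run_on_cluster_generic.bash).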
def load_dataset(dataset='deep1M', compute_gt=False, download=False):
print("load data", dataset)
if dataset == 'sift1M':
return faiss_datasets.DatasetSIFT1M()
elif dataset.startswith('bigann'):
dbsize = 1000 if dataset == "bigann1B" else int(dataset[6:-1])
return faiss_datasets.DatasetBigANN(nb_M=dbsize)
elif dataset.startswith("deep_centroids_"):
ncent = int(dataset[len("deep_centroids_"):])
centdir = "/checkpoint/matthijs/bench_all_ivf/precomputed_clusters"
return DatasetCentroids(
faiss_datasets.DatasetDeep1B(nb=1000000),
f"{centdir}/clustering.dbdeep1M.IVF{ncent}.faissindex"
)
elif dataset.startswith("deep"):
szsuf = dataset[4:]
if szsuf[-1] == 'M':
dbsize = 10 ** 6 * int(szsuf[:-1])
elif szsuf == '1B':
dbsize = 10 ** 9
elif szsuf[-1] == 'k':
dbsize = 1000 * int(szsuf[:-1])
else:
assert False, "did not recognize suffix " + szsuf
return faiss_datasets.DatasetDeep1B(nb=dbsize)
elif dataset == "music-100":
return faiss_datasets.DatasetMusic100()
elif dataset == "glove":
return faiss_datasets.DatasetGlove(download=download)
else:
assert False
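# Recognized dataset names include: sift1M, bigann<N>M / bigann1B,
# deep<N>k / deep<N>M / deep1B, deep_centroids_<ncent>, music-100 and glove.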
#################################################################
# Evaluation
#################################################################
def evaluate_DI(D, I, gt):
nq = gt.shape[0]
k = I.shape[1]
rank = 1
while rank <= k:
recall = (I[:, :rank] == gt[:, :1]).sum() / float(nq)
print("R@%d: %.4f" % (rank, recall), end=' ')
rank *= 10
def evaluate(xq, gt, index, k=100, endl=True):
t0 = time.time()
D, I = index.search(xq, k)
t1 = time.time()
nq = xq.shape[0]
print("\t %8.4f ms per query, " % (
(t1 - t0) * 1000.0 / nq), end=' ')
rank = 1
while rank <= k:
recall = (I[:, :rank] == gt[:, :1]).sum() / float(nq)
print("R@%d: %.4f" % (rank, recall), end=' ')
rank *= 10
if endl:
print()
return D, I

View File

@@ -0,0 +1,27 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import logging
# https://stackoverflow.com/questions/7016056/python-logging-not-outputting-anything
logging.basicConfig()
logger = logging.getLogger('faiss.contrib.exhaustive_search')
logger.setLevel(logging.INFO)
from faiss.contrib import datasets
from faiss.contrib.exhaustive_search import knn_ground_truth
from faiss.contrib import vecs_io
ds = datasets.DatasetDeep1B(nb=int(1e9))
print("computing GT matches for", ds)
D, I = knn_ground_truth(
ds.get_queries(),
ds.database_iterator(bs=65536),
k=100
)
vecs_io.ivecs_write("/tmp/tt.ivecs", I)

View File

@@ -0,0 +1,502 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import os
import numpy as np
from collections import defaultdict
from matplotlib import pyplot
import re
from argparse import Namespace
from faiss.contrib.factory_tools import get_code_size as unitsize
def dbsize_from_name(dbname):
sufs = {
'1B': 10**9,
'100M': 10**8,
'10M': 10**7,
'1M': 10**6,
}
for s in sufs:
if dbname.endswith(s):
return sufs[s]
else:
assert False
def keep_latest_stdout(fnames):
fnames = [fname for fname in fnames if fname.endswith('.stdout')]
fnames.sort()
n = len(fnames)
fnames2 = []
for i, fname in enumerate(fnames):
if i + 1 < n and fnames[i + 1][:-8] == fname[:-8]:
continue
fnames2.append(fname)
return fnames2
def parse_result_file(fname):
# print fname
st = 0
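# st acts as a small parser state: 0 = scanning header / statistics lines,
# 1 = skip the line that follows the result header, 2 = parse result rows
# (one parameter setting per line: key followed by the measured values).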
res = []
keys = []
stats = {}
stats['run_version'] = fname[-8]
indexkey = None
for l in open(fname):
if l.startswith("srun:"):
# looks like a crash...
if indexkey is None:
raise RuntimeError("instant crash")
break
elif st == 0:
if l.startswith("dataset in dimension"):
fi = l.split()
stats["d"] = int(fi[3][:-1])
stats["nq"] = int(fi[9])
stats["nb"] = int(fi[11])
stats["nt"] = int(fi[13])
if l.startswith('index size on disk:'):
stats['index_size'] = int(l.split()[-1])
if l.startswith('current RSS:'):
stats['RSS'] = int(l.split()[-1])
if l.startswith('precomputed tables size:'):
stats['tables_size'] = int(l.split()[-1])
if l.startswith('Setting nb of threads to'):
stats['n_threads'] = int(l.split()[-1])
if l.startswith(' add in'):
stats['add_time'] = float(l.split()[-2])
if l.startswith("vector code_size"):
stats['code_size'] = float(l.split()[-1])
if l.startswith('args:'):
args = eval(l[l.find(' '):])
indexkey = args.indexkey
elif "time(ms/q)" in l:
# result header
if 'R@1 R@10 R@100' in l:
stats["measure"] = "recall"
stats["ranks"] = [1, 10, 100]
elif 'I@1 I@10 I@100' in l:
stats["measure"] = "inter"
stats["ranks"] = [1, 10, 100]
elif 'inter@' in l:
stats["measure"] = "inter"
fi = l.split()
if fi[1] == "inter@":
rank = int(fi[2])
else:
rank = int(fi[1][len("inter@"):])
stats["ranks"] = [rank]
else:
assert False
st = 1
elif 'index size on disk:' in l:
stats["index_size"] = int(l.split()[-1])
elif st == 1:
st = 2
elif st == 2:
fi = l.split()
if l[0] == " ":
# means there are 0 parameters
fi = [""] + fi
keys.append(fi[0])
res.append([float(x) for x in fi[1:]])
return indexkey, np.array(res), keys, stats
# the directory used in run_on_cluster.bash
basedir = "/checkpoint/matthijs/bench_all_ivf/"
logdir = basedir + 'logs/'
def collect_results_for(db='deep1M', prefix="autotune."):
# run parsing
allres = {}
allstats = {}
missing = []
fnames = keep_latest_stdout(os.listdir(logdir))
# print fnames
# filenames are in the form <key>.x.stdout
# where x is a version number (from a to z)
# keep only latest version of each name
for fname in fnames:
if not (
'db' + db in fname and
fname.startswith(prefix) and
fname.endswith('.stdout')
):
continue
print("parse", fname, end=" ", flush=True)
try:
indexkey, res, _, stats = parse_result_file(logdir + fname)
except RuntimeError as e:
print("FAIL %s" % e)
res = np.zeros((2, 0))
except Exception as e:
print("PARSE ERROR " + e)
res = np.zeros((2, 0))
else:
print(len(res), "results")
if res.size == 0:
missing.append(fname)
else:
if indexkey in allres:
if allstats[indexkey]['run_version'] > stats['run_version']:
# don't use this run
continue
allres[indexkey] = res
allstats[indexkey] = stats
return allres, allstats
def extract_pareto_optimal(allres, keys, recall_idx=0, times_idx=3):
bigtab = []
for i, k in enumerate(keys):
v = allres[k]
perf = v[:, recall_idx]
times = v[:, times_idx]
bigtab.append(
np.vstack((
np.ones(times.size) * i,
perf, times
))
)
if bigtab == []:
return [], np.zeros((3, 0))
bigtab = np.hstack(bigtab)
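# Pareto-frontier extraction: sort all (method, perf, time) points by the perf
# measure, then sweep from the most accurate point downwards while keeping the
# running minimum of search time; a point is optimal iff its time equals that
# running minimum, i.e. no point with at least its accuracy is faster.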
# sort by perf
perm = np.argsort(bigtab[1, :])
bigtab_sorted = bigtab[:, perm]
best_times = np.minimum.accumulate(bigtab_sorted[2, ::-1])[::-1]
selection, = np.where(bigtab_sorted[2, :] == best_times)
selected_keys = [
keys[i] for i in
np.unique(bigtab_sorted[0, selection].astype(int))
]
ops = bigtab_sorted[:, selection]
return selected_keys, ops
def plot_subset(
allres, allstats, selected_methods, recall_idx, times_idx=3,
report=["overhead", "build time"]):
# important methods
for k in selected_methods:
v = allres[k]
stats = allstats[k]
d = stats["d"]
dbsize = stats["nb"]
if "index_size" in stats and "tables_size" in stats:
tot_size = stats['index_size'] + stats['tables_size']
else:
tot_size = -1
id_size = 8 # 64 bit
addt = ''
if 'add_time' in stats:
add_time = stats['add_time']
if add_time > 7200:
add_min = add_time / 60
addt = ', %dh%02d' % (add_min / 60, add_min % 60)
else:
add_sec = int(add_time)
addt = ', %dm%02d' % (add_sec / 60, add_sec % 60)
code_size = unitsize(d, k)
label = k
if "code_size" in report:
label += " %d bytes" % code_size
tight_size = (code_size + id_size) * dbsize
if tot_size < 0 or "overhead" not in report:
pass # don't know what the index size is
elif tot_size > 10 * tight_size:
label += " overhead x%.1f" % (tot_size / tight_size)
else:
label += " overhead+%.1f%%" % (
tot_size / tight_size * 100 - 100)
if "build time" in report:
label += " " + addt
linestyle = (':' if 'Refine' in k or 'RFlat' in k else
'-.' if 'SQ' in k else
'-' if '4fs' in k else
'-')
print(k, linestyle)
pyplot.semilogy(v[:, recall_idx], 1000 / v[:, times_idx], label=label,
linestyle=linestyle,
marker='o' if '4fs' in k else '+')
recall_rank = stats["ranks"][recall_idx]
if stats["measure"] == "recall":
pyplot.xlabel('1-recall at %d' % recall_rank)
elif stats["measure"] == "inter":
pyplot.xlabel('inter @ %d' % recall_rank)
else:
assert False
pyplot.ylabel('QPS (%d threads)' % stats["n_threads"])
def plot_tradeoffs(db, allres, allstats, code_size, recall_rank):
stat0 = next(iter(allstats.values()))
d = stat0["d"]
n_threads = stat0["n_threads"]
recall_idx = stat0["ranks"].index(recall_rank)
# times come after the perf measure
times_idx = len(stat0["ranks"])
if type(code_size) == int:
if code_size == 0:
code_size = [0, 1e50]
code_size_name = "any code size"
else:
code_size_name = "code_size=%d" % code_size
code_size = [code_size, code_size]
elif type(code_size) == tuple:
code_size_name = "code_size in [%d, %d]" % code_size
else:
assert False
names_maxperf = []
for k in sorted(allres):
v = allres[k]
if v.ndim != 2: continue
us = unitsize(d, k)
if not code_size[0] <= us <= code_size[1]: continue
names_maxperf.append((v[-1, recall_idx], k))
# sort from lowest to highest topline accuracy
names_maxperf.sort()
names = [name for mp, name in names_maxperf]
selected_methods, optimal_points = \
extract_pareto_optimal(allres, names, recall_idx, times_idx)
not_selected = list(set(names) - set(selected_methods))
print("methods without an optimal OP: ", not_selected)
pyplot.title('database ' + db + ' ' + code_size_name)
# grayed out lines
for k in not_selected:
v = allres[k]
if v.ndim != 2: continue
us = unitsize(d, k)
if not code_size[0] <= us <= code_size[1]: continue
linestyle = (':' if 'PQ' in k else
'-.' if 'SQ4' in k else
'--' if 'SQ8' in k else '-')
pyplot.semilogy(v[:, recall_idx], 1000 / v[:, times_idx], label=None,
linestyle=linestyle,
marker='o' if 'HNSW' in k else '+',
color='#cccccc', linewidth=0.2)
plot_subset(allres, allstats, selected_methods, recall_idx, times_idx)
if len(not_selected) == 0:
om = ''
else:
om = '\nomitted:'
nc = len(om)
for m in not_selected:
if nc > 80:
om += '\n'
nc = 0
om += ' ' + m
nc += len(m) + 1
# pyplot.semilogy(optimal_points[1, :], optimal_points[2, :], marker="s")
# print(optimal_points[0, :])
pyplot.xlabel('1-recall at %d %s' % (recall_rank, om) )
pyplot.ylabel('QPS (%d threads)' % n_threads)
pyplot.legend()
pyplot.grid()
return selected_methods, not_selected
if __name__ == "__main__xx":
# tests on centroids indexing (v1)
for k in 1, 32, 128:
pyplot.gcf().set_size_inches(15, 10)
i = 1
for ncent in 65536, 262144, 1048576, 4194304:
db = f'deep_centroids_{ncent}.k{k}.'
allres, allstats = collect_results_for(
db=db, prefix="cent_index.")
pyplot.subplot(2, 2, i)
plot_subset(
allres, allstats, list(allres.keys()),
recall_idx=0,
times_idx=1,
report=["code_size"]
)
i += 1
pyplot.title(f"{ncent} centroids")
pyplot.legend()
pyplot.xlim([0.95, 1])
pyplot.grid()
pyplot.savefig('figs/deep1B_centroids_k%d.png' % k)
if __name__ == "__main__xx":
# centroids plot per k
pyplot.gcf().set_size_inches(15, 10)
i=1
for ncent in 65536, 262144, 1048576, 4194304:
xyd = defaultdict(list)
for k in 1, 4, 8, 16, 32, 64, 128, 256:
db = f'deep_centroids_{ncent}.k{k}.'
allres, allstats = collect_results_for(db=db, prefix="cent_index.")
for indexkey, res in allres.items():
idx, = np.where(res[:, 0] >= 0.99)
if idx.size > 0:
xyd[indexkey].append((k, 1000 / res[idx[0], 1]))
pyplot.subplot(2, 2, i)
i += 1
for indexkey, xy in xyd.items():
xy = np.array(xy)
pyplot.loglog(xy[:, 0], xy[:, 1], 'o-', label=indexkey)
pyplot.title(f"{ncent} centroids")
pyplot.xlabel("k")
xt = 2**np.arange(9)
pyplot.xticks(xt, ["%d" % x for x in xt])
pyplot.ylabel("QPS (32 threads)")
pyplot.legend()
pyplot.grid()
pyplot.savefig('../plots/deep1B_centroids_min99.png')
if __name__ == "__main__xx":
# main indexing plots
i = 0
for db in 'bigann10M', 'deep10M', 'bigann100M', 'deep100M', 'deep1B', 'bigann1B':
allres, allstats = collect_results_for(
db=db, prefix="autotune.")
for cs in 8, 16, 32, 64:
pyplot.figure(i)
i += 1
pyplot.gcf().set_size_inches(15, 10)
cs_range = (
(0, 8) if cs == 8 else (cs // 2 + 1, cs)
)
plot_tradeoffs(
db, allres, allstats, code_size=cs_range, recall_rank=1)
pyplot.savefig('../plots/tradeoffs_%s_cs%d_r1.png' % (
db, cs))
if __name__ == "__main__":
# 1M indexes
i = 0
for db in "glove", "music-100":
pyplot.figure(i)
pyplot.gcf().set_size_inches(15, 10)
i += 1
allres, allstats = collect_results_for(db=db, prefix="autotune.")
plot_tradeoffs(db, allres, allstats, code_size=0, recall_rank=1)
pyplot.savefig('../plots/1M_tradeoffs_' + db + ".png")
for db in "sift1M", "deep1M":
allres, allstats = collect_results_for(db=db, prefix="autotune.")
pyplot.figure(i)
pyplot.gcf().set_size_inches(15, 10)
i += 1
plot_tradeoffs(db, allres, allstats, code_size=(0, 64), recall_rank=1)
pyplot.savefig('../plots/1M_tradeoffs_' + db + "_small.png")
pyplot.figure(i)
pyplot.gcf().set_size_inches(15, 10)
i += 1
plot_tradeoffs(db, allres, allstats, code_size=(65, 10000), recall_rank=1)
pyplot.savefig('../plots/1M_tradeoffs_' + db + "_large.png")
if __name__ == "__main__xx":
db = 'sift1M'
allres, allstats = collect_results_for(db=db, prefix="autotune.")
pyplot.gcf().set_size_inches(15, 10)
keys = [
"IVF1024,PQ32x8",
"IVF1024,PQ64x4",
"IVF1024,PQ64x4fs",
"IVF1024,PQ64x4fsr",
"IVF1024,SQ4",
"IVF1024,SQ8"
]
plot_subset(allres, allstats, keys, recall_idx=0, report=["code_size"])
pyplot.legend()
pyplot.title(db)
pyplot.xlabel("1-recall@1")
pyplot.ylabel("QPS (32 threads)")
pyplot.grid()
pyplot.savefig('../plots/ivf1024_variants.png')
pyplot.figure(2)
pyplot.gcf().set_size_inches(15, 10)
keys = [
"HNSW32",
"IVF1024,PQ64x4fs",
"IVF1024,PQ64x4fsr",
"IVF1024,PQ64x4fs,RFlat",
"IVF1024,PQ64x4fs,Refine(SQfp16)",
"IVF1024,PQ64x4fs,Refine(SQ8)",
]
plot_subset(allres, allstats, keys, recall_idx=0, report=["code_size"])
pyplot.legend()
pyplot.title(db)
pyplot.xlabel("1-recall@1")
pyplot.ylabel("QPS (32 threads)")
pyplot.grid()
pyplot.savefig('../plots/ivf1024_rerank.png')

View File

@@ -0,0 +1,603 @@
set -e
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
# @nolint
# This script launches the experiments on a cluster
# It assumes two shell functions are defined:
#
# run_on_1machine: runs a command on one (full) machine on a cluster
#
# run_on_8gpu: runs a command on one machine with 8 GPUs
#
# the two functions are called as:
#
# run_on_1machine <name> <command>
#
# the stdout of the command should be stored in $logdir/<name>.stdout
function run_on ()
{
sys="$1"
shift
name="$1"
shift
script="$logdir/$name.sh"
if [ -e "$script" ]; then
echo script "$script" exists
return
fi
# srun handles special characters fine, but the shell interpreter
# does not
escaped_cmd=$( printf "%q " "$@" )
cat > $script <<EOF
#! /bin/bash
srun $escaped_cmd
EOF
echo -n "$logdir/$name.stdout "
sbatch -n1 -J "$name" \
$sys \
--comment='priority is the only one that works' \
--output="$logdir/$name.stdout" \
"$script"
}
function run_on_1machine {
run_on "--cpus-per-task=80 --gres=gpu:0 --mem=500G --time=70:00:00 --partition=priority" "$@"
}
function run_on_1machine_1h {
run_on "--cpus-per-task=80 --gres=gpu:2 --mem=100G --time=1:00:00 --partition=priority" "$@"
}
function run_on_1machine_3h {
run_on "--cpus-per-task=80 --gres=gpu:2 --mem=100G --time=3:00:00 --partition=priority" "$@"
}
function run_on_4gpu_3h {
run_on "--cpus-per-task=40 --gres=gpu:4 --mem=100G --time=3:00:00 --partition=priority" "$@"
}
function run_on_8gpu () {
run_on "--cpus-per-task=80 --gres=gpu:8 --mem=100G --time=70:00:00 --partition=priority" "$@"
}
# prepare output directories
# set to some directory where all indexes can be written.
basedir=/checkpoint/matthijs/bench_all_ivf
logdir=$basedir/logs
indexdir=$basedir/indexes
centdir=$basedir/precomputed_clusters
mkdir -p $logdir $indexdir
# adds an option to use a pretrained quantizer
function add_precomputed_quantizer () {
local db="$1"
local coarse="$2"
case $db in
bigann*) rname=bigann ;;
deep*) rname=deep ;;
sift1M) return;;
music-100) return ;;
glove) return ;;
*) echo "bad db"; exit 1;;
esac
case $coarse in
IVF65536*)
cname=clustering.db${rname}1M.IVF65536.faissindex
copt="--get_centroids_from $centdir/$cname"
;;
IVF262144*)
cname=clustering.db${rname}1M.IVF262144.faissindex
copt="--get_centroids_from $centdir/$cname"
;;
IVF1048576*)
cname=clustering.db${rname}1M.IVF1048576.faissindex
copt="--get_centroids_from $centdir/$cname"
;;
IVF4194304*)
cname=clustering.db${rname}1M.IVF4194304.faissindex
copt="--get_centroids_from $centdir/$cname"
;;
*)
copt="" ;;
esac
echo $copt
}
function get_db_dim () {
local db="$1"
case $db in
sift1M) dim=128;;
bigann*) dim=128;;
deep*) dim=96;;
music-100) dim=100;;
glove) dim=100;;
*) echo "bad db"; exit 1;;
esac
echo $dim
}
# replace PQHD (HD = half the dataset dimension) in the coarse quantizer key
# with the appropriate PQ size; relies on shell variables being global by default...
function replace_coarse_PQHD () {
local coarse="$1"
local dim=$2
coarseD=${coarse//PQHD/PQ$((dim/2))}
coarse16=${coarse//PQHD/PQ8}
coarse32=${coarse//PQHD/PQ16}
coarse64=${coarse//PQHD/PQ32}
coarse128=${coarse//PQHD/PQ64}
coarse256=${coarse//PQHD/PQ128}
coarse112=${coarse//PQHD/PQ56}
}
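# Example: with dim=96 (deep* datasets) and coarse="IVF65536(IVF256,PQHDx4fs,RFlat)",
# coarseD becomes IVF65536(IVF256,PQ48x4fs,RFlat) and coarse64 becomes
# IVF65536(IVF256,PQ32x4fs,RFlat), so the same coarse key can be reused at
# several target code sizes.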
if false; then
###############################################
# comparison with SCANN
for db in sift1M deep1M glove music-100
do
opt=""
if [ $db == glove ]; then
opt="--measure inter"
fi
run_on_1machine_1h cmp_with_scann.$db.c \
python -u cmp_with_scann.py --db $db \
--lib faiss $opt --thenscann
done
############################### Preliminary SIFT1M experiment
for db in sift1M ; do
for coarse in IVF1024
do
indexkeys="
HNSW32
$coarse,SQfp16
$coarse,SQ4
$coarse,SQ8
$coarse,PQ32x8
$coarse,PQ64x4
$coarse,PQ64x4fs
$coarse,PQ64x4fs,RFlat
$coarse,PQ64x4fs,Refine(SQfp16)
$coarse,PQ64x4fs,Refine(SQ8)
OPQ64,$coarse,PQ64x4fs
OPQ64,$coarse,PQ64x4fs,RFlat
"
indexkeys="
$coarse,PQ64x4fsr
$coarse,PQ64x4fsr,RFlat
"
# OPQ actually degrades the results on SIFT1M, so let's ignore
for indexkey in $indexkeys
do
# escape nasty characters
key="autotune.db$db.${indexkey//,/_}"
key="${key//(/_}"
key="${key//)/_}"
run_on_1machine_1h $key.a \
python -u bench_all_ivf.py \
--db $db \
--indexkey "$indexkey" \
--maxtrain 0 \
--indexfile $indexdir/$key.faissindex \
--searchthreads 32
done
done
done
############################### 1M experiments
fi
# for db in sift1M deep1M music-100 glove; do
for db in glove music-100; do
dim=$( get_db_dim $db )
for coarse in IVF1024 IVF4096_HNSW32
do
replace_coarse_PQHD "$coarse" $dim
indexkeys="
$coarseD,PQ$((dim/2))x4fs
$coarseD,PQ$((dim/2))x4fsr
OPQ8_64,$coarse64,PQ8
PCAR16,$coarse16,SQ4
OPQ16_64,$coarse64,PQ16x4fs
OPQ16_64,$coarse64,PQ16x4fsr
OPQ16_64,$coarse64,PQ16
PCAR16,$coarse16,SQ8
PCAR32,$coarse32,SQ4
OPQ32_64,$coarse64,PQ32x4fs
OPQ32_64,$coarse64,PQ32x4fsr
OPQ32_128,$coarse128,PQ32
PCAR32,$coarse32,SQ8
PCAR64,$coarse64,SQ4
PCAR16,$coarse16,SQfp16
OPQ64_128,$coarse128,PQ64x4fs
OPQ64_128,$coarse128,PQ64x4fsr
OPQ64_128,$coarse128,PQ64
PCAR64,$coarse64,SQ8
PCAR32,$coarse32,SQfp16
PCAR128,$coarse128,SQ4
OPQ128_256,$coarse256,PQ128x4fs
OPQ128_256,$coarse256,PQ128x4fsr
OPQ16_64,$coarse64,PQ16x4fs,Refine(OPQ56_112,PQ56)
OPQ16_64,$coarse64,PQ16x4fs,Refine(PCAR72,SQ6)
OPQ32_64,$coarse64,PQ16x4fs,Refine(PCAR64,SQ6)
OPQ32_64,$coarse64,PQ32x4fs,Refine(OPQ48_96,PQ48)
OPQ64_128,$coarse,PQ64x12
OPQ64_128,$coarse,PQ64x4fs,RFlat
OPQ64_128,$coarse,PQ64x4fs,Refine(SQfp16)
OPQ64_128,$coarse,PQ64x4fs,Refine(SQ8)
OPQ64_128,$coarse,PQ64x4fs,Refine(SQ6)
OPQ64_128,$coarse,PQ64x4fs,Refine(SQ4)
OPQ32_64,$coarse,PQ32x4fs,Refine(SQfp16)
OPQ32_64,$coarse,PQ32x4fs,Refine(SQ8)
OPQ32_64,$coarse,PQ32x4fs,Refine(SQ6)
OPQ32_64,$coarse,PQ32x4fs,Refine(SQ4)
"
indexkeys="
$coarseD,PQ$((dim/2))x4fs
$coarseD,PQ$((dim/2))x4fsr
$coarseD,PQ$((dim/2))x4fsr,RFlat
$coarseD,PQ$((dim/2))x4fsr,Refine(SQfp16)
$coarseD,PQ$((dim/2))x4fsr,Refine(SQ8)
$coarseD,PQ$((dim/4))x4fs
$coarseD,PQ$((dim/4))x4fsr
$coarseD,PQ$((dim/4))x4fsr,RFlat
$coarseD,PQ$((dim/4))x4fsr,Refine(SQfp16)
$coarseD,PQ$((dim/4))x4fsr,Refine(SQ8)
$coarseD,PQ$((dim/2))
$coarseD,PQ$((dim/4))
HNSW32,Flat
"
indexkeys="HNSW32,Flat"
for indexkey in $indexkeys
do
key=autotune.db$db.${indexkey//,/_}
key="${key//(/_}"
key="${key//)/_}"
run_on_1machine_3h $key.q \
python -u bench_all_ivf.py \
--db $db \
--indexkey "$indexkey" \
--maxtrain 0 \
--indexfile "$indexdir/$key.faissindex" \
$( add_precomputed_quantizer $db $coarse ) \
--searchthreads 32 \
--min_test_duration 3
done
done
done
if false; then
############################################
# precompute centroids on GPU for large vocabularies
for db in deep1M bigann1M; do
for ncent in 262144 65536 1048576 4194304; do
key=clustering.db$db.IVF$ncent
run_on_4gpu_3h $key.e \
python -u bench_all_ivf.py \
--db $db \
--indexkey IVF$ncent,SQ8 \
--maxtrain 100000000 \
--indexfile $centdir/$key.faissindex \
--searchthreads 32 \
--min_test_duration 3 \
--add_bs 1000000 \
--train_on_gpu
done
done
###############################
## coarse quantizer experiments on the centroids of deep1B
for k in 4 8 16 64 256; do
for ncent in 65536 262144 1048576 4194304; do
db=deep_centroids_$ncent
# compute square root of ncent...
for(( ls=0; ncent > (1 << (2 * ls)); ls++)); do
echo -n
done
sncent=$(( 1 << ls ))
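# e.g. ncent=65536 -> ls=8 -> sncent=256; ncent=262144 -> ls=9 -> sncent=512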
indexkeys="
IVF$((sncent/2)),PQ48x4fs,RFlat
IVF$((sncent*2)),PQ48x4fs,RFlat
HNSW32
PQ48x4fs
PQ48x4fs,RFlat
IVF$sncent,PQ48x4fs,RFlat
"
for indexkey in $indexkeys; do
key="cent_index.db$db.k$k.$indexkey"
run_on_1machine_1h "$key.b" \
python -u bench_all_ivf.py \
--db $db \
--indexkey "$indexkey" \
--maxtrain 0 \
--inter \
--searchthreads 32 \
--k $k
done
done
done
############################### 10M experiments
for db in deep10M bigann10M; do
coarses="
IVF65536(IVF256,PQHDx4fs,RFlat)
IVF16384_HNSW32
IVF65536_HNSW32
IVF262144_HNSW32
IVF262144(IVF512,PQHDx4fs,RFlat)
"
dim=$( get_db_dim $db )
for coarse in $coarses
do
replace_coarse_PQHD "$coarse" $dim
indexkeys="
$coarseD,PQ$((dim/2))x4fs
OPQ8_64,$coarse64,PQ8
PCAR16,$coarse16,SQ4
OPQ16_64,$coarse64,PQ16x4fs
OPQ16_64,$coarse64,PQ16x4fsr
OPQ16_64,$coarse64,PQ16
PCAR16,$coarse16,SQ8
PCAR32,$coarse32,SQ4
OPQ32_64,$coarse64,PQ32x4fs
OPQ32_64,$coarse64,PQ32x4fsr
OPQ32_128,$coarse128,PQ32
PCAR32,$coarse32,SQ8
PCAR64,$coarse64,SQ4
PCAR16,$coarse16,SQfp16
OPQ64_128,$coarse128,PQ64x4fs
OPQ64_128,$coarse128,PQ64x4fsr
OPQ64_128,$coarse128,PQ64
PCAR64,$coarse64,SQ8
PCAR32,$coarse32,SQfp16
PCAR128,$coarse128,SQ4
OPQ128_256,$coarse256,PQ128x4fs
OPQ128_256,$coarse256,PQ128x4fsr
OPQ56_112,$coarse112,PQ7+56
OPQ16_64,$coarse64,PQ16x4fs,Refine(OPQ56_112,PQ56)
OPQ16_64,$coarse64,PQ16x4fs,Refine(PCAR72,SQ6)
OPQ32_64,$coarse64,PQ16x4fs,Refine(PCAR64,SQ6)
OPQ32_64,$coarse64,PQ32x4fs,Refine(OPQ48_96,PQ48)
"
indexkeys="
OPQ16_64,$coarse64,PQ16x4fsr
OPQ32_64,$coarse64,PQ32x4fsr
OPQ64_128,$coarse128,PQ64x4fsr
OPQ128_256,$coarse256,PQ128x4fsr
"
for indexkey in $indexkeys
do
key=autotune.db$db.${indexkey//,/_}
key="${key//(/_}"
key="${key//)/_}"
run_on_1machine_3h $key.l \
python -u bench_all_ivf.py \
--db $db \
--indexkey "$indexkey" \
--maxtrain 0 \
--indexfile "$indexdir/$key.faissindex" \
$( add_precomputed_quantizer $db $coarse ) \
--searchthreads 32 \
--min_test_duration 3 \
--autotune_max nprobe:2000
done
done
done
############################### 100M experiments
for db in deep100M bigann100M; do
coarses="
IVF65536_HNSW32
IVF262144_HNSW32
IVF262144(IVF512,PQHDx4fs,RFlat)
IVF1048576_HNSW32
IVF1048576(IVF1024,PQHDx4fs,RFlat)
"
dim=$( get_db_dim $db )
for coarse in $coarses
do
replace_coarse_PQHD "$coarse" $dim
indexkeys="
OPQ8_64,$coarse64,PQ8
OPQ16_64,$coarse64,PQ16x4fs
PCAR32,$coarse32,SQ4
OPQ16_64,$coarse64,PQ16
OPQ32_64,$coarse64,PQ32x4fs
OPQ32_128,$coarse128,PQ32
PCAR64,$coarse64,SQ4
PCAR32,$coarse32,SQ8
OPQ64_128,$coarse128,PQ64x4fs
PCAR128,$coarse128,SQ4
OPQ64_128,$coarse128,PQ64
PCAR32,$coarse32,SQfp16
PCAR64,$coarse64,SQ8
OPQ128_256,$coarse256,PQ128x4fs
OPQ56_112,$coarse112,PQ7+56
OPQ16_64,$coarse64,PQ16x4fs,Refine(OPQ56_112,PQ56)
$coarseD,PQ$((dim/2))x4fs
"
indexkeys="
OPQ128_256,$coarse256,PQ128x4fsr
OPQ64_128,$coarse128,PQ64x4fsr
OPQ32_64,$coarse64,PQ32x4fsr
OPQ16_64,$coarse64,PQ16x4fsr
OPQ16_64,$coarse64,PQ16x4fsr,Refine(OPQ56_112,PQ56)
"
for indexkey in $indexkeys
do
key=autotune.db$db.${indexkey//,/_}
key="${key//(/_}"
key="${key//)/_}"
run_on_1machine $key.e \
python -u bench_all_ivf.py \
--db $db \
--indexkey "$indexkey" \
--maxtrain 0 \
--indexfile $indexdir/$key.faissindex \
--searchthreads 32 \
--min_test_duration 3 \
$( add_precomputed_quantizer $db $coarse ) \
--add_bs 1000000 \
--autotune_max nprobe:2000
done
done
done
#################################
# 1B-scale experiment
for db in deep1B bigann1B; do
coarses="
IVF1048576_HNSW32
IVF4194304_HNSW32
IVF4194304(IVF1024,PQHDx4fs,RFlat)
"
dim=$( get_db_dim $db )
for coarse in $coarses; do
replace_coarse_PQHD "$coarse" $dim
indexkeys="
OPQ8_64,$coarse64,PQ8
OPQ16_64,$coarse64,PQ16x4fsr
OPQ16_64,$coarse64,PQ16
OPQ32_64,$coarse64,PQ32x4fsr
OPQ32_128,$coarse128,PQ32
OPQ64_128,$coarse128,PQ64x4fsr
OPQ64_128,$coarse128,PQ64
OPQ128_256,$coarse256,PQ128x4fsr
OPQ56_112,$coarse112,PQ7+56
OPQ16_64,$coarse64,PQ16x4fs,Refine(OPQ56_112,PQ56)
$coarseD,PQ$((dim/2))x4fs
"
for indexkey in $indexkeys
do
key=autotune.db$db.${indexkey//,/_}
key="${key//(/_}"
key="${key//)/_}"
run_on_1machine $key.d \
python -u bench_all_ivf.py \
--db $db \
--indexkey "$indexkey" \
--maxtrain 0 \
--indexfile $indexdir/$key.faissindex \
--searchthreads 32 \
--min_test_duration 3 \
$( add_precomputed_quantizer $db $coarse ) \
--add_bs 1000000 \
--autotune_max nprobe:3000
done
done
done
fi