Initial commit

2025-06-30 09:05:05 +00:00
commit 46f6cc100b
1231 changed files with 278432 additions and 0 deletions
--- a/packages/leann-backend-hnsw/third_party/faiss/benchs/bench_fw/utils.py
+++ b/packages/leann-backend-hnsw/third_party/faiss/benchs/bench_fw/utils.py
@@ -0,0 +1,248 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import functools
+import logging
+from enum import Enum
+from multiprocessing.pool import ThreadPool
+from time import perf_counter
+
+import faiss  # @manual=//faiss/python:pyfaiss
+import numpy as np
+
+from faiss.contrib.evaluation import (  # @manual=//faiss/contrib:faiss_contrib
+    OperatingPoints,
+)
+
+logger = logging.getLogger(__name__)
+
+
+def timer(name, func, once=False) -> float:
+    logger.info(f"Measuring {name}")
+    t1 = perf_counter()
+    res = func()
+    t2 = perf_counter()
+    t = t2 - t1
+    repeat = 1
+    if not once and t < 1.0:
+        repeat = int(2.0 // t)
+        logger.info(
+            f"Time for {name}: {t:.3f} seconds, repeating {repeat} times"
+        )
+        t1 = perf_counter()
+        for _ in range(repeat):
+            res = func()
+        t2 = perf_counter()
+        t = (t2 - t1) / repeat
+    logger.info(f"Time for {name}: {t:.3f} seconds")
+    return res, t, repeat
+
+
+def refine_distances_knn(
+    xq: np.ndarray,
+    xb: np.ndarray,
+    I: np.ndarray,
+    metric,
+):
+    """Recompute distances between xq[i] and xb[I[i, :]]"""
+    nq, k = I.shape
+    xq = np.ascontiguousarray(xq, dtype="float32")
+    nq2, d = xq.shape
+    xb = np.ascontiguousarray(xb, dtype="float32")
+    nb, d2 = xb.shape
+    I = np.ascontiguousarray(I, dtype="int64")
+    assert nq2 == nq
+    assert d2 == d
+    D = np.empty(I.shape, dtype="float32")
+    D[:] = np.inf
+    if metric == faiss.METRIC_L2:
+        faiss.fvec_L2sqr_by_idx(
+            faiss.swig_ptr(D),
+            faiss.swig_ptr(xq),
+            faiss.swig_ptr(xb),
+            faiss.swig_ptr(I),
+            d,
+            nq,
+            k,
+        )
+    else:
+        faiss.fvec_inner_products_by_idx(
+            faiss.swig_ptr(D),
+            faiss.swig_ptr(xq),
+            faiss.swig_ptr(xb),
+            faiss.swig_ptr(I),
+            d,
+            nq,
+            k,
+        )
+    return D
+
+
+def refine_distances_range(
+    lims: np.ndarray,
+    D: np.ndarray,
+    I: np.ndarray,
+    xq: np.ndarray,
+    xb: np.ndarray,
+    metric,
+):
+    with ThreadPool(32) as pool:
+        R = pool.map(
+            lambda i: (
+                np.sum(np.square(xq[i] - xb[I[lims[i] : lims[i + 1]]]), axis=1)
+                if metric == faiss.METRIC_L2
+                else np.tensordot(
+                    xq[i], xb[I[lims[i] : lims[i + 1]]], axes=(0, 1)
+                )
+            )
+            if lims[i + 1] > lims[i]
+            else [],
+            range(len(lims) - 1),
+        )
+    return np.hstack(R)
+
+
+def distance_ratio_measure(I, R, D_GT, metric):
+    sum_of_R = np.sum(np.where(I >= 0, R, 0))
+    sum_of_D_GT = np.sum(np.where(I >= 0, D_GT, 0))
+    if metric == faiss.METRIC_INNER_PRODUCT:
+        return (sum_of_R / sum_of_D_GT).item()
+    elif metric == faiss.METRIC_L2:
+        return (sum_of_D_GT / sum_of_R).item()
+    else:
+        raise RuntimeError(f"unknown metric {metric}")
+
+
+@functools.cache
+def get_cpu_info():
+    return [l for l in open("/proc/cpuinfo", "r") if "model name" in l][0][
+        13:
+    ].strip()
+
+
+def dict_merge(target, source):
+    for k, v in source.items():
+        if isinstance(v, dict) and k in target:
+            dict_merge(target[k], v)
+        else:
+            target[k] = v
+
+
+class Cost:
+    def __init__(self, values):
+        self.values = values
+
+    def __le__(self, other):
+        return all(
+            v1 <= v2 for v1, v2 in zip(self.values, other.values, strict=True)
+        )
+
+    def __lt__(self, other):
+        return all(
+            v1 < v2 for v1, v2 in zip(self.values, other.values, strict=True)
+        )
+
+
+class ParetoMode(Enum):
+    DISABLE = 1  # no Pareto filtering
+    INDEX = 2  # index-local optima
+    GLOBAL = 3  # global optima
+
+
+class ParetoMetric(Enum):
+    TIME = 0  # time vs accuracy
+    SPACE = 1  # space vs accuracy
+    TIME_SPACE = 2  # (time, space) vs accuracy
+
+
+def range_search_recall_at_precision(experiment, precision):
+    return round(
+        max(
+            r
+            for r, p in zip(
+                experiment["range_search_pr"]["recall"],
+                experiment["range_search_pr"]["precision"],
+            )
+            if p > precision
+        ),
+        6,
+    )
+
+
+def filter_results(
+    results,
+    evaluation,
+    accuracy_metric,  # str or func
+    time_metric=None,  # func or None -> use default
+    space_metric=None,  # func or None -> use default
+    min_accuracy=0,
+    max_space=0,
+    max_time=0,
+    scaling_factor=1.0,
+    name_filter=None,  # func
+    pareto_mode=ParetoMode.DISABLE,
+    pareto_metric=ParetoMetric.TIME,
+):
+    if isinstance(accuracy_metric, str):
+        accuracy_key = accuracy_metric
+        accuracy_metric = lambda v: v[accuracy_key]
+
+    if time_metric is None:
+        time_metric = lambda v: v["time"] * scaling_factor + (
+            v["quantizer"]["time"] if "quantizer" in v else 0
+        )
+
+    if space_metric is None:
+        space_metric = lambda v: results["indices"][v["codec"]]["code_size"]
+
+    fe = []
+    ops = {}
+    if pareto_mode == ParetoMode.GLOBAL:
+        op = OperatingPoints()
+        ops["global"] = op
+    for k, v in results["experiments"].items():
+        if f".{evaluation}" in k:
+            accuracy = accuracy_metric(v)
+            if min_accuracy > 0 and accuracy < min_accuracy:
+                continue
+            space = space_metric(v)
+            if space is None:
+                space = 0
+            if max_space > 0 and space > max_space:
+                continue
+            time = time_metric(v)
+            if max_time > 0 and time > max_time:
+                continue
+            idx_name = v["index"] + (
+                "snap"
+                if "search_params" in v and v["search_params"]["snap"] == 1
+                else ""
+            )
+            if name_filter is not None and not name_filter(idx_name):
+                continue
+            experiment = (accuracy, space, time, k, v)
+            if pareto_mode == ParetoMode.DISABLE:
+                fe.append(experiment)
+                continue
+            if pareto_mode == ParetoMode.INDEX:
+                if idx_name not in ops:
+                    ops[idx_name] = OperatingPoints()
+                op = ops[idx_name]
+            if pareto_metric == ParetoMetric.TIME:
+                op.add_operating_point(experiment, accuracy, time)
+            elif pareto_metric == ParetoMetric.SPACE:
+                op.add_operating_point(experiment, accuracy, space)
+            else:
+                op.add_operating_point(
+                    experiment, accuracy, Cost([time, space])
+                )
+
+    if ops:
+        for op in ops.values():
+            for v, _, _ in op.operating_points:
+                fe.append(v)
+
+    fe.sort()
+    return fe