Initial commit

2025-06-30 09:05:05 +00:00
commit 46f6cc100b
1231 changed files with 278432 additions and 0 deletions
--- a/packages/leann-backend-diskann/third_party/DiskANN/python/tests/fixtures/__init__.py
+++ b/packages/leann-backend-diskann/third_party/DiskANN/python/tests/fixtures/__init__.py
@@ -0,0 +1,6 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT license.
+
+from .build_memory_index import build_random_vectors_and_memory_index
+from .create_test_data import random_vectors, vectors_as_temp_file, write_vectors
+from .recall import calculate_recall
--- a/packages/leann-backend-diskann/third_party/DiskANN/python/tests/fixtures/build_memory_index.py
+++ b/packages/leann-backend-diskann/third_party/DiskANN/python/tests/fixtures/build_memory_index.py
@@ -0,0 +1,51 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT license.
+
+import os
+from tempfile import mkdtemp
+
+import diskannpy as dap
+import numpy as np
+
+from .create_test_data import random_vectors
+
+
+def build_random_vectors_and_memory_index(
+    dtype, metric, with_tags: bool = False, index_prefix: str = "ann", seed: int = 12345
+):
+    """
+    Generate random query and index vectors, then build a diskannpy memory index over the index vectors inside a
+    freshly created temporary directory. The directory is not removed by this function.
+
+    NOTE(review): the query vectors and the index vectors are both drawn with the same ``seed`` -- confirm that
+    this is intended.
+
+    :param dtype: element type of the generated vectors, forwarded to random_vectors
+    :param metric: forwarded to diskannpy.build_memory_index as distance_metric
+    :param with_tags: when True, a seeded shuffle of the uint32 values 1..10000 is passed as tags; otherwise an
+        empty string is passed
+    :param index_prefix: forwarded to diskannpy.build_memory_index
+    :param seed: seed used for both sets of random vectors and for the tag shuffle
+    :return: the tuple (metric, dtype, query vectors, index vectors, index directory, path to "vectors.bin" inside
+        that directory, tags)
+    """
+    queries: np.ndarray = random_vectors(1000, 10, dtype=dtype, seed=seed)
+    corpus: np.ndarray = random_vectors(10000, 10, dtype=dtype, seed=seed)
+    index_dir = mkdtemp()
+
+    # Start from the "no tags" value and only replace it when tags were requested.
+    tags = ""
+    if with_tags:
+        tags = np.arange(start=1, stop=10001, dtype=np.uint32)
+        np.random.default_rng(seed).shuffle(tags)
+
+    build_options = dict(
+        data=corpus,
+        distance_metric=metric,
+        index_directory=index_dir,
+        graph_degree=16,
+        complexity=32,
+        alpha=1.2,
+        num_threads=0,
+        use_pq_build=False,
+        num_pq_bytes=8,
+        use_opq=False,
+        filter_complexity=32,
+        tags=tags,
+        index_prefix=index_prefix,
+    )
+    dap.build_memory_index(**build_options)
+
+    # Nothing in this function creates this file; presumably build_memory_index writes it -- TODO confirm.
+    vectors_path = os.path.join(index_dir, "vectors.bin")
+    return metric, dtype, queries, corpus, index_dir, vectors_path, tags
--- a/packages/leann-backend-diskann/third_party/DiskANN/python/tests/fixtures/create_test_data.py
+++ b/packages/leann-backend-diskann/third_party/DiskANN/python/tests/fixtures/create_test_data.py
@@ -0,0 +1,40 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT license.
+
+from contextlib import contextmanager
+from pathlib import Path
+from tempfile import NamedTemporaryFile
+from typing import BinaryIO, Iterator
+
+import numpy as np
+
+
+def random_vectors(rows: int, dimensions: int, dtype, seed: int = 12345) -> np.ndarray:
+    """
+    Return a (rows, dimensions) array of seeded random values of the requested dtype.
+
+    :param rows: number of vectors to generate
+    :param dimensions: number of components per vector
+    :param dtype: one of np.float32, np.uint8 or np.int8
+    :param seed: seed for numpy's default_rng, so repeated calls with equal arguments give equal arrays
+    :return: the generated array
+    :raises RuntimeError: if dtype is not one of the three supported types
+    """
+    generator = np.random.default_rng(seed)
+    shape = (rows, dimensions)
+
+    if dtype == np.float32:
+        return generator.random(shape, dtype=dtype)
+
+    # For the integer draws below, low is inclusive and high is exclusive.
+    if dtype == np.uint8:
+        return generator.integers(low=0, high=256, size=shape, dtype=dtype)
+    if dtype == np.int8:
+        return generator.integers(low=-128, high=128, size=shape, dtype=dtype)
+
+    raise RuntimeError("Only np.float32, np.int8, and np.uint8 are supported")
+
+
+def write_vectors(file_handler: BinaryIO, vectors: np.ndarray):
+    """
+    Write vectors to an open binary file: first the array's shape as int32 values, then the raw array bytes.
+
+    :param file_handler: binary file object opened for writing
+    :param vectors: array to serialize
+    """
+    header = np.array(vectors.shape, dtype=np.int32)
+    for payload in (header, vectors):
+        file_handler.write(payload.tobytes())
+
+
+@contextmanager
+def vectors_as_temp_file(vectors: np.ndarray) -> Iterator[str]:
+    """
+    Context manager that serializes vectors to a named temporary file via write_vectors and yields the file's path.
+
+    The file is closed before the path is yielded and is deleted when the context exits, including when the
+    managed block (or the write itself) raises.
+
+    :param vectors: array to write to the temporary file
+    :return: yields the path of the temporary file as a string
+    """
+    # delete=False so the file survives being closed and can be reopened by path inside the managed block.
+    temp = NamedTemporaryFile(mode="wb", delete=False)
+    try:
+        with temp:
+            write_vectors(temp, vectors)
+        yield temp.name
+    finally:
+        # Runs on normal exit and on exceptions alike, so the file is never left behind.
+        Path(temp.name).unlink()
--- a/packages/leann-backend-diskann/third_party/DiskANN/python/tests/fixtures/recall.py
+++ b/packages/leann-backend-diskann/third_party/DiskANN/python/tests/fixtures/recall.py
@@ -0,0 +1,24 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT license.
+
+import numpy as np
+
+
+def calculate_recall(
+    result_set_indices: np.ndarray, truth_set_indices: np.ndarray, recall_at: int = 5
+) -> float:
+    """
+    Compute recall@k of approximate nearest neighbor results against ground truth results.
+
+    Rows of the two arrays are matched by position. For each row, the first recall_at entries of both arrays are
+    compared as sets and the number of shared indices is counted; the total is divided by the number of result
+    rows times recall_at.
+
+    :param result_set_indices: per-query neighbor indices returned by the approximate search
+    :param truth_set_indices: per-query neighbor indices from the exact (brute force) computation
+    :param recall_at: how many leading neighbors of each row are taken into account
+    :return: fraction of considered neighbors that appear in both result and truth rows
+    """
+    num_queries = result_set_indices.shape[0]
+    hits = sum(
+        len(set(result_row[:recall_at]) & set(truth_set_indices[row][:recall_at]))
+        for row, result_row in enumerate(result_set_indices)
+    )
+    return hits / (num_queries * recall_at)