Initial commit
This commit is contained in:
6
packages/leann-backend-diskann/third_party/DiskANN/python/tests/fixtures/__init__.py
vendored
Normal file
6
packages/leann-backend-diskann/third_party/DiskANN/python/tests/fixtures/__init__.py
vendored
Normal file
@@ -0,0 +1,6 @@
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT license.
|
||||
|
||||
from .build_memory_index import build_random_vectors_and_memory_index
|
||||
from .create_test_data import random_vectors, vectors_as_temp_file, write_vectors
|
||||
from .recall import calculate_recall
|
||||
51
packages/leann-backend-diskann/third_party/DiskANN/python/tests/fixtures/build_memory_index.py
vendored
Normal file
51
packages/leann-backend-diskann/third_party/DiskANN/python/tests/fixtures/build_memory_index.py
vendored
Normal file
@@ -0,0 +1,51 @@
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT license.
|
||||
|
||||
import os
|
||||
from tempfile import mkdtemp
|
||||
|
||||
import diskannpy as dap
|
||||
import numpy as np
|
||||
|
||||
from .create_test_data import random_vectors
|
||||
|
||||
|
||||
def build_random_vectors_and_memory_index(
    dtype, metric, with_tags: bool = False, index_prefix: str = "ann", seed: int = 12345
):
    """
    Generate random query/index vectors and build a diskannpy memory index from the index
    vectors inside a newly created temporary directory.

    :param dtype: element type forwarded to ``random_vectors`` for both vector sets.
    :param metric: forwarded to ``diskannpy.build_memory_index`` as ``distance_metric``.
    :param with_tags: when True, the uint32 values 1..10000 are shuffled with a generator
        seeded by ``seed`` and passed as tags; when False an empty string is passed instead.
    :param index_prefix: forwarded to ``diskannpy.build_memory_index`` as ``index_prefix``.
    :param seed: seed used for both vector sets and for the tag shuffle.
    :return: the tuple ``(metric, dtype, query_vectors, index_vectors, ann_dir, vectors_path,
        tags)``, where ``vectors_path`` is ``<ann_dir>/vectors.bin``.
    """
    # NOTE(review): both vector sets are drawn with the same seed, so they are not
    # independent samples - confirm this is intended by the tests that consume them.
    query_vectors: np.ndarray = random_vectors(1000, 10, dtype=dtype, seed=seed)
    index_vectors: np.ndarray = random_vectors(10000, 10, dtype=dtype, seed=seed)
    ann_dir = mkdtemp()

    # An empty string stands in for "no tags"; it is replaced only when tags are requested.
    tags = ""
    if with_tags:
        tags = np.arange(start=1, stop=10001, dtype=np.uint32)
        np.random.default_rng(seed).shuffle(tags)

    dap.build_memory_index(
        data=index_vectors,
        distance_metric=metric,
        index_directory=ann_dir,
        graph_degree=16,
        complexity=32,
        alpha=1.2,
        num_threads=0,
        use_pq_build=False,
        num_pq_bytes=8,
        use_opq=False,
        filter_complexity=32,
        tags=tags,
        index_prefix=index_prefix,
    )

    # Presumably build_memory_index writes this file into ann_dir - verify against diskannpy.
    vectors_path = os.path.join(ann_dir, "vectors.bin")
    return (
        metric,
        dtype,
        query_vectors,
        index_vectors,
        ann_dir,
        vectors_path,
        tags,
    )
|
||||
40
packages/leann-backend-diskann/third_party/DiskANN/python/tests/fixtures/create_test_data.py
vendored
Normal file
40
packages/leann-backend-diskann/third_party/DiskANN/python/tests/fixtures/create_test_data.py
vendored
Normal file
@@ -0,0 +1,40 @@
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT license.
|
||||
|
||||
from contextlib import contextmanager
|
||||
from pathlib import Path
|
||||
from tempfile import NamedTemporaryFile
|
||||
from typing import BinaryIO
|
||||
|
||||
import numpy as np
|
||||
|
||||
|
||||
def random_vectors(rows: int, dimensions: int, dtype, seed: int = 12345) -> np.ndarray:
    """
    Return a ``(rows, dimensions)`` array of seeded random values of the requested dtype.

    ``np.float32`` values come from ``Generator.random``; ``np.uint8`` and ``np.int8`` values
    come from ``Generator.integers`` over the bounds [0, 256) and [-128, 128) respectively.

    :param rows: number of vectors to generate.
    :param dimensions: number of components per vector.
    :param dtype: one of ``np.float32``, ``np.uint8`` or ``np.int8``.
    :param seed: seed for ``np.random.default_rng``.
    :return: the generated array.
    :raises RuntimeError: if ``dtype`` is not one of the three supported types.
    """
    shape = (rows, dimensions)
    generator = np.random.default_rng(seed)

    if dtype == np.float32:
        return generator.random(shape, dtype=dtype)
    # For the integer types, low is inclusive and high is exclusive.
    if dtype == np.uint8:
        return generator.integers(low=0, high=256, size=shape, dtype=dtype)
    if dtype == np.int8:
        return generator.integers(low=-128, high=128, size=shape, dtype=dtype)

    raise RuntimeError("Only np.float32, np.int8, and np.uint8 are supported")
|
||||
|
||||
|
||||
def write_vectors(file_handler: BinaryIO, vectors: np.ndarray):
    """
    Write ``vectors`` to ``file_handler`` as a shape header followed by the raw array bytes.

    :param file_handler: writable binary stream that receives the data.
    :param vectors: array to serialize; its shape is written first as int32 values, then the
        output of ``vectors.tobytes()``.
    """
    shape_header = np.array(vectors.shape, dtype=np.int32).tobytes()
    payload = vectors.tobytes()
    for chunk in (shape_header, payload):
        file_handler.write(chunk)
|
||||
|
||||
|
||||
@contextmanager
def vectors_as_temp_file(vectors: np.ndarray) -> str:
    """
    Context manager that writes ``vectors`` to a temporary file and yields the file's path.

    The file is created with ``delete=False`` and closed before the path is yielded, so the
    consumer can reopen it by name. It is removed when the ``with`` block exits, including
    when the block (or the write itself) raises.

    :param vectors: array serialized via ``write_vectors``.
    :return: within the ``with`` statement, the path of the temporary file as a ``str``.
    """
    temp = NamedTemporaryFile(mode="wb", delete=False)
    try:
        # Closing via the file's own context manager guarantees the handle is released
        # even if write_vectors fails part-way through.
        with temp:
            write_vectors(temp, vectors)
        yield temp.name
    finally:
        # Runs on normal exit and on exceptions, so the temp file is never left behind.
        Path(temp.name).unlink()
|
||||
24
packages/leann-backend-diskann/third_party/DiskANN/python/tests/fixtures/recall.py
vendored
Normal file
24
packages/leann-backend-diskann/third_party/DiskANN/python/tests/fixtures/recall.py
vendored
Normal file
@@ -0,0 +1,24 @@
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT license.
|
||||
|
||||
import numpy as np
|
||||
|
||||
|
||||
def calculate_recall(
    result_set_indices: np.ndarray, truth_set_indices: np.ndarray, recall_at: int = 5
) -> float:
    """
    Compute recall@k of approximate nearest-neighbor results against a ground truth.

    Rows of result_set_indices and truth_set_indices correspond by row index; the columns of
    each row hold neighbor indices. For every row, the first recall_at entries of both arrays
    are compared as sets (so order and duplicates within a row are ignored) and the number of
    shared indices is accumulated.

    :param result_set_indices: approximate nearest neighbor indices, one row per query.
    :param truth_set_indices: reference nearest neighbor indices, one row per query.
    :param recall_at: number of leading columns of each row to compare.
    :return: total shared indices divided by (number of result rows * recall_at).
    """
    query_count = result_set_indices.shape[0]
    matches = sum(
        len(
            set(result_set_indices[row][:recall_at])
            & set(truth_set_indices[row][:recall_at])
        )
        for row in range(query_count)
    )
    return matches / (query_count * recall_at)
|
||||
Reference in New Issue
Block a user