Initial commit

This commit is contained in:
yichuan520030910320
2025-06-30 09:05:05 +00:00
commit 46f6cc100b
1231 changed files with 278432 additions and 0 deletions

View File

@@ -0,0 +1,6 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT license.
from .build_memory_index import build_random_vectors_and_memory_index
from .create_test_data import random_vectors, vectors_as_temp_file, write_vectors
from .recall import calculate_recall

View File

@@ -0,0 +1,51 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT license.
import os
from tempfile import mkdtemp
import diskannpy as dap
import numpy as np
from .create_test_data import random_vectors
def build_random_vectors_and_memory_index(
    dtype, metric, with_tags: bool = False, index_prefix: str = "ann", seed: int = 12345
):
    """
    Generate random query/index vectors and build a diskannpy memory index from the index vectors.

    The index is written into a freshly created temporary directory (``mkdtemp``). This function does not remove
    that directory, so cleanup is left to the caller.

    Both vector sets are produced by ``random_vectors`` with the *same* seed: 1000 x 10 for the queries and
    10000 x 10 for the index.

    :param dtype: element type forwarded to ``random_vectors``
    :param metric: distance metric forwarded to ``dap.build_memory_index`` as ``distance_metric``
    :param with_tags: when True, a shuffled array of the uint32 values 1..10000 is passed as tags; otherwise an
        empty string is passed
    :param index_prefix: forwarded to ``dap.build_memory_index``
    :param seed: seed used for both the random vectors and the tag shuffle
    :return: tuple of (metric, dtype, query_vectors, index_vectors, ann_dir, path to "vectors.bin" inside ann_dir,
        tags)
    """
    query_vectors: np.ndarray = random_vectors(1000, 10, dtype=dtype, seed=seed)
    index_vectors: np.ndarray = random_vectors(10000, 10, dtype=dtype, seed=seed)
    ann_dir = mkdtemp()

    # An empty string is what gets handed to diskannpy when no tags are requested.
    tags = ""
    if with_tags:
        shuffler = np.random.default_rng(seed)
        tags = np.arange(start=1, stop=10001, dtype=np.uint32)
        shuffler.shuffle(tags)

    build_kwargs = {
        "data": index_vectors,
        "distance_metric": metric,
        "index_directory": ann_dir,
        "graph_degree": 16,
        "complexity": 32,
        "alpha": 1.2,
        "num_threads": 0,
        "use_pq_build": False,
        "num_pq_bytes": 8,
        "use_opq": False,
        "filter_complexity": 32,
        "tags": tags,
        "index_prefix": index_prefix,
    }
    dap.build_memory_index(**build_kwargs)

    # NOTE(review): presumably diskannpy writes the input vectors to this file during the build - confirm.
    vectors_path = os.path.join(ann_dir, "vectors.bin")
    return (
        metric,
        dtype,
        query_vectors,
        index_vectors,
        ann_dir,
        vectors_path,
        tags,
    )

View File

@@ -0,0 +1,40 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT license.
from contextlib import contextmanager
from pathlib import Path
from tempfile import NamedTemporaryFile
from typing import BinaryIO, Iterator

import numpy as np
def random_vectors(rows: int, dimensions: int, dtype, seed: int = 12345) -> np.ndarray:
    """
    Return a reproducible ``rows`` x ``dimensions`` array of random values of the requested dtype.

    float32 values come from ``Generator.random``; uint8 and int8 values are drawn over their full integer range.

    :param rows: number of vectors to generate
    :param dimensions: number of components per vector
    :param dtype: one of np.float32, np.uint8 or np.int8
    :param seed: seed for ``np.random.default_rng``
    :return: array of shape (rows, dimensions)
    :raises RuntimeError: if dtype is not one of the supported types
    """
    generator = np.random.default_rng(seed)
    shape = (rows, dimensions)

    if dtype == np.float32:
        return generator.random(shape, dtype=dtype)

    # For integers(), low is inclusive and high is exclusive.
    if dtype == np.uint8:
        return generator.integers(low=0, high=256, size=shape, dtype=dtype)
    if dtype == np.int8:
        return generator.integers(low=-128, high=128, size=shape, dtype=dtype)

    raise RuntimeError("Only np.float32, np.int8, and np.uint8 are supported")
def write_vectors(file_handler: BinaryIO, vectors: np.ndarray):
    """
    Write ``vectors`` to ``file_handler`` as a shape header followed by the raw array bytes.

    The header is ``vectors.shape`` encoded as int32 values; the payload is ``vectors.tobytes()``.

    :param file_handler: binary stream opened for writing
    :param vectors: array to serialize
    """
    shape_header = np.array(vectors.shape, dtype=np.int32)
    file_handler.write(shape_header.tobytes())
    file_handler.write(vectors.tobytes())
@contextmanager
def vectors_as_temp_file(vectors: np.ndarray) -> Iterator[str]:
    """
    Context manager that writes ``vectors`` to a temporary file and yields the file's path.

    The file is created with ``delete=False`` and is closed before the path is yielded, so the caller can reopen
    it by name. The file is removed when the ``with`` block exits, including when the block (or the write itself)
    raises.

    :param vectors: array to serialize via ``write_vectors``
    :return: yields the path of the temporary file
    """
    temp = NamedTemporaryFile(mode="wb", delete=False)
    try:
        # Leaving this inner ``with`` closes the handle even if the write fails.
        with temp:
            write_vectors(temp, vectors)
        yield temp.name
    finally:
        # Without try/finally an exception in the caller's block would leak the file on disk.
        Path(temp.name).unlink()

View File

@@ -0,0 +1,24 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT license.
import numpy as np
def calculate_recall(
    result_set_indices: np.ndarray, truth_set_indices: np.ndarray, recall_at: int = 5
) -> float:
    """
    Compute recall@k of approximate nearest neighbor results against a ground truth.

    result_set_indices and truth_set_indices correspond by row index. The columns in each row contain the indices
    of the nearest neighbors, with result_set_indices being the approximate nearest neighbor results and
    truth_set_indices being the brute force nearest neighbor calculation via sklearn's NearestNeighbor class.

    For every row, the first ``recall_at`` entries of both arrays are compared as sets and the number of shared
    indices is counted. The total is divided by ``rows * recall_at``.

    :param result_set_indices: approximate neighbor indices, one row per query
    :param truth_set_indices: ground-truth neighbor indices, indexed by the same row numbers
    :param recall_at: how many leading columns of each row to compare
    :return: fraction of matched neighbors, as a float
    """
    row_count = result_set_indices.shape[0]
    matched = sum(
        len(
            set(result_set_indices[row][:recall_at])
            & set(truth_set_indices[row][:recall_at])
        )
        for row in range(row_count)
    )
    return matched / (row_count * recall_at)