Initial commit

This commit is contained in:
yichuan520030910320
2025-06-30 09:05:05 +00:00
commit 46f6cc100b
1231 changed files with 278432 additions and 0 deletions

View File

@@ -0,0 +1,82 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT license.

# Build configuration for the _diskannpy Python extension module.
cmake_minimum_required(VERSION 3.18...3.22)
set(CMAKE_CXX_STANDARD 17)

# Allow an externally supplied interpreter (e.g. from a pip/scikit-build
# driven build) to override the one find_package(Python3) would locate.
if (PYTHON_EXECUTABLE)
    set(Python3_EXECUTABLE ${PYTHON_EXECUTABLE})
endif()

find_package(Python3 COMPONENTS Interpreter Development.Module NumPy REQUIRED)

# Ask the interpreter where pybind11's CMake package lives, so find_package
# works even when pybind11 was installed via pip rather than system-wide.
execute_process(COMMAND ${Python3_EXECUTABLE} -c "import pybind11; print(pybind11.get_cmake_dir())"
                OUTPUT_VARIABLE _tmp_dir
                OUTPUT_STRIP_TRAILING_WHITESPACE COMMAND_ECHO STDOUT)
list(APPEND CMAKE_PREFIX_PATH "${_tmp_dir}")
# Now we can find pybind11
find_package(pybind11 CONFIG REQUIRED)

# NOTE(review): _numpy_include is computed here but never referenced in this
# file — presumably a leftover; confirm it is unused elsewhere before removing.
execute_process(COMMAND ${Python3_EXECUTABLE} -c "import numpy; print(numpy.get_include())"
                OUTPUT_VARIABLE _numpy_include
                OUTPUT_STRIP_TRAILING_WHITESPACE COMMAND_ECHO STDOUT)
# The module is declared with add_library rather than pybind11_add_module so
# that we control target_link_libraries ourselves; pybind11_extension() and
# pybind11_strip() below apply the equivalent per-target settings.
# see https://pybind11.readthedocs.io/en/latest/compiling.html#advanced-interface-library-targets for more details
add_library(_diskannpy MODULE
    src/module.cpp
    src/builder.cpp
    src/dynamic_memory_index.cpp
    src/static_memory_index.cpp
    src/static_disk_index.cpp
)

# Anchor the include path to this directory so the file still works when it is
# added from a parent project whose current source dir differs.
target_include_directories(_diskannpy AFTER PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include)

if (MSVC)
    # MSVC pre-defines _WINDLL for DLL builds; undefine it so the extension is
    # not compiled as a plain Windows DLL.
    target_compile_options(_diskannpy PRIVATE /U_WINDLL)
endif()

target_link_libraries(
    _diskannpy
    PRIVATE
        pybind11::module
        pybind11::lto
        pybind11::windows_extras
        ${PROJECT_NAME}
        ${DISKANN_TOOLS_TCMALLOC_LINK_OPTIONS}
        ${DISKANN_ASYNC_LIB}
)
pybind11_extension(_diskannpy)

# Bug fix: CMAKE_BUILD_TYPE must be quoted. When no build type is set the
# unquoted expansion collapsed to `if(NOT MSVC AND NOT MATCHES ...)`, which is
# a configure-time error.
if(NOT MSVC AND NOT "${CMAKE_BUILD_TYPE}" MATCHES "Debug|RelWithDebInfo")
    # Strip unnecessary sections of the binary on Linux/macOS
    pybind11_strip(_diskannpy)
endif()

set_target_properties(_diskannpy PROPERTIES CXX_VISIBILITY_PRESET "hidden"
                                            CUDA_VISIBILITY_PRESET "hidden")

# generally, the VERSION_INFO flag is set by pyproject.toml, by way of setup.py.
# attempts to locate the version within CMake fail because the version has to be available
# to pyproject.toml for the sdist to work after we build it.
if(NOT VERSION_INFO)
    set(VERSION_INFO "0.0.0dev")
endif()
target_compile_definitions(_diskannpy PRIVATE VERSION_INFO="${VERSION_INFO}")
# Add a post-build command to automatically copy the compiled Python module
# into the editable-install source tree.
if(UPDATE_EDITABLE_INSTALL)
    add_custom_command(
        TARGET _diskannpy
        POST_BUILD
        # $<TARGET_FILE:...> names the actual built artifact. The previous
        # wildcard (_diskannpy.cpython-*.so) relied on shell globbing, which
        # `cmake -E copy` does not perform, and was Linux-only.
        COMMAND ${CMAKE_COMMAND} -E copy
                $<TARGET_FILE:_diskannpy>
                ${CMAKE_SOURCE_DIR}/python/src/
        COMMENT "Copying Python module to python/src directory"
        VERBATIM
    )
endif()

View File

@@ -0,0 +1,55 @@
# diskannpy
[![DiskANN Paper](https://img.shields.io/badge/Paper-NeurIPS%3A_DiskANN-blue)](https://papers.nips.cc/paper/9527-rand-nsg-fast-accurate-billion-point-nearest-neighbor-search-on-a-single-node.pdf)
[![DiskANN Paper](https://img.shields.io/badge/Paper-Arxiv%3A_Fresh--DiskANN-blue)](https://arxiv.org/abs/2105.09613)
[![DiskANN Paper](https://img.shields.io/badge/Paper-Filtered--DiskANN-blue)](https://harsha-simhadri.org/pubs/Filtered-DiskANN23.pdf)
[![DiskANN Main](https://github.com/microsoft/DiskANN/actions/workflows/push-test.yml/badge.svg?branch=main)](https://github.com/microsoft/DiskANN/actions/workflows/push-test.yml)
[![PyPI version](https://img.shields.io/pypi/v/diskannpy.svg)](https://pypi.org/project/diskannpy/)
[![Downloads shield](https://pepy.tech/badge/diskannpy)](https://pepy.tech/project/diskannpy)
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
## Installation
Packages published to PyPI will always be built using the latest numpy major.minor release (at this time, 1.25).
Conda distributions for versions 1.19-1.25 will be completed as a future effort. In the meantime, feel free to
clone this repository and build it yourself.
## Local Build Instructions
Please see the [Project README](https://github.com/microsoft/DiskANN/blob/main/README.md) for system dependencies and requirements.
After ensuring you've followed the directions to build the project library and executables, you will be ready to also
build `diskannpy` with these additional instructions.
### Changing Numpy Version
In the root folder of DiskANN, there is a file `pyproject.toml`. You will need to edit the version of numpy in both the
`[build-system.requires]` section, as well as the `[project.dependencies]` section. The version numbers must match.
#### Linux
```bash
python3.11 -m venv venv # versions from python3.9 and up should work
source venv/bin/activate
pip install build
python -m build
```
#### Windows
```powershell
py -3.11 -m venv venv # versions from python3.9 and up should work
venv\Scripts\Activate.ps1
pip install build
python -m build
```
The built wheel will be placed in the `dist` directory in your DiskANN root. Install it using `pip install dist/<wheel name>.whl`
## Citations
Please cite this software in your work as:
```
@misc{diskann-github,
author = {Simhadri, Harsha Vardhan and Krishnaswamy, Ravishankar and Srinivasa, Gopal and Subramanya, Suhas Jayaram and Antonijevic, Andrija and Pryce, Dax and Kaczynski, David and Williams, Shane and Gollapudi, Siddarth and Sivashankar, Varun and Karia, Neel and Singh, Aditi and Jaiswal, Shikhar and Mahapatro, Neelam and Adams, Philip and Tower, Bryan and Patel, Yash},
title = {{DiskANN: Graph-structured Indices for Scalable, Fast, Fresh and Filtered Approximate Nearest Neighbor Search}},
url = {https://github.com/Microsoft/DiskANN},
version = {0.6.1},
year = {2023}
}
```

View File

@@ -0,0 +1,152 @@
import diskannpy as dap
import numpy as np
import numpy.typing as npt
import fire
from contextlib import contextmanager
from time import perf_counter
from typing import Tuple
def _basic_setup(
    dtype: str,
    query_vectors_file: str
) -> Tuple[dap.VectorDType, npt.NDArray[dap.VectorDType]]:
    """Validate the requested dtype and load the query vectors.

    Returns the validated diskannpy dtype together with the query vector
    array read from ``query_vectors_file``.
    """
    validated_dtype = dap.valid_dtype(dtype)
    queries = dap.vectors_from_binary(query_vectors_file, dtype=validated_dtype)
    return validated_dtype, queries
def dynamic(
    dtype: str,
    index_vectors_file: str,
    query_vectors_file: str,
    build_complexity: int,
    graph_degree: int,
    K: int,
    search_complexity: int,
    num_insert_threads: int,
    num_search_threads: int,
    gt_file: str = "",
):
    """End-to-end timing benchmark for a DynamicMemoryIndex.

    Inserts all vectors, deletes a random half, consolidates, re-inserts the
    deleted half, then batch-searches — timing every phase via Timer.

    :param dtype: vector element type name, validated via dap.valid_dtype
    :param index_vectors_file: binary file of vectors to index
    :param query_vectors_file: binary file of query vectors
    :param build_complexity: build-time candidate list size (L_build)
    :param graph_degree: maximum graph out-degree (R)
    :param K: neighbors to retrieve per query
    :param search_complexity: search-time candidate list size (L_search)
    :param num_insert_threads: threads used for the insert phases
    :param num_search_threads: threads used for batch_search
    :param gt_file: ground-truth file; currently unused (recall code below is
        commented out)
    """
    _dtype, vectors_to_query = _basic_setup(dtype, query_vectors_file)
    vectors_to_index = dap.vectors_from_binary(index_vectors_file, dtype=_dtype)
    npts, ndims = vectors_to_index.shape
    index = dap.DynamicMemoryIndex(
        "l2", _dtype, ndims, npts, build_complexity, graph_degree
    )
    # Tags are 1-based here: row i gets tag i + 1.
    tags = np.arange(1, npts+1, dtype=np.uintc)
    timer = Timer()
    with timer.time("batch insert"):
        index.batch_insert(vectors_to_index, tags, num_insert_threads)
    # Randomly pick half of the tags for deletion (no duplicates).
    delete_tags = np.random.choice(
        np.array(range(1, npts + 1, 1), dtype=np.uintc),
        size=int(0.5 * npts),
        replace=False
    )
    with timer.time("mark deletion"):
        for tag in delete_tags:
            index.mark_deleted(tag)
    with timer.time("consolidation"):
        index.consolidate_delete()
    # Tags are 1-based while rows are 0-based; shift back to recover vectors.
    deleted_data = vectors_to_index[delete_tags - 1, :]
    with timer.time("re-insertion"):
        index.batch_insert(deleted_data, delete_tags, num_insert_threads)
    with timer.time("batch searched"):
        tags, dists = index.batch_search(vectors_to_query, K, search_complexity, num_search_threads)
    # res_ids = tags - 1
    # if gt_file != "":
    #     recall = utils.calculate_recall_from_gt_file(K, res_ids, gt_file)
    #     print(f"recall@{K} is {recall}")
def static(
    dtype: str,
    index_directory: str,
    index_vectors_file: str,
    query_vectors_file: str,
    build_complexity: int,
    graph_degree: int,
    K: int,
    search_complexity: int,
    num_threads: int,
    gt_file: str = "",
    index_prefix: str = "ann"
):
    """Build, load, and batch-search a static in-memory index, timing phases.

    :param dtype: vector element type name, validated via dap.valid_dtype
    :param index_directory: directory receiving/holding the index artifacts
    :param index_vectors_file: binary file of vectors to index
    :param query_vectors_file: binary file of query vectors
    :param build_complexity: build-time candidate list size (L_build)
    :param graph_degree: maximum graph out-degree (R)
    :param K: neighbors to retrieve per query
    :param search_complexity: search-time candidate list size (L_search)
    :param num_threads: threads used for build and search
    :param gt_file: ground-truth file; currently unused (recall code below is
        commented out)
    :param index_prefix: filename prefix for the index artifacts
    """
    _dtype, vectors_to_query = _basic_setup(dtype, query_vectors_file)
    timer = Timer()
    with timer.time("build static index"):
        # build index
        # NOTE(review): this passes metric=/data_path= keywords, while the
        # sibling app scripts call the same APIs with distance_metric= and no
        # data_path — confirm which keyword set matches the installed diskannpy.
        dap.build_memory_index(
            data=index_vectors_file,
            metric="l2",
            vector_dtype=_dtype,
            index_directory=index_directory,
            complexity=build_complexity,
            graph_degree=graph_degree,
            num_threads=num_threads,
            index_prefix=index_prefix,
            alpha=1.2,
            use_pq_build=False,
            num_pq_bytes=8,
            use_opq=False,
        )
    with timer.time("load static index"):
        # ready search object
        index = dap.StaticMemoryIndex(
            metric="l2",
            vector_dtype=_dtype,
            data_path=index_vectors_file,
            index_directory=index_directory,
            num_threads=num_threads,  # this can be different at search time if you would like
            initial_search_complexity=search_complexity,
            index_prefix=index_prefix
        )
    ids, dists = index.batch_search(vectors_to_query, K, search_complexity, num_threads)
    # if gt_file != "":
    #     recall = utils.calculate_recall_from_gt_file(K, ids, gt_file)
    #     print(f"recall@{K} is {recall}")
def dynamic_clustered():
    """Placeholder for the clustered dynamic benchmark; not implemented yet."""
def generate_clusters():
    """Placeholder for the cluster-generation command; not implemented yet."""
class Timer:
    """Context-manager based stopwatch reporting per-operation and cumulative
    wall-clock time."""

    def __init__(self):
        # Sentinel: the overall clock starts on the first timed operation.
        self._start = -1

    @contextmanager
    def time(self, message: str):
        """Time the enclosed block; print its duration and the running total."""
        began = perf_counter()
        if self._start == -1:
            self._start = began
        yield
        finished = perf_counter()
        per_op = finished - began
        total = finished - self._start
        print(f"Operation {message} completed in {per_op:.3f}s, total: {total:.3f}s")
# CLI dispatch: each subcommand maps to one of the benchmark entry points above.
if __name__ == "__main__":
    fire.Fire({
        "in-mem-dynamic": dynamic,
        "in-mem-static": static,
        "in-mem-dynamic-clustered": dynamic_clustered,
        "generate-clusters": generate_clusters
    }, name="cli")

View File

@@ -0,0 +1,28 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT license.
import argparse
import utils
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        prog="cluster", description="kmeans cluster points in a file"
    )
    parser.add_argument("-d", "--data_type", required=True)
    parser.add_argument("-i", "--indexdata_file", required=True)
    parser.add_argument("-k", "--num_clusters", type=int, required=True)
    args = parser.parse_args()

    # Bug fix: the original called get_bin_metadata(indexdata_file) — neither
    # name exists in this module's namespace (NameError at runtime). The helper
    # lives in utils and the path comes from the parsed arguments.
    npts, ndims = utils.get_bin_metadata(args.indexdata_file)
    data = utils.bin_to_numpy(args.data_type, args.indexdata_file)
    offsets, permutation = utils.cluster_and_permute(
        args.data_type, npts, ndims, data, args.num_clusters
    )
    # Write the rows regrouped by cluster to a sibling ".cluster" file.
    permuted_data = data[permutation]
    utils.numpy_to_bin(permuted_data, args.indexdata_file + ".cluster")

View File

@@ -0,0 +1,161 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT license.
import argparse
import diskannpy
import numpy as np
import utils
def insert_and_search(
    dtype_str,
    indexdata_file,
    querydata_file,
    Lb,
    graph_degree,
    K,
    Ls,
    num_insert_threads,
    num_search_threads,
    gt_file,
) -> dict[str, float]:
    """
    Insert every vector, delete a random half, consolidate, re-insert the
    deleted half, then batch-search — timing every phase.

    :param dtype_str: "float", "int8" or "uint8"
    :param indexdata_file: binary file of vectors to insert
    :param querydata_file: binary file of query vectors
    :param Lb: build-time candidate list size (L_build)
    :param graph_degree: maximum graph out-degree (R)
    :param K: neighbors to retrieve per query
    :param Ls: search-time candidate list size (L_search)
    :param num_insert_threads: threads used for the insert phases
    :param num_search_threads: threads used for batch_search
    :param gt_file: optional ground-truth file; recall is computed when non-empty
    :return: Dictionary of timings. Key is the event and value is the number of seconds the event took
    """
    timer_results: dict[str, float] = {}
    method_timer: utils.Timer = utils.Timer()

    npts, ndims = utils.get_bin_metadata(indexdata_file)

    if dtype_str == "float":
        dtype = np.float32
    elif dtype_str == "int8":
        dtype = np.int8
    elif dtype_str == "uint8":
        dtype = np.uint8
    else:
        raise ValueError("data_type must be float, int8 or uint8")

    index = diskannpy.DynamicMemoryIndex(
        distance_metric="l2",
        vector_dtype=dtype,
        dimensions=ndims,
        max_vectors=npts,
        complexity=Lb,
        graph_degree=graph_degree
    )

    queries = diskannpy.vectors_from_file(querydata_file, dtype)
    data = diskannpy.vectors_from_file(indexdata_file, dtype)

    # Tags are 1-based (row i receives tag i + 1). np.arange replaces the
    # original element-by-element fill loop over a zeros array.
    tags = np.arange(1, npts + 1, dtype=np.uintc)
    timer = utils.Timer()
    index.batch_insert(data, tags, num_insert_threads)
    compute_seconds = timer.elapsed()
    print('batch_insert complete in', compute_seconds, 's')
    timer_results["batch_insert_seconds"] = compute_seconds

    # Sample half of the tags (without replacement) for deletion.
    delete_tags = np.random.choice(
        np.arange(1, npts + 1, dtype=np.uintc),
        size=int(0.5 * npts),
        replace=False
    )

    timer.reset()
    for tag in delete_tags:
        index.mark_deleted(tag)
    compute_seconds = timer.elapsed()
    timer_results['mark_deletion_seconds'] = compute_seconds
    print('mark deletion completed in', compute_seconds, 's')

    timer.reset()
    index.consolidate_delete()
    compute_seconds = timer.elapsed()
    print('consolidation completed in', compute_seconds, 's')
    timer_results['consolidation_completed_seconds'] = compute_seconds

    # Tags are 1-based, rows 0-based: shift back to recover the deleted rows.
    deleted_data = data[delete_tags - 1, :]

    timer.reset()
    index.batch_insert(deleted_data, delete_tags, num_insert_threads)
    compute_seconds = timer.elapsed()
    print('re-insertion completed in', compute_seconds, 's')
    timer_results['re-insertion_seconds'] = compute_seconds

    timer.reset()
    tags, dists = index.batch_search(queries, K, Ls, num_search_threads)
    compute_seconds = timer.elapsed()
    print('Batch searched', queries.shape[0], ' queries in ', compute_seconds, 's')
    timer_results['batch_searched_seconds'] = compute_seconds

    res_ids = tags - 1
    if gt_file != "":
        timer.reset()
        recall = utils.calculate_recall_from_gt_file(K, res_ids, gt_file)
        print(f"recall@{K} is {recall}")
        timer_results['recall_computed_seconds'] = timer.elapsed()

    timer_results['total_time_seconds'] = method_timer.elapsed()
    return timer_results
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        prog="in-mem-dynamic",
        description="Inserts points dynamically in a clustered order and search from vectors in a file.",
    )
    # Flag names mirror DiskANN's conventional hyperparameter abbreviations.
    parser.add_argument("-d", "--data_type", required=True)
    parser.add_argument("-i", "--indexdata_file", required=True)
    parser.add_argument("-q", "--querydata_file", required=True)
    parser.add_argument("-Lb", "--Lbuild", default=50, type=int)
    parser.add_argument("-Ls", "--Lsearch", default=50, type=int)
    parser.add_argument("-R", "--graph_degree", default=32, type=int)
    parser.add_argument("-TI", "--num_insert_threads", default=8, type=int)
    parser.add_argument("-TS", "--num_search_threads", default=8, type=int)
    parser.add_argument("-K", default=10, type=int)
    parser.add_argument("--gt_file", default="")
    parser.add_argument("--json_timings_output", required=False, default=None, help="File to write out timings to as JSON. If not specified, timings will not be written out.")
    args = parser.parse_args()

    timings = insert_and_search(
        args.data_type,
        args.indexdata_file,
        args.querydata_file,
        args.Lbuild,
        args.graph_degree,  # Build args
        args.K,
        args.Lsearch,
        args.num_insert_threads,
        args.num_search_threads,  # search args
        args.gt_file,
    )
    if args.json_timings_output is not None:
        import json
        # Record the output path inside the payload so runs are self-describing.
        timings['log_file'] = args.json_timings_output
        with open(args.json_timings_output, "w") as f:
            json.dump(timings, f)
"""
An ingest optimized example with SIFT1M
source venv/bin/activate
python python/apps/in-mem-dynamic.py -d float \
-i "$HOME/data/sift/sift_base.fbin" -q "$HOME/data/sift/sift_query.fbin" --gt_file "$HOME/data/sift/gt100_base" \
-Lb 10 -R 30 -Ls 200
"""

View File

@@ -0,0 +1,149 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT license.
import argparse
from xml.dom.pulldom import default_bufsize
import diskannpy
import numpy as np
import utils
def build_and_search(
    metric,
    dtype_str,
    index_directory,
    indexdata_file,
    querydata_file,
    Lb,
    graph_degree,
    K,
    Ls,
    num_threads,
    gt_file,
    index_prefix,
    search_only
) -> dict[str, float]:
    """
    Optionally build, then load and batch-query a static in-memory index,
    timing each phase.

    :param metric: distance metric name, e.g. "l2"
    :param dtype_str: "float", "int8" or "uint8"
    :param index_directory: directory holding/receiving the index artifacts
    :param indexdata_file: binary file of vectors to index
    :param querydata_file: binary file of query vectors
    :param Lb: build-time candidate list size (L_build)
    :param graph_degree: maximum graph out-degree (R)
    :param K: neighbors to retrieve per query (also the recall depth)
    :param Ls: search-time candidate list size (L_search)
    :param num_threads: threads used for build and search
    :param gt_file: optional ground-truth file; recall computed when non-empty
    :param index_prefix: filename prefix of the index artifacts
    :param search_only: skip the build phase when truthy
    :return: Dictionary of timings. Key is the event and value is the number of seconds the event took
             in wall-clock-time.
    """
    timer_results: dict[str, float] = {}
    method_timer: utils.Timer = utils.Timer()

    if dtype_str == "float":
        dtype = np.single
    elif dtype_str == "int8":
        dtype = np.byte
    elif dtype_str == "uint8":
        dtype = np.ubyte
    else:
        raise ValueError("data_type must be float, int8 or uint8")

    # build index
    if not search_only:
        build_index_timer = utils.Timer()
        diskannpy.build_memory_index(
            data=indexdata_file,
            distance_metric=metric,
            vector_dtype=dtype,
            index_directory=index_directory,
            complexity=Lb,
            graph_degree=graph_degree,
            num_threads=num_threads,
            index_prefix=index_prefix,
            alpha=1.2,
            use_pq_build=False,
            num_pq_bytes=8,
            use_opq=False,
        )
        timer_results["build_index_seconds"] = build_index_timer.elapsed()

    # ready search object
    load_index_timer = utils.Timer()
    index = diskannpy.StaticMemoryIndex(
        distance_metric=metric,
        vector_dtype=dtype,
        index_directory=index_directory,
        num_threads=num_threads,  # this can be different at search time if you would like
        initial_search_complexity=Ls,
        index_prefix=index_prefix
    )
    timer_results["load_index_seconds"] = load_index_timer.elapsed()

    queries = utils.bin_to_numpy(dtype, querydata_file)

    query_timer = utils.Timer()
    # Bug fix: the neighbor count was hardcoded to 10, silently ignoring the K
    # parameter — recall below was then computed at a mismatched depth whenever
    # K != 10.
    ids, dists = index.batch_search(queries, K, Ls, num_threads)
    query_time = query_timer.elapsed()
    qps = round(queries.shape[0] / query_time, 1)
    print('Batch searched', queries.shape[0], 'in', query_time, 's @', qps, 'QPS')
    timer_results["query_seconds"] = query_time

    if gt_file != "":
        recall_timer = utils.Timer()
        recall = utils.calculate_recall_from_gt_file(K, ids, gt_file)
        print(f"recall@{K} is {recall}")
        timer_results["recall_seconds"] = recall_timer.elapsed()

    timer_results['total_time_seconds'] = method_timer.elapsed()
    return timer_results
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        prog="in-mem-static",
        description="Static in-memory build and search from vectors in a file",
    )
    parser.add_argument("-m", "--metric", required=False, default="l2")
    parser.add_argument("-d", "--data_type", required=True)
    parser.add_argument("-id", "--index_directory", required=False, default=".")
    parser.add_argument("-i", "--indexdata_file", required=True)
    parser.add_argument("-q", "--querydata_file", required=True)
    parser.add_argument("-Lb", "--Lbuild", default=50, type=int)
    parser.add_argument("-Ls", "--Lsearch", default=50, type=int)
    parser.add_argument("-R", "--graph_degree", default=32, type=int)
    parser.add_argument("-T", "--num_threads", default=8, type=int)
    parser.add_argument("-K", default=10, type=int)
    parser.add_argument("-G", "--gt_file", default="")
    parser.add_argument("-ip", "--index_prefix", required=False, default="ann")
    # NOTE(review): without action="store_true" this flag takes a string value,
    # and any non-empty string (including "False") is truthy. Confirm the
    # intended CLI surface before changing it.
    parser.add_argument("--search_only", required=False, default=False)
    parser.add_argument("--json_timings_output", required=False, default=None, help="File to write out timings to as JSON. If not specified, timings will not be written out.")
    args = parser.parse_args()

    timings: dict[str, float] = build_and_search(
        args.metric,
        args.data_type,
        args.index_directory.strip(),
        args.indexdata_file.strip(),
        args.querydata_file.strip(),
        args.Lbuild,
        args.graph_degree,  # Build args
        args.K,
        args.Lsearch,
        args.num_threads,  # search args
        args.gt_file,
        args.index_prefix,
        args.search_only
    )
    if args.json_timings_output is not None:
        import json
        # Record the output path inside the payload so runs are self-describing.
        timings['log_file'] = args.json_timings_output
        with open(args.json_timings_output, "w") as f:
            json.dump(timings, f)

View File

@@ -0,0 +1,103 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT license.
import argparse
import diskannpy
import numpy as np
import utils
def insert_and_search(
    dtype_str,
    indexdata_file,
    querydata_file,
    Lb,
    graph_degree,
    num_clusters,
    num_insert_threads,
    K,
    Ls,
    num_search_threads,
    gt_file,
):
    """Cluster the input vectors, insert them into a DynamicMemoryIndex one
    cluster at a time, then batch-search and optionally report recall."""
    npts, ndims = utils.get_bin_metadata(indexdata_file)

    dtype_map = {"float": np.float32, "int8": np.int8, "uint8": np.uint8}
    if dtype_str not in dtype_map:
        raise ValueError("data_type must be float, int8 or uint8")
    dtype = dtype_map[dtype_str]

    index = diskannpy.DynamicMemoryIndex(
        distance_metric="l2",
        vector_dtype=dtype,
        dimensions=ndims,
        max_vectors=npts,
        complexity=Lb,
        graph_degree=graph_degree
    )

    queries = diskannpy.vectors_from_file(querydata_file, dtype)
    data = diskannpy.vectors_from_file(indexdata_file, dtype)

    offsets, permutation = utils.cluster_and_permute(
        dtype_str, npts, ndims, data, num_clusters
    )

    timer = utils.Timer()
    for cluster_id in range(num_clusters):
        member_range = range(offsets[cluster_id], offsets[cluster_id + 1])
        member_rows = np.array(permutation[member_range], dtype=np.uint32)
        # Tags are 1-based, so shift the 0-based row indices up by one.
        index.batch_insert(data[member_rows, :], member_rows + 1, num_insert_threads)
        print('Inserted cluster', cluster_id, 'in', timer.elapsed(), 's')

    tags, dists = index.batch_search(queries, K, Ls, num_search_threads)
    print('Batch searched', queries.shape[0], 'queries in', timer.elapsed(), 's')

    if gt_file != "":
        # Shift tags back to 0-based row ids before comparing to ground truth.
        recall = utils.calculate_recall_from_gt_file(K, tags - 1, gt_file)
        print(f"recall@{K} is {recall}")
if __name__ == "__main__":
    # NOTE(review): prog name appears copied from in-mem-dynamic.py; consider
    # "insert-in-clustered-order" for accurate --help output.
    parser = argparse.ArgumentParser(
        prog="in-mem-dynamic",
        description="Inserts points dynamically in a clustered order and search from vectors in a file.",
    )
    parser.add_argument("-d", "--data_type", required=True)
    parser.add_argument("-i", "--indexdata_file", required=True)
    parser.add_argument("-q", "--querydata_file", required=True)
    parser.add_argument("-Lb", "--Lbuild", default=50, type=int)
    parser.add_argument("-Ls", "--Lsearch", default=50, type=int)
    parser.add_argument("-R", "--graph_degree", default=32, type=int)
    parser.add_argument("-TI", "--num_insert_threads", default=8, type=int)
    parser.add_argument("-TS", "--num_search_threads", default=8, type=int)
    parser.add_argument("-C", "--num_clusters", default=32, type=int)
    parser.add_argument("-K", default=10, type=int)
    parser.add_argument("--gt_file", default="")
    args = parser.parse_args()

    insert_and_search(
        args.data_type,
        args.indexdata_file,
        args.querydata_file,
        args.Lbuild,
        args.graph_degree,  # Build args
        args.num_clusters,
        args.num_insert_threads,
        args.K,
        args.Lsearch,
        args.num_search_threads,  # search args
        args.gt_file,
    )

# An ingest optimized example with SIFT1M
# python3 ~/DiskANN/python/apps/insert-in-clustered-order.py -d float \
#   -i sift_base.fbin -q sift_query.fbin --gt_file gt100_base \
#   -Lb 10 -R 30 -Ls 200 -C 32

View File

@@ -0,0 +1,120 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT license.
import numpy as np
from scipy.cluster.vq import vq, kmeans2
from typing import Tuple
from time import perf_counter
def get_bin_metadata(bin_file) -> Tuple[int, int]:
    """Read the (num_points, num_dimensions) header of a DiskANN binary file.

    The file begins with two uint32 values: point count, then dimensionality.
    """
    header = np.fromfile(file=bin_file, dtype=np.uint32, count=2)
    npts, ndims = header
    return npts, ndims
def bin_to_numpy(dtype, bin_file) -> np.ndarray:
    """Load the vectors of a DiskANN binary file into an (npts, ndims) array.

    :param dtype: numpy dtype (or dtype name) of the stored elements
    :param bin_file: path to the file (8-byte header followed by row-major data)
    """
    # Header: two uint32 values (point count, dimensionality) = 8 bytes.
    npts, ndims = np.fromfile(file=bin_file, dtype=np.uint32, count=2)
    return np.fromfile(file=bin_file, dtype=dtype, offset=8).reshape(npts, ndims)
class Timer:
    """Stopwatch measuring wall-clock time between calls.

    Bug fix: ``last`` was previously a *class* attribute initialized once at
    import time, so every Timer() shared the same reference point and a fresh
    instance measured time since module import rather than since construction.
    It is now a per-instance attribute set in __init__.
    """

    def __init__(self):
        # Per-instance start time (perf_counter reference, in seconds).
        self.last = perf_counter()

    def reset(self):
        """Restart the stopwatch from now."""
        self.last = perf_counter()

    def elapsed(self, round_digit: int = 3):
        """Return seconds since construction/last call (rounded) and restart."""
        now = perf_counter()
        elapsed_time = now - self.last
        self.last = now
        return round(elapsed_time, round_digit)
def numpy_to_bin(array, out_file):
    """Write a 2-D numpy array in DiskANN binary format.

    Layout: uint32 npts, uint32 ndims, then the row-major element data.

    Bug fix: np.shape() returns a tuple of plain Python ints, which have no
    .astype() method — the previous shape[0].astype(np.uint32) raised
    AttributeError. The counts are now converted with np.uint32(). The file is
    also opened via a context manager so the handle closes on write errors.
    """
    npts, ndims = np.shape(array)
    with open(out_file, "wb") as f:
        f.write(np.uint32(npts).tobytes())
        f.write(np.uint32(ndims).tobytes())
        f.write(array.tobytes())
def read_gt_file(gt_file) -> Tuple[np.ndarray[int], np.ndarray[float]]:
    """
    Return ids and distances to queries
    """
    num_queries, K = get_bin_metadata(gt_file)
    # uint32 ids (4 bytes each) precede the float32 distance block.
    id_block_bytes = num_queries * K * 4
    ids = np.fromfile(
        file=gt_file, dtype=np.uint32, offset=8, count=num_queries * K
    ).reshape(num_queries, K)
    dists = np.fromfile(
        file=gt_file, dtype=np.float32, offset=8 + id_block_bytes, count=num_queries * K
    ).reshape(num_queries, K)
    return ids, dists
def calculate_recall(
    result_set_indices: np.ndarray[int],
    truth_set_indices: np.ndarray[int],
    recall_at: int = 5,
) -> float:
    """
    result_set_indices and truth_set_indices correspond by row index. the columns in each row contain the indices of
    the nearest neighbors, with result_set_indices being the approximate nearest neighbor results and truth_set_indices
    being the brute force nearest neighbor calculation via sklearn's NearestNeighbor class.
    :param result_set_indices:
    :param truth_set_indices:
    :param recall_at:
    :return:
    """
    num_rows = result_set_indices.shape[0]
    # Count, per row, how many of the top-`recall_at` approximate neighbors
    # appear in the top-`recall_at` ground-truth neighbors.
    hits = sum(
        len(set(result_set_indices[row][:recall_at]) & set(truth_set_indices[row][:recall_at]))
        for row in range(num_rows)
    )
    return hits / (num_rows * recall_at)
def calculate_recall_from_gt_file(K: int, ids: np.ndarray[int], gt_file: str) -> float:
    """
    Calculate recall from ids returned from search and those read from file
    """
    truth_ids, _ = read_gt_file(gt_file)
    return calculate_recall(ids, truth_ids, K)
def cluster_and_permute(
    dtype_str, npts, ndims, data, num_clusters
) -> Tuple[np.ndarray[int], np.ndarray[int]]:
    """
    Cluster the data and return permutation of row indices
    that would group indices of the same cluster together
    """
    # Fit centroids on a sample of at most 100k rows to bound kmeans cost.
    sample_size = min(100000, npts)
    sample_rows = np.random.choice(range(npts), size=sample_size, replace=False)
    centroids, _ = kmeans2(data[sample_rows, :], num_clusters, minit="++", iter=10)
    # Assign every row to its nearest centroid.
    labels, _ = vq(data, centroids)

    count = np.bincount(labels, minlength=num_clusters).astype(float)
    print("Cluster counts")
    print(count)

    # offsets[c] .. offsets[c+1] delimit cluster c's rows in the permutation.
    offsets = np.zeros(num_clusters + 1, dtype=int)
    offsets[1:] = np.cumsum(count)

    # A stable argsort on labels groups rows by cluster while preserving the
    # original relative order within each cluster — equivalent to the manual
    # bucket-fill loop it replaces.
    permutation = np.argsort(labels, kind="stable")
    return offsets, permutation

View File

@@ -0,0 +1,27 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
#pragma once
#include <cstdint>
#include <string>
#include "common.h"
#include "distance.h"
namespace diskannpy
{

// Build a disk-resident index from the vectors in data_file_path, writing
// artifacts under index_prefix_path.
// NOTE(review): the units of final_index_ram_limit / indexing_ram_budget are
// defined by the implementation — confirm in the corresponding .cpp.
template <typename DT>
void build_disk_index(diskann::Metric metric, const std::string &data_file_path, const std::string &index_prefix_path,
                      uint32_t complexity, uint32_t graph_degree, double final_index_ram_limit,
                      double indexing_ram_budget, uint32_t num_threads, uint32_t pq_disk_bytes,
                      const std::string &codebook_prefix);

// Build a fully in-memory index from vector_bin_path, writing the result to
// index_output_path. The optional filter arguments configure label-filtered
// builds; they default to "filtering disabled".
template <typename DT, typename TagT = DynamicIdType, typename LabelT = filterT>
void build_memory_index(diskann::Metric metric, const std::string &vector_bin_path,
                        const std::string &index_output_path, uint32_t graph_degree, uint32_t complexity, float alpha,
                        uint32_t num_threads, bool use_pq_build, size_t num_pq_bytes, bool use_opq,
                        bool use_tags = false, const std::string &filter_labels_file = "",
                        const std::string &universal_label = "", uint32_t filter_complexity = 0);

} // namespace diskannpy

View File

@@ -0,0 +1,24 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
#pragma once
#include <stdint.h>
#include <utility>
#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>
namespace py = pybind11;
namespace diskannpy
{

// Label type used for filtered search.
typedef uint32_t filterT;

// External id types used by the static and dynamic index wrappers.
typedef uint32_t StaticIdType;
typedef uint32_t DynamicIdType;

// (ids, distances) result pair returned by the search methods.
template <class IdType> using NeighborsAndDistances = std::pair<py::array_t<IdType>, py::array_t<float>>;

}; // namespace diskannpy

View File

@@ -0,0 +1,53 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
#pragma once
#include <cstdint>
#include <string>
#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>
#include "common.h"
#include "index.h"
#include "parameters.h"
namespace py = pybind11;
namespace diskannpy
{

// Python-facing wrapper around diskann::Index supporting insert, delete and
// search on a mutable in-memory index.
template <typename DT>
class DynamicMemoryIndex
{
  public:
    // Construction parameters feed _write_parameters (diskann::IndexWriteParameters);
    // see the corresponding .cpp for how they are forwarded.
    DynamicMemoryIndex(diskann::Metric m, size_t dimensions, size_t max_vectors, uint32_t complexity,
                       uint32_t graph_degree, bool saturate_graph, uint32_t max_occlusion_size, float alpha,
                       uint32_t num_threads, uint32_t filter_complexity, uint32_t num_frozen_points,
                       uint32_t initial_search_complexity, uint32_t initial_search_threads,
                       bool concurrent_consolidation);

    // Load a previously saved index from index_path.
    void load(const std::string &index_path);

    // Insert a single vector under external id `id`; returns a status code
    // (presumably 0 on success — confirm in the .cpp).
    int insert(const py::array_t<DT, py::array::c_style | py::array::forcecast> &vector, DynamicIdType id);

    // Insert `num_inserts` vectors in parallel; returns one status per insert.
    py::array_t<int> batch_insert(py::array_t<DT, py::array::c_style | py::array::forcecast> &vectors,
                                  py::array_t<DynamicIdType, py::array::c_style | py::array::forcecast> &ids, int32_t num_inserts,
                                  int num_threads = 0);

    // Mark `id` as deleted; see consolidate_delete() below.
    int mark_deleted(DynamicIdType id);

    void save(const std::string &save_path, bool compact_before_save = false);

    // Single-query and batched k-NN search.
    NeighborsAndDistances<DynamicIdType> search(py::array_t<DT, py::array::c_style | py::array::forcecast> &query, uint64_t knn,
                                                uint64_t complexity);
    NeighborsAndDistances<DynamicIdType> batch_search(py::array_t<DT, py::array::c_style | py::array::forcecast> &queries,
                                                      uint64_t num_queries, uint64_t knn, uint64_t complexity,
                                                      uint32_t num_threads);

    // Apply pending deletions (from mark_deleted) to the index structure.
    void consolidate_delete();
    size_t num_points();

  private:
    const uint32_t _initial_search_complexity;
    const diskann::IndexWriteParameters _write_parameters;
    diskann::Index<DT, DynamicIdType, filterT> _index;
};

}; // namespace diskannpy

View File

@@ -0,0 +1,65 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
#pragma once
#include <cstdint>
#include <string>
#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>
#ifdef _WINDOWS
#include "windows_aligned_file_reader.h"
#elif __APPLE__
#include "apple_aligned_file_reader.h"
#else
#include "linux_aligned_file_reader.h"
#endif
#include "common.h"
#include "pq_flash_index.h"
namespace py = pybind11;
namespace diskannpy
{

// Select the aligned-file-reader implementation for the host platform.
#ifdef _WINDOWS
typedef WindowsAlignedFileReader PlatformSpecificAlignedFileReader;
#elif __APPLE__
typedef AppleAlignedFileReader PlatformSpecificAlignedFileReader;
#else
typedef LinuxAlignedFileReader PlatformSpecificAlignedFileReader;
#endif

// Python-facing wrapper around diskann::PQFlashIndex, a read-only index
// accessed through AlignedFileReader instances.
template <typename DT> class StaticDiskIndex
{
  public:
    StaticDiskIndex(diskann::Metric metric, const std::string &index_path_prefix, uint32_t num_threads,
                    size_t num_nodes_to_cache, uint32_t cache_mechanism, const std::string &pq_prefix,
                    const std::string &partition_prefix);

    // Warm the node cache breadth-first from the entry point, or from the
    // access paths of a sample query workload.
    void cache_bfs_levels(size_t num_nodes_to_cache);
    void cache_sample_paths(size_t num_nodes_to_cache, const std::string &warmup_query_file, uint32_t num_threads);

    // Single-query k-NN search.
    // NOTE(review): "recompute_beighbor_embeddings" is a typo for "neighbor",
    // but the name is part of the public binding surface — renaming it here
    // would break callers; fix it in a coordinated change if at all.
    NeighborsAndDistances<StaticIdType> search(py::array_t<DT, py::array::c_style | py::array::forcecast> &query,
                                               uint64_t knn, uint64_t complexity, uint64_t beam_width,
                                               bool USE_DEFERRED_FETCH = false, bool skip_search_reorder = false,
                                               bool recompute_beighbor_embeddings = false, bool dedup_node_dis = false,
                                               float prune_ratio = 0, bool batch_recompute = false,
                                               bool global_pruning = false);

    // Batched k-NN search across num_threads workers.
    NeighborsAndDistances<StaticIdType> batch_search(
        py::array_t<DT, py::array::c_style | py::array::forcecast> &queries, uint64_t num_queries, uint64_t knn,
        uint64_t complexity, uint64_t beam_width, uint32_t num_threads, bool USE_DEFERRED_FETCH = false,
        bool skip_search_reorder = false, bool recompute_beighbor_embeddings = false, bool dedup_node_dis = false,
        float prune_ratio = 0, bool batch_recompute = false, bool global_pruning = false);

  private:
    std::shared_ptr<AlignedFileReader> _reader;
    std::shared_ptr<AlignedFileReader> _graph_reader;
    diskann::PQFlashIndex<DT> _index;
};

} // namespace diskannpy

View File

@@ -0,0 +1,40 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
#pragma once
#include <cstdint>
#include <string>
#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>
#include "common.h"
#include "index.h"
namespace py = pybind11;
namespace diskannpy
{

// Python-facing wrapper around a read-only in-memory diskann::Index.
template <typename DT> class StaticMemoryIndex
{
  public:
    StaticMemoryIndex(diskann::Metric m, const std::string &index_prefix, size_t num_points, size_t dimensions,
                      uint32_t num_threads, uint32_t initial_search_complexity);

    // Single-query k-NN search.
    NeighborsAndDistances<StaticIdType> search(py::array_t<DT, py::array::c_style | py::array::forcecast> &query,
                                               uint64_t knn, uint64_t complexity);

    // Single-query search restricted by the given filter label.
    NeighborsAndDistances<StaticIdType> search_with_filter(
        py::array_t<DT, py::array::c_style | py::array::forcecast> &query, uint64_t knn, uint64_t complexity,
        filterT filter);

    // Batched k-NN search across num_threads workers.
    NeighborsAndDistances<StaticIdType> batch_search(
        py::array_t<DT, py::array::c_style | py::array::forcecast> &queries, uint64_t num_queries, uint64_t knn,
        uint64_t complexity, uint32_t num_threads);

  private:
    diskann::Index<DT, StaticIdType, filterT> _index;
};

} // namespace diskannpy

View File

@@ -0,0 +1,138 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT license.
"""
# Documentation Overview
`diskannpy` is mostly structured around 2 distinct processes: [Index Builder Functions](#index-builders) and [Search Classes](#search-classes)
It also includes a few nascent [utilities](#utilities).
And lastly, it makes substantial use of type hints, with various shorthand [type aliases](#parameter-and-response-type-aliases) documented.
When reading the `diskannpy` code we refer to the type aliases, though `pdoc` helpfully expands them.
## Index Builders
- `build_disk_index` - To build an index that cannot fully fit into memory when searching
- `build_memory_index` - To build an index that can fully fit into memory when searching
## Search Classes
- `StaticMemoryIndex` - for indices that can fully fit in memory and won't be changed during the search operations
- `StaticDiskIndex` - for indices that cannot fully fit in memory, thus relying on disk IO to search, and also won't be changed during search operations
- `DynamicMemoryIndex` - for indices that can fully fit in memory and will be mutated via insert/deletion operations as well as search operations
## Parameter Defaults
- `diskannpy.defaults` - Default values exported from the C++ extension for Python users
## Parameter and Response Type Aliases
- `DistanceMetric` - What distance metrics does `diskannpy` support?
- `VectorDType` - What vector datatypes does `diskannpy` support?
- `QueryResponse` - What can I expect as a response to my search?
- `QueryResponseBatch` - What can I expect as a response to my batch search?
- `VectorIdentifier` - What types does `diskannpy` support as vector identifiers?
- `VectorIdentifierBatch` - A batch of identifiers of the exact same type. The type can change, but they must **all** change.
- `VectorLike` - How does a vector look to `diskannpy`, to be inserted or searched with.
- `VectorLikeBatch` - A batch of those vectors, to be inserted or searched with.
- `Metadata` - DiskANN vector binary file metadata (num_points, vector_dim)
## Utilities
- `vectors_to_file` - Turns a 2 dimensional `numpy.typing.NDArray[VectorDType]` with shape `(number_of_points, vector_dim)` into a DiskANN vector bin file.
- `vectors_from_file` - Reads a DiskANN vector bin file representing stored vectors into a numpy ndarray.
- `vectors_metadata_from_file` - Reads metadata stored in a DiskANN vector bin file without reading the entire file
- `tags_to_file` - Turns a 1 dimensional `numpy.typing.NDArray[VectorIdentifier]` into a DiskANN tags bin file.
- `tags_from_file` - Reads a DiskANN tags bin file representing stored tags into a numpy ndarray.
- `valid_dtype` - Checks if a given vector dtype is supported by `diskannpy`
"""
from typing import Any, Literal, NamedTuple, Type, Union
import numpy as np
from numpy import typing as npt
# Public type aliases. The bare strings below each alias are existing pdoc-style
# docstrings rendered in the generated documentation.
DistanceMetric = Literal["l2", "mips", "cosine"]
""" Type alias for one of {"l2", "mips", "cosine"} """

VectorDType = Union[Type[np.float32], Type[np.int8], Type[np.uint8]]
""" Type alias for one of {`numpy.float32`, `numpy.int8`, `numpy.uint8`} """

VectorLike = npt.NDArray[VectorDType]
""" Type alias for something that can be treated as a vector """

VectorLikeBatch = npt.NDArray[VectorDType]
""" Type alias for a batch of VectorLikes """

VectorIdentifier = np.uint32
"""
Type alias for a vector identifier, whether it be an implicit array index identifier from StaticMemoryIndex or
StaticDiskIndex, or an explicit tag identifier from DynamicMemoryIndex
"""

VectorIdentifierBatch = npt.NDArray[np.uint32]
""" Type alias for a batch of VectorIdentifiers """
class QueryResponse(NamedTuple):
    """
    Tuple with two values, identifiers and distances. Both are 1d arrays, positionally correspond, and will contain the
    nearest neighbors from [0..k_neighbors)
    """

    identifiers: npt.NDArray[VectorIdentifier]
    """ A `numpy.typing.NDArray[VectorIdentifier]` array of vector identifiers, 1 dimensional """

    distances: npt.NDArray[np.float32]
    """
    A `numpy.typing.NDArray[numpy.float32]` of distances as calculated by the distance metric function, 1 dimensional
    """
class QueryResponseBatch(NamedTuple):
    """
    Tuple with two values, identifiers and distances. Both are 2d arrays, with dimensionality determined by the
    rows corresponding to the number of queries made, and the columns corresponding to the k neighbors
    requested. The two 2d arrays have an implicit, position-based relationship
    """

    identifiers: npt.NDArray[VectorIdentifier]
    """
    A `numpy.typing.NDArray[VectorIdentifier]` array of vector identifiers, 2 dimensional. The row corresponds to index
    of the query, and the column corresponds to the k neighbors requested
    """

    # annotation normalized to npt.NDArray to match QueryResponse.distances
    distances: npt.NDArray[np.float32]
    """
    A `numpy.typing.NDArray[numpy.float32]` of distances as calculated by the distance metric function, 2 dimensional.
    The row corresponds to the index of the query, and the column corresponds to the distance of the query to the
    *k-th* neighbor
    """
from . import defaults
from ._builder import build_disk_index, build_memory_index
from ._common import valid_dtype
from ._dynamic_memory_index import DynamicMemoryIndex
from ._files import (
Metadata,
tags_from_file,
tags_to_file,
vectors_from_file,
vectors_metadata_from_file,
vectors_to_file,
)
from ._static_disk_index import StaticDiskIndex
from ._static_memory_index import StaticMemoryIndex
__all__ = [
"build_disk_index",
"build_memory_index",
"StaticDiskIndex",
"StaticMemoryIndex",
"DynamicMemoryIndex",
"defaults",
"DistanceMetric",
"VectorDType",
"QueryResponse",
"QueryResponseBatch",
"VectorIdentifier",
"VectorIdentifierBatch",
"VectorLike",
"VectorLikeBatch",
"Metadata",
"vectors_metadata_from_file",
"vectors_to_file",
"vectors_from_file",
"tags_to_file",
"tags_from_file",
"valid_dtype",
]

View File

@@ -0,0 +1,349 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT license.
import json
import os
import shutil
from pathlib import Path
from typing import Optional, Tuple, Union
import numpy as np
from . import DistanceMetric, VectorDType, VectorIdentifierBatch, VectorLikeBatch
from . import _diskannpy as _native_dap
from ._common import (
_assert,
_assert_is_nonnegative_uint32,
_assert_is_positive_uint32,
_castable_dtype_or_raise,
_valid_metric,
_write_index_metadata,
valid_dtype,
)
from ._diskannpy import defaults
from ._files import tags_to_file, vectors_metadata_from_file, vectors_to_file
def _valid_path_and_dtype(
    data: Union[str, VectorLikeBatch],
    vector_dtype: VectorDType,
    index_path: str,
    index_prefix: str,
) -> Tuple[str, VectorDType]:
    # Resolve `data` into (path_to_vector_bin_file, canonical_vector_dtype).
    # - str input: treated as a path to an existing DiskANN vector bin file.
    # - ndarray input: the on-disk location is derived from index_path/index_prefix.
    if isinstance(data, str):
        vector_bin_path = data
        _assert(
            Path(data).exists() and Path(data).is_file(),
            "if data is of type `str`, it must both exist and be a file",
        )
        vector_dtype_actual = valid_dtype(vector_dtype)
    else:
        vector_bin_path = os.path.join(index_path, f"{index_prefix}_vectors.bin")
        # if Path(vector_bin_path).exists():
        #     raise ValueError(
        #         f"The path {vector_bin_path} already exists. Remove it and try again."
        #     )
        vector_dtype_actual = valid_dtype(data.dtype)
        # vectors_to_file(vector_file=vector_bin_path, vectors=data)
        # NOTE(review): the overwrite guard and the vectors_to_file() write above are
        # commented out, so ndarray input is never persisted here and the file at
        # vector_bin_path must already exist — confirm this is intentional before
        # relying on ndarray input.
    return vector_bin_path, vector_dtype_actual
def build_disk_index(
    data: Union[str, VectorLikeBatch],
    distance_metric: DistanceMetric,
    index_directory: str,
    complexity: int,
    graph_degree: int,
    search_memory_maximum: float,
    build_memory_maximum: float,
    num_threads: int,
    pq_disk_bytes: int = defaults.PQ_DISK_BYTES,
    vector_dtype: Optional[VectorDType] = None,
    index_prefix: str = "ann",
    codebook_prefix: str = "",
) -> None:
    """
    This function will construct a DiskANN disk index. Disk indices are ideal for very large datasets that
    are too large to fit in memory. Memory is still used, but it is primarily used to provide precise disk
    locations for fast retrieval of smaller subsets of the index without compromising much on recall.

    If you provide a numpy array, it will save this array to disk in a temp location
    in the format DiskANN's PQ Flash Index builder requires. This temp folder is deleted upon index creation completion
    or error.

    NOTE(review): in this fork the `vectors_to_file` write inside `_valid_path_and_dtype` is commented out, so
    numpy-array input is *not* persisted and `{index_prefix}_vectors.bin` must already exist inside
    `index_directory` — confirm before relying on numpy-array input.

    ## Distance Metric and Vector Datatype Restrictions
    | Metric \ Datatype | np.float32 | np.uint8 | np.int8 |
    |-------------------|------------|----------|---------|
    | L2 | ✅ | ✅ | ✅ |
    | MIPS | ✅ | ❌ | ❌ |
    | Cosine [^bug-in-disk-cosine] | ❌ | ❌ | ❌ |

    [^bug-in-disk-cosine]: For StaticDiskIndex, Cosine distances are not currently supported.

    ### Parameters
    - **data**: Either a `str` representing a path to a DiskANN vector bin file, or a numpy.ndarray,
      of a supported dtype, in 2 dimensions. Note that `vector_dtype` must be provided if data is a `str`
    - **distance_metric**: A `str`, strictly one of {"l2", "mips", "cosine"}. `l2` and `cosine` are supported for all 3
      vector dtypes, but `mips` is only available for single precision floats.
    - **index_directory**: The index files will be saved to this **existing** directory path
    - **complexity**: The size of the candidate nearest neighbor list to use when building the index. Values between 75
      and 200 are typical. Larger values will take more time to build but result in indices that provide higher recall
      for the same search complexity. Use a value that is at least as large as `graph_degree` unless you are prepared
      to compromise on quality
    - **graph_degree**: The degree of the graph index, typically between 60 and 150. A larger maximum degree will
      result in larger indices and longer indexing times, but better search quality.
    - **search_memory_maximum**: Build index with the expectation that the search will use at most
      `search_memory_maximum`, in gb.
    - **build_memory_maximum**: Build index using at most `build_memory_maximum` in gb. Building processes typically
      require more memory, while search memory can be reduced.
    - **num_threads**: Number of threads to use when creating this index. `0` is used to indicate all available
      logical processors should be used.
    - **pq_disk_bytes**: Use `0` to store uncompressed data on SSD. This allows the index to asymptote to 100%
      recall. If your vectors are too large to store in SSD, this parameter provides the option to compress the
      vectors using PQ for storing on SSD. This will trade off recall. You would also want this to be greater
      than the number of bytes used for the PQ compressed data stored in-memory. Default is `0`.
    - **vector_dtype**: Required if the provided `data` is of type `str`, else we use the `data.dtype` if np array.
    - **index_prefix**: The prefix of the index files. Defaults to "ann".
    - **codebook_prefix**: Prefix of pre-existing PQ codebook files to reuse; empty string (the default) builds
      codebooks from scratch. Passed through to the native builder — exact semantics live in the C++ extension.
    """
    _assert(
        (isinstance(data, str) and vector_dtype is not None)
        or isinstance(data, np.ndarray),
        "vector_dtype is required if data is a str representing a path to the vector bin file",
    )
    dap_metric = _valid_metric(distance_metric)
    _assert_is_positive_uint32(complexity, "complexity")
    _assert_is_positive_uint32(graph_degree, "graph_degree")
    _assert(search_memory_maximum > 0, "search_memory_maximum must be larger than 0")
    _assert(build_memory_maximum > 0, "build_memory_maximum must be larger than 0")
    _assert_is_nonnegative_uint32(num_threads, "num_threads")
    _assert_is_nonnegative_uint32(pq_disk_bytes, "pq_disk_bytes")
    _assert(index_prefix != "", "index_prefix cannot be an empty string")
    index_path = Path(index_directory)
    _assert(
        index_path.exists() and index_path.is_dir(),
        "index_directory must both exist and be a directory",
    )
    vector_bin_path, vector_dtype_actual = _valid_path_and_dtype(
        data, vector_dtype, index_directory, index_prefix
    )
    # Disk index restrictions: no cosine at all; mips only for float32 vectors.
    _assert(dap_metric != _native_dap.COSINE, "Cosine is currently not supported in StaticDiskIndex")
    if dap_metric == _native_dap.INNER_PRODUCT:
        _assert(
            vector_dtype_actual == np.float32,
            "Integral vector dtypes (np.uint8, np.int8) are not supported with distance metric mips"
        )
    num_points, dimensions = vectors_metadata_from_file(vector_bin_path)
    # Dispatch to the dtype-specific native builder.
    if vector_dtype_actual == np.uint8:
        _builder = _native_dap.build_disk_uint8_index
    elif vector_dtype_actual == np.int8:
        _builder = _native_dap.build_disk_int8_index
    else:
        _builder = _native_dap.build_disk_float_index
    index_prefix_path = os.path.join(index_directory, index_prefix)
    _builder(
        distance_metric=dap_metric,
        data_file_path=vector_bin_path,
        index_prefix_path=index_prefix_path,
        complexity=complexity,
        graph_degree=graph_degree,
        final_index_ram_limit=search_memory_maximum,
        indexing_ram_budget=build_memory_maximum,
        num_threads=num_threads,
        pq_disk_bytes=pq_disk_bytes,
        codebook_prefix=codebook_prefix,
    )
    # Sidecar metadata so later loads can recover dtype/metric/shape without user input.
    _write_index_metadata(
        index_prefix_path, vector_dtype_actual, dap_metric, num_points, dimensions
    )
def build_memory_index(
    data: Union[str, VectorLikeBatch],
    distance_metric: DistanceMetric,
    index_directory: str,
    complexity: int,
    graph_degree: int,
    num_threads: int,
    alpha: float = defaults.ALPHA,
    use_pq_build: bool = defaults.USE_PQ_BUILD,
    num_pq_bytes: int = defaults.NUM_PQ_BYTES,
    use_opq: bool = defaults.USE_OPQ,
    vector_dtype: Optional[VectorDType] = None,
    tags: Union[str, VectorIdentifierBatch] = "",
    filter_labels: Optional[list[list[str]]] = None,
    universal_label: str = "",
    filter_complexity: int = defaults.FILTER_COMPLEXITY,
    index_prefix: str = "ann",
) -> None:
    """
    This function will construct a DiskANN memory index. Memory indices are ideal for smaller datasets whose
    indices can fit into memory. Memory indices are faster than disk indices, but usually cannot scale to massive
    sizes in an individual index on an individual machine.

    `diskannpy`'s memory indices take two forms: a `diskannpy.StaticMemoryIndex`, which will not be mutated, only
    searched upon, and a `diskannpy.DynamicMemoryIndex`, which can be mutated AND searched upon in the same process.

    ## Important Note:
    You **must** determine the type of index you are building for. If you are building for a
    `diskannpy.DynamicMemoryIndex`, you **must** supply a valid value for the `tags` parameter. **Do not supply
    tags if the index is intended to be `diskannpy.StaticMemoryIndex`**!

    ## Distance Metric and Vector Datatype Restrictions
    | Metric \ Datatype | np.float32 | np.uint8 | np.int8 |
    |-------------------|------------|----------|---------|
    | L2 | ✅ | ✅ | ✅ |
    | MIPS | ✅ | ❌ | ❌ |
    | Cosine | ✅ | ✅ | ✅ |

    ### Parameters
    - **data**: Either a `str` representing a path to an existing DiskANN vector bin file, or a numpy.ndarray of a
      supported dtype in 2 dimensions. Note that `vector_dtype` must be provided if `data` is a `str`.
    - **distance_metric**: A `str`, strictly one of {"l2", "mips", "cosine"}. `l2` and `cosine` are supported for all 3
      vector dtypes, but `mips` is only available for single precision floats.
    - **index_directory**: The index files will be saved to this **existing** directory path
    - **complexity**: The size of the candidate nearest neighbor list to use when building the index. Values between 75
      and 200 are typical. Larger values will take more time to build but result in indices that provide higher recall
      for the same search complexity. Use a value that is at least as large as `graph_degree` unless you are prepared
      to compromise on quality
    - **graph_degree**: The degree of the graph index, typically between 60 and 150. A larger maximum degree will
      result in larger indices and longer indexing times, but better search quality.
    - **num_threads**: Number of threads to use when creating this index. `0` is used to indicate all available
      logical processors should be used.
    - **alpha**: The alpha parameter (>=1) is used to control the nature and number of points that are added to the
      graph. A higher alpha value (e.g., 1.4) will result in fewer hops (and IOs) to convergence, but probably more
      distance comparisons compared to a lower alpha value.
    - **use_pq_build**: Use product quantization during build. Product quantization is a lossy compression technique
      that can reduce the size of the index on disk. This will trade off recall. Default is `True`.
    - **num_pq_bytes**: The number of bytes used to store the PQ compressed data in memory. This will trade off recall.
      Default is `0`.
    - **use_opq**: Use optimized product quantization during build.
    - **vector_dtype**: Required if the provided `data` is of type `str`, else we use the `data.dtype` if np array.
    - **tags**: Tags can be defined either as a path on disk to an existing .tags file, or provided as a np.array of
      the same length as the number of vectors. Tags are used to identify vectors in the index via your *own*
      numbering conventions, and is absolutely required for loading DynamicMemoryIndex indices `from_file`.
    - **filter_labels**: An optional, but exhaustive list of categories for each vector. This is used to filter
      search results by category. If provided, this must be a list of lists, where each inner list is a list of
      categories for the corresponding vector. For example, if you have 3 vectors, and the first vector belongs to
      categories "a" and "b", the second vector belongs to category "b", and the third vector belongs to no categories,
      you would provide `filter_labels=[["a", "b"], ["b"], []]`. If you do not want to provide categories for a
      particular vector, you can provide an empty list. If you do not want to provide categories for any vectors,
      you can provide `None` for this parameter (which is the default)
    - **universal_label**: An optional label that indicates that this vector should be included in *every* search
      in which it also meets the knn search criteria.
    - **filter_complexity**: Complexity to use when using filters. Default is 0. 0 is strictly invalid if you are
      using filters.
    - **index_prefix**: The prefix of the index files. Defaults to "ann".
    """
    _assert(
        (isinstance(data, str) and vector_dtype is not None)
        or isinstance(data, np.ndarray),
        "vector_dtype is required if data is a str representing a path to the vector bin file",
    )
    dap_metric = _valid_metric(distance_metric)
    _assert_is_positive_uint32(complexity, "complexity")
    _assert_is_positive_uint32(graph_degree, "graph_degree")
    _assert(
        alpha >= 1,
        "alpha must be >= 1, and realistically should be kept between [1.0, 2.0)",
    )
    _assert_is_nonnegative_uint32(num_threads, "num_threads")
    _assert_is_nonnegative_uint32(num_pq_bytes, "num_pq_bytes")
    _assert_is_nonnegative_uint32(filter_complexity, "filter_complexity")
    _assert(index_prefix != "", "index_prefix cannot be an empty string")
    _assert(
        filter_labels is None or filter_complexity > 0,
        "if filter_labels is provided, filter_complexity must not be 0"
    )
    index_path = Path(index_directory)
    _assert(
        index_path.exists() and index_path.is_dir(),
        "index_directory must both exist and be a directory",
    )
    vector_bin_path, vector_dtype_actual = _valid_path_and_dtype(
        data, vector_dtype, index_directory, index_prefix
    )
    # mips is only supported for float32 vectors
    if dap_metric == _native_dap.INNER_PRODUCT:
        _assert(
            vector_dtype_actual == np.float32,
            "Integral vector dtypes (np.uint8, np.int8) are not supported with distance metric mips"
        )
    num_points, dimensions = vectors_metadata_from_file(vector_bin_path)
    if filter_labels is not None:
        _assert(
            len(filter_labels) == num_points,
            "filter_labels must be the same length as the number of points"
        )
    # Dispatch to the dtype-specific native builder.
    if vector_dtype_actual == np.uint8:
        _builder = _native_dap.build_memory_uint8_index
    elif vector_dtype_actual == np.int8:
        _builder = _native_dap.build_memory_int8_index
    else:
        _builder = _native_dap.build_memory_float_index
    index_prefix_path = os.path.join(index_directory, index_prefix)
    filter_labels_file = ""
    if filter_labels is not None:
        # Write one line per vector: its comma-joined labels, or "default" for an
        # unlabeled vector. A {label: count} summary is saved alongside as JSON.
        label_counts = {}
        filter_labels_file = f"{index_prefix_path}_pylabels.txt"
        with open(filter_labels_file, "w") as labels_file:
            for labels in filter_labels:
                for label in labels:
                    label_counts[label] = 1 if label not in label_counts else label_counts[label] + 1
                if len(labels) == 0:
                    print("default", file=labels_file)
                else:
                    print(",".join(labels), file=labels_file)
        with open(f"{index_prefix_path}_label_metadata.json", "w") as label_metadata_file:
            json.dump(label_counts, label_metadata_file, indent=True)
    # Tags: either copy an existing tags file, or validate + write the provided array.
    if isinstance(tags, str) and tags != "":
        use_tags = True
        shutil.copy(tags, index_prefix_path + ".tags")
    elif not isinstance(tags, str):
        use_tags = True
        tags_as_array = _castable_dtype_or_raise(tags, expected=np.uint32)
        _assert(len(tags_as_array.shape) == 1, "Provided tags must be 1 dimensional")
        _assert(
            tags_as_array.shape[0] == num_points,
            "Provided tags must contain an identical population to the number of points, "
            f"{tags_as_array.shape[0]=}, {num_points=}",
        )
        tags_to_file(index_prefix_path + ".tags", tags_as_array)
    else:
        use_tags = False
    _builder(
        distance_metric=dap_metric,
        data_file_path=vector_bin_path,
        index_output_path=index_prefix_path,
        complexity=complexity,
        graph_degree=graph_degree,
        alpha=alpha,
        num_threads=num_threads,
        use_pq_build=use_pq_build,
        num_pq_bytes=num_pq_bytes,
        use_opq=use_opq,
        use_tags=use_tags,
        filter_labels_file=filter_labels_file,
        universal_label=universal_label,
        filter_complexity=filter_complexity,
    )
    # Sidecar metadata so later loads can recover dtype/metric/shape without user input.
    _write_index_metadata(
        index_prefix_path, vector_dtype_actual, dap_metric, num_points, dimensions
    )

View File

@@ -0,0 +1,74 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT license.
from typing import BinaryIO, Optional, Union, overload

import numpy as np

from . import DistanceMetric, VectorDType, VectorIdentifierBatch, VectorLikeBatch
# Typing overload stubs for the package-level builder API; implementations live in `_builder.py`.
def numpy_to_diskann_file(vectors: np.ndarray, file_handler: BinaryIO): ...


# Overload: data given as a path to an existing vector bin file (vector_dtype required).
@overload
def build_disk_index(
    data: str,
    distance_metric: DistanceMetric,
    index_directory: str,
    complexity: int,
    graph_degree: int,
    search_memory_maximum: float,
    build_memory_maximum: float,
    num_threads: int,
    pq_disk_bytes: int,
    vector_dtype: VectorDType,
    index_prefix: str,
) -> None: ...


# Overload: data given as an in-memory numpy array (dtype inferred from the array).
@overload
def build_disk_index(
    data: VectorLikeBatch,
    distance_metric: DistanceMetric,
    index_directory: str,
    complexity: int,
    graph_degree: int,
    search_memory_maximum: float,
    build_memory_maximum: float,
    num_threads: int,
    pq_disk_bytes: int,
    index_prefix: str,
) -> None: ...


# Overload: memory index built from an in-memory numpy array.
@overload
def build_memory_index(
    data: VectorLikeBatch,
    distance_metric: DistanceMetric,
    index_directory: str,
    complexity: int,
    graph_degree: int,
    alpha: float,
    num_threads: int,
    use_pq_build: bool,
    num_pq_bytes: int,
    use_opq: bool,
    tags: Union[str, VectorIdentifierBatch],
    filter_labels: Optional[list[list[str]]],
    universal_label: str,
    filter_complexity: int,
    index_prefix: str
) -> None: ...
# Overload: memory index built from a path to an existing vector bin file (vector_dtype required).
@overload
def build_memory_index(
    data: str,
    distance_metric: DistanceMetric,
    index_directory: str,
    complexity: int,
    graph_degree: int,
    alpha: float,
    num_threads: int,
    use_pq_build: bool,
    num_pq_bytes: int,
    use_opq: bool,
    vector_dtype: VectorDType,
    tags: Union[str, VectorIdentifierBatch],
    # renamed from `filter_labels_file`: the implementation's keyword is `filter_labels`,
    # so keyword calls type-checked against this overload would not match the real API
    filter_labels: Optional[list[list[str]]],
    universal_label: str,
    filter_complexity: int,
    index_prefix: str
) -> None: ...

View File

@@ -0,0 +1,251 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT license.
import os
import warnings
from enum import Enum
from pathlib import Path
from typing import Literal, NamedTuple, Optional, Tuple, Type, Union
import numpy as np
from . import (
DistanceMetric,
VectorDType,
VectorIdentifierBatch,
VectorLike,
VectorLikeBatch,
)
from . import _diskannpy as _native_dap
# NOTE(review): Python star-imports honor `__all__` (lowercase), not `__ALL__`, so this
# spelling has no effect on `from ... import *` — confirm intent before renaming.
__ALL__ = ["valid_dtype"]
# Canonical vector dtypes; _assert_dtype checks castability to one of these.
_VALID_DTYPES = [np.float32, np.int8, np.uint8]
def valid_dtype(dtype: Type) -> VectorDType:
    """
    Utility method to determine whether the provided dtype is supported by `diskannpy`, and if so, the canonical
    dtype we will use internally (e.g. np.single -> np.float32).

    Raises ValueError if the dtype is not one of the canonical supported types.
    """
    _assert_dtype(dtype)
    if dtype == np.uint8:
        return np.uint8
    if dtype == np.int8:
        return np.int8
    if dtype == np.float32:
        return np.float32
    # _assert_dtype only checks *castability*, so a dtype such as np.bool_ can pass it
    # without equaling any canonical type; previously this fell through and implicitly
    # returned None, which crashed later in non-obvious places.
    raise ValueError(
        f"Vector dtype {dtype} is not one of np.float32, np.int8, or np.uint8"
    )
def _assert(statement_eval: bool, message: str):
if not statement_eval:
raise ValueError(message)
def _valid_metric(metric: str) -> _native_dap.Metric:
    """Translate a metric name ("l2" / "mips" / "cosine", any case) into the native enum."""
    if not isinstance(metric, str):
        raise ValueError("distance_metric must be a string")
    known_metrics = {
        "l2": _native_dap.L2,
        "mips": _native_dap.INNER_PRODUCT,
        "cosine": _native_dap.COSINE,
    }
    try:
        return known_metrics[metric.lower()]
    except KeyError:
        raise ValueError("distance_metric must be one of 'l2', 'mips', or 'cosine'")
def _assert_dtype(dtype: Type):
    """Raise ValueError unless `dtype` can be safely cast to a supported vector dtype."""
    castable = False
    for candidate in _VALID_DTYPES:
        if np.can_cast(dtype, candidate):
            castable = True
            break
    _assert(
        castable,
        f"Vector dtype must be of one of type {{(np.single, np.float32), (np.byte, np.int8), (np.ubyte, np.uint8)}}",
    )
def _castable_dtype_or_raise(
    data: Union[VectorLike, VectorLikeBatch, VectorIdentifierBatch], expected: np.dtype
) -> np.ndarray:
    """
    Return `data` safely cast to the `expected` dtype.

    Raises TypeError if `data` is not a numpy ndarray, or if its dtype cannot be
    safely cast to `expected`.
    """
    if not isinstance(data, np.ndarray):
        raise TypeError(
            f"expecting a numpy ndarray of dtype {expected}, not a {type(data)}"
        )
    if not np.can_cast(data.dtype, expected):
        # Previously this case fell into the same "not a {type}" message, which was
        # misleading when the argument *was* an ndarray of an incompatible dtype.
        raise TypeError(
            f"expecting a numpy ndarray castable to dtype {expected}, but got dtype {data.dtype}"
        )
    return data.astype(expected, casting="safe")
def _assert_2d(vectors: np.ndarray, name: str):
    """Raise ValueError unless `vectors` is a 2d numpy array."""
    _assert(vectors.ndim == 2, f"{name} must be 2d numpy array")
# Largest values representable as unsigned 32- and 64-bit integers.
__MAX_UINT32_VAL = 4_294_967_295
__MAX_UINT64_VAL = 18_446_744_073_709_551_615


def _assert_is_positive_uint32(test_value: int, parameter: str):
    """Raise ValueError unless `test_value` is an integer in [1, 2**32 - 1]."""
    _assert(
        # upper bound is inclusive: 4_294_967_295 itself is a valid uint32 (the
        # original `<` comparison wrongly rejected it)
        test_value is not None and 0 < test_value <= __MAX_UINT32_VAL,
        f"{parameter} must be a positive integer in the uint32 range",
    )


def _assert_is_nonnegative_uint32(test_value: int, parameter: str):
    """Raise ValueError unless `test_value` is an integer in [0, 2**32 - 1]."""
    _assert(
        test_value is not None and 0 <= test_value <= __MAX_UINT32_VAL,
        f"{parameter} must be a non-negative integer in the uint32 range",
    )


def _assert_is_nonnegative_uint64(test_value: int, parameter: str):
    """Raise ValueError unless `test_value` is an integer in [0, 2**64 - 1]."""
    _assert(
        # the original only checked the lower bound despite the message claiming the
        # uint64 range; also reject None explicitly, like the uint32 validators
        test_value is not None and 0 <= test_value <= __MAX_UINT64_VAL,
        f"{parameter} must be a non-negative integer in the uint64 range",
    )
def _assert_existing_directory(path: str, parameter: str):
    """Raise ValueError unless `path` names an existing directory."""
    candidate = Path(path)
    is_directory = candidate.exists() and candidate.is_dir()
    _assert(is_directory, f"{parameter} must be an existing directory")
def _assert_existing_file(path: str, parameter: str):
    """Raise ValueError unless `path` names an existing regular file."""
    candidate = Path(path)
    _assert(
        candidate.exists() and candidate.is_file(),
        f"{parameter} must be an existing file",
    )
class _DataType(Enum):
    """Integer encoding of the supported vector dtypes, stored in the index metadata file."""

    FLOAT32 = 0
    INT8 = 1
    UINT8 = 2

    @classmethod
    def from_type(cls, vector_dtype: VectorDType) -> "_DataType":
        # annotation fixed: the original forward reference named a nonexistent "DataType"
        """Map a numpy vector dtype to its metadata enum value."""
        if vector_dtype == np.float32:
            return cls.FLOAT32
        if vector_dtype == np.int8:
            return cls.INT8
        if vector_dtype == np.uint8:
            return cls.UINT8
        # previously an unsupported dtype silently fell through and returned None
        raise ValueError(f"Unsupported vector dtype: {vector_dtype}")

    def to_type(self) -> VectorDType:
        """Map a metadata enum value back to its numpy dtype."""
        if self is _DataType.FLOAT32:
            return np.float32
        if self is _DataType.INT8:
            return np.int8
        if self is _DataType.UINT8:
            return np.uint8
class _Metric(Enum):
    """Integer encoding of the supported distance metrics, stored in the index metadata file."""

    L2 = 0
    MIPS = 1
    COSINE = 2

    @classmethod
    def from_native(cls, metric: _native_dap.Metric) -> "_Metric":
        """Map a native extension metric constant to its metadata enum value."""
        if metric == _native_dap.L2:
            return cls.L2
        if metric == _native_dap.INNER_PRODUCT:
            return cls.MIPS
        if metric == _native_dap.COSINE:
            return cls.COSINE
        # previously an unknown metric silently fell through and returned None
        raise ValueError(f"Unsupported metric: {metric}")

    def to_native(self) -> _native_dap.Metric:
        """Map a metadata enum value back to the native extension constant."""
        if self is _Metric.L2:
            return _native_dap.L2
        if self is _Metric.MIPS:
            return _native_dap.INNER_PRODUCT
        if self is _Metric.COSINE:
            return _native_dap.COSINE

    def to_str(self) -> str:
        # annotation fixed: the original claimed `_native_dap.Metric` but returns a str
        """Return the user-facing metric name: "l2", "mips", or "cosine"."""
        if self is _Metric.L2:
            return "l2"
        if self is _Metric.MIPS:
            return "mips"
        if self is _Metric.COSINE:
            return "cosine"
def _build_metadata_path(index_path_and_prefix: str) -> str:
return index_path_and_prefix + "_metadata.bin"
def _write_index_metadata(
    index_path_and_prefix: str,
    dtype: VectorDType,
    metric: _native_dap.Metric,
    num_points: int,
    dimensions: int,
):
    """Persist (dtype, metric, num_points, dimensions) as four uint64 values in the metadata file."""
    fields = [
        _DataType.from_type(dtype).value,
        _Metric.from_native(metric).value,
        num_points,
        dimensions,
    ]
    metadata = np.array(fields, dtype=np.uint64)
    metadata.tofile(_build_metadata_path(index_path_and_prefix))
def _read_index_metadata(
    index_path_and_prefix: str,
) -> Optional[Tuple[VectorDType, str, np.uint64, np.uint64]]:
    """
    Read (vector_dtype, metric_name, num_points, dimensions) from an index's metadata
    file, or return None when no metadata file exists next to the index.
    """
    metadata_file = _build_metadata_path(index_path_and_prefix)
    if not Path(metadata_file).exists():
        return None
    raw = np.fromfile(metadata_file, dtype=np.uint64, count=-1)
    return (
        _DataType(int(raw[0])).to_type(),
        _Metric(int(raw[1])).to_str(),
        raw[2],
        raw[3],
    )
def _ensure_index_metadata(
    index_path_and_prefix: str,
    vector_dtype: Optional[VectorDType],
    distance_metric: Optional[DistanceMetric],
    max_vectors: int,
    dimensions: Optional[int],
    warn_size_exceeded: bool = False,
) -> Tuple[VectorDType, str, np.uint64, np.uint64]:
    """
    Return (vector_dtype, distance_metric, num_vectors, dimensions) for an index: read from
    the index's metadata file when it exists, otherwise validated from the caller-supplied
    values. When `warn_size_exceeded` is True, also warn if the stored vector count meets
    or exceeds `max_vectors` (raising `max_vectors` to the stored count if necessary).
    """
    possible_metadata = _read_index_metadata(index_path_and_prefix)
    if possible_metadata is None:
        # No metadata file (e.g. the index was built by the CLI tools), so every
        # descriptive parameter must come from the caller instead.
        _assert(
            all([vector_dtype, distance_metric, dimensions]),
            # grammar fixed: "must provided" -> "must be provided"
            "distance_metric, vector_dtype, and dimensions must be provided if a corresponding metadata file has not "
            "been built for this index, such as when an index was built via the CLI tools or prior to the addition "
            "of a metadata file",
        )
        _assert_dtype(vector_dtype)
        _assert_is_positive_uint32(max_vectors, "max_vectors")
        _assert_is_positive_uint32(dimensions, "dimensions")
        return vector_dtype, distance_metric, max_vectors, dimensions  # type: ignore
    else:
        vector_dtype, distance_metric, num_vectors, dimensions = possible_metadata
        if warn_size_exceeded:
            if max_vectors is not None and num_vectors > max_vectors:
                warnings.warn(
                    "The number of vectors in the saved index exceeds the max_vectors parameter. "
                    "max_vectors is being adjusted to accommodate the dataset, but any insertions will fail."
                )
                max_vectors = num_vectors
            if num_vectors == max_vectors:
                warnings.warn(
                    "The number of vectors in the saved index equals max_vectors parameter. Any insertions will fail."
                )
        return possible_metadata
def _valid_index_prefix(index_directory: str, index_prefix: str) -> str:
    """Validate that index_directory exists and index_prefix is non-empty, then join them."""
    directory_provided = index_directory is not None and index_directory != ""
    _assert(directory_provided, "index_directory cannot be None or empty")
    _assert_existing_directory(index_directory, "index_directory")
    _assert(index_prefix != "", "index_prefix cannot be an empty string")
    return os.path.join(index_directory, index_prefix)

View File

@@ -0,0 +1,511 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT license.
import os
import warnings
from pathlib import Path
from typing import Optional
import numpy as np
from . import (
DistanceMetric,
QueryResponse,
QueryResponseBatch,
VectorDType,
VectorIdentifier,
VectorIdentifierBatch,
VectorLike,
VectorLikeBatch,
)
from . import _diskannpy as _native_dap
from ._common import (
_assert,
_assert_2d,
_assert_dtype,
_assert_existing_directory,
_assert_is_nonnegative_uint32,
_assert_is_positive_uint32,
_castable_dtype_or_raise,
_ensure_index_metadata,
_valid_index_prefix,
_valid_metric,
_write_index_metadata,
)
from ._diskannpy import defaults
__ALL__ = ["DynamicMemoryIndex"]
class DynamicMemoryIndex:
    """
    A DynamicMemoryIndex instance is used to both search and mutate a `diskannpy` memory index. This index is unlike
    either `diskannpy.StaticMemoryIndex` or `diskannpy.StaticDiskIndex` in the following ways:

    - It requires an explicit vector identifier for each vector added to it.
    - Insert and (lazy) deletion operations are provided for a flexible, living index

    The mutable aspect of this index will absolutely impact search time performance as new vectors are added and
    old deleted. `DynamicMemoryIndex.consolidate_deletes()` should be called periodically to restructure the index
    to remove deleted vectors and improve per-search performance, at the cost of an expensive index consolidation to
    occur.
    """

    @classmethod
    def from_file(
        cls,
        index_directory: str,
        max_vectors: int,
        complexity: int,
        graph_degree: int,
        saturate_graph: bool = defaults.SATURATE_GRAPH,
        max_occlusion_size: int = defaults.MAX_OCCLUSION_SIZE,
        alpha: float = defaults.ALPHA,
        num_threads: int = defaults.NUM_THREADS,
        filter_complexity: int = defaults.FILTER_COMPLEXITY,
        num_frozen_points: int = defaults.NUM_FROZEN_POINTS_DYNAMIC,
        initial_search_complexity: int = 0,
        search_threads: int = 0,
        concurrent_consolidation: bool = True,
        index_prefix: str = "ann",
        distance_metric: Optional[DistanceMetric] = None,
        vector_dtype: Optional[VectorDType] = None,
        dimensions: Optional[int] = None,
    ) -> "DynamicMemoryIndex":
        """
        The `from_file` classmethod is used to load a previously saved index from disk. This index *must* have been
        created with a valid `tags` file or `tags` np.ndarray of `diskannpy.VectorIdentifier`s. It is *strongly*
        recommended that you use the same parameters as the `diskannpy.build_memory_index()` function that created
        the index.

        ### Parameters
        - **index_directory**: The directory containing the index files. This directory must contain the following
          files:
            - `{index_prefix}.data`
            - `{index_prefix}.tags`
            - `{index_prefix}`

          It may also include the following optional files:
            - `{index_prefix}_vectors.bin`: Optional. `diskannpy` builder functions may create this file in the
              `index_directory` if the index was created from a numpy array
            - `{index_prefix}_metadata.bin`: Optional. `diskannpy` builder functions create this file to store metadata
              about the index, such as vector dtype, distance metric, number of vectors and vector dimensionality.
              If an index is built from the `diskann` cli tools, this file will not exist.
        - **max_vectors**: Capacity of the memory index including space for future insertions.
        - **complexity**: Complexity (a.k.a `L`) references the size of the list we store candidate approximate
          neighbors in. It's used during save (which is an index rebuild), and it's used as an initial search size to
          warm up our index and lower the latency for initial real searches.
        - **graph_degree**: Graph degree (a.k.a. `R`) is the maximum degree allowed for a node in the index's graph
          structure. This degree will be pruned throughout the course of the index build, but it will never grow beyond
          this value. Higher R values require longer index build times, but may result in an index showing excellent
          recall and latency characteristics.
        - **saturate_graph**: If True, the adjacency list of each node will be saturated with neighbors to have exactly
          `graph_degree` neighbors. If False, each node will have between 1 and `graph_degree` neighbors.
        - **max_occlusion_size**: The maximum number of points that can be considered by occlude_list function.
        - **alpha**: The alpha parameter (>=1) is used to control the nature and number of points that are added to the
          graph. A higher alpha value (e.g., 1.4) will result in fewer hops (and IOs) to convergence, but probably
          more distance comparisons compared to a lower alpha value.
        - **num_threads**: Number of threads to use when creating this index. `0` indicates we should use all available
          logical processors.
        - **filter_complexity**: Complexity to use when using filters. Default is 0.
        - **num_frozen_points**: Number of points to freeze. Default is 1.
        - **initial_search_complexity**: Should be set to the most common `complexity` expected to be used during the
          life of this `diskannpy.DynamicMemoryIndex` object. The working scratch memory allocated is based off of
          `initial_search_complexity` * `search_threads`. Note that it may be resized if a `search` or `batch_search`
          operation requests a space larger than can be accommodated by these values.
        - **search_threads**: Should be set to the most common `num_threads` expected to be used during the
          life of this `diskannpy.DynamicMemoryIndex` object. The working scratch memory allocated is based off of
          `initial_search_complexity` * `search_threads`. Note that it may be resized if a `batch_search`
          operation requests a space larger than can be accommodated by these values.
        - **concurrent_consolidation**: This flag dictates whether consolidation can be run alongside inserts and
          deletes, or whether the index is locked down to changes while consolidation is ongoing.
        - **index_prefix**: The prefix of the index files. Defaults to "ann".
        - **distance_metric**: A `str`, strictly one of {"l2", "mips", "cosine"}. `l2` and `cosine` are supported for all 3
          vector dtypes, but `mips` is only available for single precision floats. Default is `None`. **This
          value is only used if a `{index_prefix}_metadata.bin` file does not exist.** If it does not exist,
          you are required to provide it.
        - **vector_dtype**: The vector dtype this index has been built with. **This value is only used if a
          `{index_prefix}_metadata.bin` file does not exist.** If it does not exist, you are required to provide it.
        - **dimensions**: The vector dimensionality of this index. All new vectors inserted must be the same
          dimensionality. **This value is only used if a `{index_prefix}_metadata.bin` file does not exist.** If it
          does not exist, you are required to provide it.

        ### Returns
        A `diskannpy.DynamicMemoryIndex` object, with the index loaded from disk and ready to use for insertions,
        deletions, and searches.
        """
        index_prefix_path = _valid_index_prefix(index_directory, index_prefix)

        # A dynamic index cannot be reloaded without its tags file; fail fast with a clear message.
        tags_file = index_prefix_path + ".tags"
        _assert(
            Path(tags_file).exists(),
            f"The file {tags_file} does not exist in {index_directory}",
        )
        vector_dtype, dap_metric, num_vectors, dimensions = _ensure_index_metadata(
            index_prefix_path, vector_dtype, distance_metric, max_vectors, dimensions, warn_size_exceeded=True
        )

        index = cls(
            distance_metric=dap_metric,  # type: ignore
            vector_dtype=vector_dtype,
            dimensions=dimensions,
            max_vectors=max_vectors,
            complexity=complexity,
            graph_degree=graph_degree,
            saturate_graph=saturate_graph,
            max_occlusion_size=max_occlusion_size,
            alpha=alpha,
            num_threads=num_threads,
            filter_complexity=filter_complexity,
            num_frozen_points=num_frozen_points,
            initial_search_complexity=initial_search_complexity,
            search_threads=search_threads,
            concurrent_consolidation=concurrent_consolidation,
        )
        index._index.load(index_prefix_path)
        index._num_vectors = num_vectors  # current number of vectors loaded
        return index

    def __init__(
        self,
        distance_metric: DistanceMetric,
        vector_dtype: VectorDType,
        dimensions: int,
        max_vectors: int,
        complexity: int,
        graph_degree: int,
        saturate_graph: bool = defaults.SATURATE_GRAPH,
        max_occlusion_size: int = defaults.MAX_OCCLUSION_SIZE,
        alpha: float = defaults.ALPHA,
        num_threads: int = defaults.NUM_THREADS,
        filter_complexity: int = defaults.FILTER_COMPLEXITY,
        num_frozen_points: int = defaults.NUM_FROZEN_POINTS_DYNAMIC,
        initial_search_complexity: int = 0,
        search_threads: int = 0,
        concurrent_consolidation: bool = True,
    ):
        """
        The `diskannpy.DynamicMemoryIndex` represents our python API into a mutable DiskANN memory index.

        This constructor is used to create a new, empty index. If you wish to load a previously saved index from disk,
        please use the `diskannpy.DynamicMemoryIndex.from_file` classmethod instead.

        ### Parameters
        - **distance_metric**: A `str`, strictly one of {"l2", "mips", "cosine"}. `l2` and `cosine` are supported for all 3
          vector dtypes, but `mips` is only available for single precision floats.
        - **vector_dtype**: One of {`np.float32`, `np.int8`, `np.uint8`}. The dtype of the vectors this index will
          be storing.
        - **dimensions**: The vector dimensionality of this index. All new vectors inserted must be the same
          dimensionality.
        - **max_vectors**: Capacity of the data store including space for future insertions
        - **graph_degree**: Graph degree (a.k.a. `R`) is the maximum degree allowed for a node in the index's graph
          structure. This degree will be pruned throughout the course of the index build, but it will never grow beyond
          this value. Higher `graph_degree` values require longer index build times, but may result in an index showing
          excellent recall and latency characteristics.
        - **saturate_graph**: If True, the adjacency list of each node will be saturated with neighbors to have exactly
          `graph_degree` neighbors. If False, each node will have between 1 and `graph_degree` neighbors.
        - **max_occlusion_size**: The maximum number of points that can be considered by occlude_list function.
        - **alpha**: The alpha parameter (>=1) is used to control the nature and number of points that are added to the
          graph. A higher alpha value (e.g., 1.4) will result in fewer hops (and IOs) to convergence, but probably
          more distance comparisons compared to a lower alpha value.
        - **num_threads**: Number of threads to use when creating this index. `0` indicates we should use all available
          logical processors.
        - **filter_complexity**: Complexity to use when using filters. Default is 0.
        - **num_frozen_points**: Number of points to freeze. Default is 1.
        - **initial_search_complexity**: Should be set to the most common `complexity` expected to be used during the
          life of this `diskannpy.DynamicMemoryIndex` object. The working scratch memory allocated is based off of
          `initial_search_complexity` * `search_threads`. Note that it may be resized if a `search` or `batch_search`
          operation requests a space larger than can be accommodated by these values.
        - **search_threads**: Should be set to the most common `num_threads` expected to be used during the
          life of this `diskannpy.DynamicMemoryIndex` object. The working scratch memory allocated is based off of
          `initial_search_complexity` * `search_threads`. Note that it may be resized if a `batch_search`
          operation requests a space larger than can be accommodated by these values.
        - **concurrent_consolidation**: This flag dictates whether consolidation can be run alongside inserts and
          deletes, or whether the index is locked down to changes while consolidation is ongoing.
        """
        self._num_vectors = 0          # vectors currently live in the index
        self._removed_num_vectors = 0  # vectors marked deleted but not yet consolidated
        dap_metric = _valid_metric(distance_metric)
        self._dap_metric = dap_metric
        _assert_dtype(vector_dtype)
        _assert_is_positive_uint32(dimensions, "dimensions")

        self._vector_dtype = vector_dtype
        self._dimensions = dimensions

        _assert_is_positive_uint32(max_vectors, "max_vectors")
        _assert_is_positive_uint32(complexity, "complexity")
        _assert_is_positive_uint32(graph_degree, "graph_degree")
        _assert(
            alpha >= 1,
            "alpha must be >= 1, and realistically should be kept between [1.0, 2.0)",
        )
        _assert_is_nonnegative_uint32(max_occlusion_size, "max_occlusion_size")
        _assert_is_nonnegative_uint32(num_threads, "num_threads")
        _assert_is_nonnegative_uint32(filter_complexity, "filter_complexity")
        _assert_is_nonnegative_uint32(num_frozen_points, "num_frozen_points")
        _assert_is_nonnegative_uint32(
            initial_search_complexity, "initial_search_complexity"
        )
        _assert_is_nonnegative_uint32(search_threads, "search_threads")

        self._max_vectors = max_vectors
        self._complexity = complexity
        self._graph_degree = graph_degree

        # Select the native index class matching the vector dtype.
        if vector_dtype == np.uint8:
            _index = _native_dap.DynamicMemoryUInt8Index
        elif vector_dtype == np.int8:
            _index = _native_dap.DynamicMemoryInt8Index
        else:
            _index = _native_dap.DynamicMemoryFloatIndex

        self._index = _index(
            distance_metric=dap_metric,
            dimensions=dimensions,
            max_vectors=max_vectors,
            complexity=complexity,
            graph_degree=graph_degree,
            saturate_graph=saturate_graph,
            max_occlusion_size=max_occlusion_size,
            alpha=alpha,
            num_threads=num_threads,
            filter_complexity=filter_complexity,
            num_frozen_points=num_frozen_points,
            initial_search_complexity=initial_search_complexity,
            search_threads=search_threads,
            concurrent_consolidation=concurrent_consolidation,
        )
        self._points_deleted = False

    def search(
        self, query: VectorLike, k_neighbors: int, complexity: int
    ) -> QueryResponse:
        """
        Searches the index by a single query vector.

        ### Parameters
        - **query**: 1d numpy array of the same dimensionality and dtype of the index.
        - **k_neighbors**: Number of neighbors to be returned. If query vector exists in index, it almost definitely
          will be returned as well, so adjust your ``k_neighbors`` as appropriate. Must be > 0.
        - **complexity**: Size of distance ordered list of candidate neighbors to use while searching. List size
          increases accuracy at the cost of latency. Must be at least k_neighbors in size.
        """
        _query = _castable_dtype_or_raise(query, expected=self._vector_dtype)
        _assert(len(_query.shape) == 1, "query vector must be 1-d")
        _assert(
            _query.shape[0] == self._dimensions,
            f"query vector must have the same dimensionality as the index; index dimensionality: {self._dimensions}, "
            f"query dimensionality: {_query.shape[0]}",
        )
        _assert_is_positive_uint32(k_neighbors, "k_neighbors")
        _assert_is_nonnegative_uint32(complexity, "complexity")

        if k_neighbors > complexity:
            warnings.warn(
                f"k_neighbors={k_neighbors} asked for, but list_size={complexity} was smaller. Increasing {complexity} to {k_neighbors}"
            )
            complexity = k_neighbors
        neighbors, distances = self._index.search(query=_query, knn=k_neighbors, complexity=complexity)
        return QueryResponse(identifiers=neighbors, distances=distances)

    def batch_search(
        self,
        queries: VectorLikeBatch,
        k_neighbors: int,
        complexity: int,
        num_threads: int,
    ) -> QueryResponseBatch:
        """
        Searches the index by a batch of query vectors.

        This search is parallelized and far more efficient than searching for each vector individually.

        ### Parameters
        - **queries**: 2d numpy array, with column dimensionality matching the index and row dimensionality being the
          number of queries intended to search for in parallel. Dtype must match dtype of the index.
        - **k_neighbors**: Number of neighbors to be returned. If query vector exists in index, it almost definitely
          will be returned as well, so adjust your ``k_neighbors`` as appropriate. Must be > 0.
        - **complexity**: Size of distance ordered list of candidate neighbors to use while searching. List size
          increases accuracy at the cost of latency. Must be at least k_neighbors in size.
        - **num_threads**: Number of threads to use when searching this index. (>= 0), 0 = num_threads in system
        """
        _queries = _castable_dtype_or_raise(queries, expected=self._vector_dtype)
        _assert_2d(_queries, "queries")
        _assert(
            _queries.shape[1] == self._dimensions,
            f"query vectors must have the same dimensionality as the index; index dimensionality: {self._dimensions}, "
            f"query dimensionality: {_queries.shape[1]}",
        )
        _assert_is_positive_uint32(k_neighbors, "k_neighbors")
        _assert_is_positive_uint32(complexity, "complexity")
        _assert_is_nonnegative_uint32(num_threads, "num_threads")

        if k_neighbors > complexity:
            warnings.warn(
                f"k_neighbors={k_neighbors} asked for, but list_size={complexity} was smaller. Increasing {complexity} to {k_neighbors}"
            )
            complexity = k_neighbors

        # Fix: read the shape off the validated/cast `_queries`; the raw `queries` argument may
        # be a plain sequence without a `.shape` attribute.
        num_queries, dim = _queries.shape
        neighbors, distances = self._index.batch_search(
            queries=_queries,
            num_queries=num_queries,
            knn=k_neighbors,
            complexity=complexity,
            num_threads=num_threads,
        )
        return QueryResponseBatch(identifiers=neighbors, distances=distances)

    def save(self, save_path: str, index_prefix: str = "ann"):
        """
        Saves this index to file.

        ### Parameters
        - **save_path**: The path to save these index files to.
        - **index_prefix**: The prefix of the index files. Defaults to "ann".
        """
        if save_path == "":
            raise ValueError("save_path cannot be empty")
        if index_prefix == "":
            raise ValueError("index_prefix cannot be empty")

        # Allow the prefix to interpolate the build parameters, e.g. "ann_L{complexity}_R{graph_degree}".
        index_prefix = index_prefix.format(complexity=self._complexity, graph_degree=self._graph_degree)
        _assert_existing_directory(save_path, "save_path")
        save_path = os.path.join(save_path, index_prefix)
        if self._points_deleted is True:
            warnings.warn(
                "DynamicMemoryIndex.save() currently requires DynamicMemoryIndex.consolidate_delete() to be called "
                "prior to save when items have been marked for deletion. This is being done automatically now, though "
                "it will increase the time it takes to save; on large sets of data it can take a substantial amount of "
                "time. In the future, we will implement a faster save with unconsolidated deletes, but for now this is "
                "required."
            )
            self._index.consolidate_delete()
        self._index.save(
            save_path=save_path, compact_before_save=True
        )  # we do not yet support uncompacted saves
        _write_index_metadata(
            save_path,
            self._vector_dtype,
            self._dap_metric,
            self._index.num_points(),
            self._dimensions,
        )

    def insert(self, vector: VectorLike, vector_id: VectorIdentifier):
        """
        Inserts a single vector into the index with the provided vector_id.

        If this insertion will overrun the `max_vectors` count boundaries of this index, `consolidate_delete()` will
        be executed automatically.

        ### Parameters
        - **vector**: The vector to insert. Note that dtype must match.
        - **vector_id**: The vector_id to use for this vector.
        """
        _vector = _castable_dtype_or_raise(vector, expected=self._vector_dtype)
        # Fix: validate the cast array, not the raw argument — a plain sequence has no `.shape`.
        _assert(len(_vector.shape) == 1, "insert vector must be 1-d")
        _assert_is_positive_uint32(vector_id, "vector_id")
        if self._num_vectors + 1 > self._max_vectors:
            if self._removed_num_vectors > 0:
                warnings.warn(f"Inserting this vector would overrun the max_vectors={self._max_vectors} specified at index "
                              f"construction. We are attempting to consolidate_delete() to make space.")
                self.consolidate_delete()
            else:
                raise RuntimeError(f"Inserting this vector would overrun the max_vectors={self._max_vectors} specified "
                                   f"at index construction. Unable to make space by consolidating deletions. The insert "
                                   f"operation has failed.")
        status = self._index.insert(_vector, np.uint32(vector_id))
        if status == 0:
            self._num_vectors += 1
        else:
            raise RuntimeError(
                f"Insert was unable to complete successfully; error code returned from diskann C++ lib: {status}"
            )

    def batch_insert(
        self,
        vectors: VectorLikeBatch,
        vector_ids: VectorIdentifierBatch,
        num_threads: int = 0,
    ):
        """
        Inserts a batch of vectors into the index with the provided vector_ids.

        If this batch insertion will overrun the `max_vectors` count boundaries of this index, `consolidate_delete()`
        will be executed automatically.

        ### Parameters
        - **vectors**: The 2d numpy array of vectors to insert.
        - **vector_ids**: The 1d array of vector ids to use. This array must have the same number of elements as
          the vectors array has rows. The dtype of vector_ids must be `np.uint32`
        - **num_threads**: Number of threads to use when inserting into this index. (>= 0), 0 = num_threads in system
        """
        # Fix: actually use the validated/cast result (previously it was assigned to an unused
        # `_query` variable and the raw arguments were validated/cast a second time).
        _vectors = _castable_dtype_or_raise(vectors, expected=self._vector_dtype)
        _assert(len(_vectors.shape) == 2, "vectors must be a 2-d array")
        _vector_ids = np.asarray(vector_ids).astype(dtype=np.uint32, casting="safe", copy=False)
        _assert(
            _vectors.shape[0] == _vector_ids.shape[0],
            "Number of vectors must be equal to number of ids",
        )

        if self._num_vectors + _vector_ids.shape[0] > self._max_vectors:
            # Consolidation only helps if the pending deletions would free enough slots for this
            # batch to fit. (Fix: the previous check compared max_vectors + removed count against
            # the batch size, which could trigger a futile consolidation or a spurious failure.)
            if self._num_vectors - self._removed_num_vectors + _vector_ids.shape[0] <= self._max_vectors:
                warnings.warn(f"Inserting these vectors, count={_vector_ids.shape[0]} would overrun the "
                              f"max_vectors={self._max_vectors} specified at index construction. We are attempting to "
                              f"consolidate_delete() to make space.")
                self.consolidate_delete()
            else:
                raise RuntimeError(f"Inserting these vectors count={_vector_ids.shape[0]} would overrun the "
                                   f"max_vectors={self._max_vectors} specified at index construction. Unable to make "
                                   f"space by consolidating deletions. The batch insert operation has failed.")

        statuses = self._index.batch_insert(
            _vectors, _vector_ids, _vector_ids.shape[0], num_threads
        )
        successes = []
        failures = []
        for i in range(0, len(statuses)):
            if statuses[i] == 0:
                successes.append(i)
            else:
                failures.append(i)
        self._num_vectors += len(successes)
        if len(failures) == 0:
            return
        failed_ids = _vector_ids[failures]
        raise RuntimeError(
            f"During batch insert, the following vector_ids were unable to be inserted into the index: {failed_ids}. "
            f"{len(successes)} were successfully inserted"
        )

    def mark_deleted(self, vector_id: VectorIdentifier):
        """
        Mark vector for deletion. This is a soft delete that won't return the vector id in any results, but does not
        remove it from the underlying index files or memory structure. To execute a hard delete, call this method and
        then call the much more expensive `consolidate_delete` method on this index.

        ### Parameters
        - **vector_id**: The vector id to delete. Must be a uint32.
        """
        _assert_is_positive_uint32(vector_id, "vector_id")
        self._points_deleted = True
        self._removed_num_vectors += 1
        # we do not decrement self._num_vectors until consolidate_delete
        self._index.mark_deleted(np.uint32(vector_id))

    def consolidate_delete(self):
        """
        This method actually restructures the DiskANN index to remove the items that have been marked for deletion.
        """
        self._index.consolidate_delete()
        self._points_deleted = False
        self._num_vectors -= self._removed_num_vectors
        self._removed_num_vectors = 0

View File

@@ -0,0 +1,122 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT license.
import warnings
from typing import BinaryIO, Literal, NamedTuple
import numpy as np
import numpy.typing as npt
from . import VectorDType, VectorIdentifierBatch, VectorLikeBatch
from ._common import _assert, _assert_2d, _assert_dtype, _assert_existing_file
class Metadata(NamedTuple):
    """
    DiskANN binary vector files contain a small stanza containing some metadata about them:
    two int32 values (vector count, then dimensionality) at the start of the file, as read
    and written by the helpers in this module.
    """

    num_vectors: int
    """ The number of vectors in the file. """
    dimensions: int
    """ The dimensionality of the vectors in the file. """
def vectors_metadata_from_file(vector_file: str) -> Metadata:
    """
    Read the metadata from a DiskANN binary vector file.

    ### Parameters
    - **vector_file**: The path to the vector file to read the metadata from.

    ### Returns
    `diskannpy.Metadata`
    """
    _assert_existing_file(vector_file, "vector_file")
    # The first two int32 values in the file are the vector count and the dimensionality.
    header = np.fromfile(file=vector_file, dtype=np.int32, count=2)
    return Metadata(header[0], header[1])
def _write_bin(data: np.ndarray, file_handler: BinaryIO):
if len(data.shape) == 1:
_ = file_handler.write(np.array([data.shape[0], 1], dtype=np.int32).tobytes())
else:
_ = file_handler.write(np.array(data.shape, dtype=np.int32).tobytes())
_ = file_handler.write(data.tobytes())
def vectors_to_file(vector_file: str, vectors: VectorLikeBatch) -> None:
    """
    Utility function that writes a DiskANN binary vector formatted file to the location of your choosing.

    ### Parameters
    - **vector_file**: The path to the vector file to write the vectors to.
    - **vectors**: A 2d array of dtype `numpy.float32`, `numpy.uint8`, or `numpy.int8`
    """
    _assert_dtype(vectors.dtype)
    _assert_2d(vectors, "vectors")
    with open(vector_file, "wb") as out_handle:
        _write_bin(vectors, out_handle)
def vectors_from_file(
    vector_file: str,
    dtype: VectorDType,
    use_memmap: bool = False,
    mode: Literal["r", "r+"] = "r"
) -> npt.NDArray[VectorDType]:
    """
    Read vectors from a DiskANN binary vector file.

    ### Parameters
    - **vector_file**: The path to the vector file to read the vectors from.
    - **dtype**: The data type of the vectors in the file. Ensure you match the data types exactly
    - **use_memmap**: If True, return a np.memmap, else a standard np.ndarray will be returned
    - **mode**: Read-only (r) or read-write (r+) (memmap only). Unlike np.memmap, default is read-only (r)

    ### Returns
    `numpy.typing.NDArray[dtype] | numpy.memmap`
    """
    assert mode in ("r", "r+")
    num_points, num_dims = vectors_metadata_from_file(vector_file)
    # offset=8 skips the two int32 metadata values at the head of the file.
    if use_memmap:
        return np.memmap(
            vector_file, dtype=dtype, mode=mode, offset=8, shape=(num_points, num_dims), order="C"
        )
    return np.fromfile(file=vector_file, dtype=dtype, offset=8).reshape(num_points, num_dims)
def tags_to_file(tags_file: str, tags: VectorIdentifierBatch) -> None:
    """
    Write tags to a DiskANN binary tag file.

    ### Parameters
    - **tags_file**: The path to the tag file to write the tags to.
    - **tags**: A 1d array of dtype `numpy.uint32` containing the tags to write. If you have a 2d array of tags with
      one column, you can pass it here and it will be reshaped and copied to a new array. It is more efficient for you
      to reshape on your own without copying it first, as it should be a constant time operation vs. linear time
    """
    _assert(np.can_cast(tags.dtype, np.uint32), "valid tags must be uint32")
    _assert(
        len(tags.shape) == 1 or tags.shape[1] == 1,
        "tags must be 1d or 2d with 1 column",
    )
    if len(tags.shape) == 2:
        warnings.warn(
            "Tags in 2d with one column will be reshaped and copied to a new array. "
            "It is more efficient for you to reshape without copying first."
        )
        # Fix: `ndarray.reshape(..., copy=True)` only exists in NumPy >= 2.1 and raises a
        # TypeError on earlier versions; flatten first and copy explicitly instead.
        tags = tags.reshape(tags.shape[0]).copy()
    with open(tags_file, "wb") as fh:
        _write_bin(tags.astype(np.uint32), fh)
def tags_from_file(tags_file: str) -> VectorIdentifierBatch:
    """
    Read tags from a DiskANN binary tag file and return them as a 1d array of dtype `numpy.uint32`.

    ### Parameters
    - **tags_file**: The path to the tag file to read the tags from.
    """
    _assert_existing_file(tags_file, "tags_file")
    # Tag files share the same (count, dims) metadata stanza as vector files.
    num_tags, _ = vectors_metadata_from_file(tags_file)
    return np.fromfile(file=tags_file, dtype=np.uint32, offset=8).reshape(num_tags)

View File

@@ -0,0 +1,244 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT license.
import os
import warnings
from typing import Optional
import numpy as np
from . import (
DistanceMetric,
QueryResponse,
QueryResponseBatch,
VectorDType,
VectorLike,
VectorLikeBatch,
)
from . import _diskannpy as _native_dap
from ._common import (
_assert,
_assert_2d,
_assert_is_nonnegative_uint32,
_assert_is_positive_uint32,
_castable_dtype_or_raise,
_ensure_index_metadata,
_valid_index_prefix,
_valid_metric,
)
__ALL__ = ["StaticDiskIndex"]
class StaticDiskIndex:
"""
A StaticDiskIndex is a disk-backed index that is not mutable.
"""
def __init__(
self,
index_directory: str,
num_threads: int,
num_nodes_to_cache: int,
cache_mechanism: int = 1,
distance_metric: Optional[DistanceMetric] = None,
vector_dtype: Optional[VectorDType] = None,
dimensions: Optional[int] = None,
index_prefix: str = "ann",
pq_prefix: str = "",
partition_prefix: str = "",
):
"""
### Parameters
- **index_directory**: The directory containing the index files. This directory must contain the following
files:
- `{index_prefix}_sample_data.bin`
- `{index_prefix}_mem.index.data`
- `{index_prefix}_pq_compressed.bin`
- `{index_prefix}_pq_pivots.bin`
- `{index_prefix}_sample_ids.bin`
- `{index_prefix}_disk.index`
It may also include the following optional files:
- `{index_prefix}_vectors.bin`: Optional. `diskannpy` builder functions may create this file in the
`index_directory` if the index was created from a numpy array
- `{index_prefix}_metadata.bin`: Optional. `diskannpy` builder functions create this file to store metadata
about the index, such as vector dtype, distance metric, number of vectors and vector dimensionality.
If an index is built from the `diskann` cli tools, this file will not exist.
- **num_threads**: Number of threads to use when searching this index. (>= 0), 0 = num_threads in system
- **num_nodes_to_cache**: Number of nodes to cache in memory (> -1)
- **cache_mechanism**: 1 -> use the generated sample_data.bin file for
the index to initialize a set of cached nodes, up to `num_nodes_to_cache`, 2 -> ready the cache for up to
`num_nodes_to_cache`, but do not initialize it with any nodes. Any other value disables node caching.
- **distance_metric**: A `str`, strictly one of {"l2", "mips", "cosine"}. `l2` and `cosine` are supported for all 3
vector dtypes, but `mips` is only available for single precision floats. Default is `None`. **This
value is only used if a `{index_prefix}_metadata.bin` file does not exist.** If it does not exist,
you are required to provide it.
- **vector_dtype**: The vector dtype this index has been built with. **This value is only used if a
`{index_prefix}_metadata.bin` file does not exist.** If it does not exist, you are required to provide it.
- **dimensions**: The vector dimensionality of this index. All new vectors inserted must be the same
dimensionality. **This value is only used if a `{index_prefix}_metadata.bin` file does not exist.** If it
does not exist, you are required to provide it.
- **index_prefix**: The prefix of the index files. Defaults to "ann".
"""
index_prefix_path = _valid_index_prefix(index_directory, index_prefix)
vector_dtype, metric, _, _ = _ensure_index_metadata(
index_prefix_path,
vector_dtype,
distance_metric,
1, # it doesn't matter because we don't need it in this context anyway
dimensions,
)
dap_metric = _valid_metric(metric)
_assert_is_nonnegative_uint32(num_threads, "num_threads")
_assert_is_nonnegative_uint32(num_nodes_to_cache, "num_nodes_to_cache")
self._vector_dtype = vector_dtype
if vector_dtype == np.uint8:
_index = _native_dap.StaticDiskUInt8Index
elif vector_dtype == np.int8:
_index = _native_dap.StaticDiskInt8Index
else:
_index = _native_dap.StaticDiskFloatIndex
self._index = _index(
distance_metric=dap_metric,
index_path_prefix=index_prefix_path,
num_threads=num_threads,
num_nodes_to_cache=num_nodes_to_cache,
cache_mechanism=cache_mechanism,
pq_prefix=pq_prefix,
partition_prefix=partition_prefix,
)
print("After index init")
def search(
self,
query: VectorLike,
k_neighbors: int,
complexity: int,
beam_width: int = 2,
USE_DEFERRED_FETCH: bool = False,
skip_search_reorder: bool = False,
recompute_beighbor_embeddings: bool = False,
dedup_node_dis: bool = False,
prune_ratio: float = 0,
batch_recompute: bool = False,
global_pruning: bool = False,
) -> QueryResponse:
"""
Searches the index by a single query vector.
### Parameters
- **query**: 1d numpy array of the same dimensionality and dtype of the index.
- **k_neighbors**: Number of neighbors to be returned. If query vector exists in index, it almost definitely
will be returned as well, so adjust your ``k_neighbors`` as appropriate. Must be > 0.
- **complexity**: Size of distance ordered list of candidate neighbors to use while searching. List size
increases accuracy at the cost of latency. Must be at least k_neighbors in size.
- **beam_width**: The beamwidth to be used for search. This is the maximum number of IO requests each query
will issue per iteration of search code. Larger beamwidth will result in fewer IO round-trips per query,
but might result in slightly higher total number of IO requests to SSD per query. For the highest query
throughput with a fixed SSD IOps rating, use W=1. For best latency, use W=4,8 or higher complexity search.
Specifying 0 will optimize the beamwidth depending on the number of threads performing search, but will
involve some tuning overhead.
- **skip_search_reorder**: Whether to skip search reorder for diskann search.
- **recompute_beighbor_embeddings**: Whether to recompute the neighbor embeddings.
- **dedup_node_dis**: Whether to dedup node distances.
- **batch_recompute**: Whether to batch recompute.
"""
_query = _castable_dtype_or_raise(query, expected=self._vector_dtype)
_assert(len(_query.shape) == 1, "query vector must be 1-d")
_assert_is_positive_uint32(k_neighbors, "k_neighbors")
_assert_is_positive_uint32(complexity, "complexity")
_assert_is_positive_uint32(beam_width, "beam_width")
if k_neighbors > complexity:
warnings.warn(
f"{k_neighbors=} asked for, but {complexity=} was smaller. Increasing {complexity} to {k_neighbors}"
)
complexity = k_neighbors
neighbors, distances = self._index.search(
query=_query,
knn=k_neighbors,
complexity=complexity,
beam_width=beam_width,
USE_DEFERRED_FETCH=USE_DEFERRED_FETCH,
skip_search_reorder=skip_search_reorder,
recompute_beighbor_embeddings=recompute_beighbor_embeddings,
dedup_node_dis=dedup_node_dis,
prune_ratio=prune_ratio,
batch_recompute=batch_recompute,
global_pruning=global_pruning,
)
return QueryResponse(identifiers=neighbors, distances=distances)
def batch_search(
    self,
    queries: VectorLikeBatch,
    k_neighbors: int,
    complexity: int,
    num_threads: int,
    beam_width: int = 2,
    USE_DEFERRED_FETCH: bool = False,
    skip_search_reorder: bool = False,
    recompute_beighbor_embeddings: bool = False,
    dedup_node_dis: bool = False,
    prune_ratio: float = 0,
    batch_recompute: bool = False,
    global_pruning: bool = False,
) -> QueryResponseBatch:
    """
    Searches the index by a batch of query vectors.

    This search is parallelized and far more efficient than searching for each vector individually.

    ### Parameters
    - **queries**: 2d numpy array, with column dimensionality matching the index and row dimensionality being the
      number of queries intended to search for in parallel. Dtype must match dtype of the index.
    - **k_neighbors**: Number of neighbors to be returned. If query vector exists in index, it almost definitely
      will be returned as well, so adjust your ``k_neighbors`` as appropriate. Must be > 0.
    - **complexity**: Size of distance ordered list of candidate neighbors to use while searching. List size
      increases accuracy at the cost of latency. Must be at least k_neighbors in size; smaller values are raised
      to ``k_neighbors`` with a warning.
    - **num_threads**: Number of threads to use when searching this index. (>= 0), 0 = num_threads in system
    - **beam_width**: The beamwidth to be used for search. This is the maximum number of IO requests each query
      will issue per iteration of search code. Larger beamwidth will result in fewer IO round-trips per query,
      but might result in slightly higher total number of IO requests to SSD per query. For the highest query
      throughput with a fixed SSD IOps rating, use W=1. For best latency, use W=4,8 or higher complexity search.
      Specifying 0 will optimize the beamwidth depending on the number of threads performing search, but will
      involve some tuning overhead.
    - **USE_DEFERRED_FETCH**: Forwarded verbatim to the native index's batch_search.
    - **skip_search_reorder**: Whether to skip search reorder for diskann search.
    - **recompute_beighbor_embeddings**: Whether to recompute the neighbor embeddings. (The misspelled name is
      part of the public keyword interface and is kept for backward compatibility.)
    - **dedup_node_dis**: Whether to dedup node distances.
    - **prune_ratio**: Forwarded to the native index. NOTE(review): semantics defined by the native binding —
      confirm there before relying on it.
    - **batch_recompute**: Whether to batch recompute.
    - **global_pruning**: Whether to apply global pruning.
    """
    # Cast/validate the batch before handing it to the native extension.
    _queries = _castable_dtype_or_raise(queries, expected=self._vector_dtype)
    _assert_2d(_queries, "queries")
    _assert_is_positive_uint32(k_neighbors, "k_neighbors")
    _assert_is_positive_uint32(complexity, "complexity")
    _assert_is_nonnegative_uint32(num_threads, "num_threads")
    _assert_is_positive_uint32(beam_width, "beam_width")

    if k_neighbors > complexity:
        warnings.warn(
            f"{k_neighbors=} asked for, but {complexity=} was smaller. Increasing {complexity} to {k_neighbors}"
        )
        complexity = k_neighbors

    num_queries, dim = _queries.shape
    # BUG FIX: removed a leftover debug `print` of the flag values that wrote to
    # stdout on every batch search.
    neighbors, distances = self._index.batch_search(
        queries=_queries,
        num_queries=num_queries,
        knn=k_neighbors,
        complexity=complexity,
        beam_width=beam_width,
        num_threads=num_threads,
        USE_DEFERRED_FETCH=USE_DEFERRED_FETCH,
        skip_search_reorder=skip_search_reorder,
        recompute_beighbor_embeddings=recompute_beighbor_embeddings,
        dedup_node_dis=dedup_node_dis,
        prune_ratio=prune_ratio,
        batch_recompute=batch_recompute,
        global_pruning=global_pruning,
    )
    return QueryResponseBatch(identifiers=neighbors, distances=distances)

View File

@@ -0,0 +1,262 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT license.
import json
import os
import warnings
from typing import Optional
import numpy as np
from . import (
DistanceMetric,
QueryResponse,
QueryResponseBatch,
VectorDType,
VectorLike,
VectorLikeBatch,
)
from . import _diskannpy as _native_dap
from ._common import (
_assert,
_assert_is_nonnegative_uint32,
_assert_is_positive_uint32,
_castable_dtype_or_raise,
_ensure_index_metadata,
_valid_index_prefix,
_valid_metric,
)
__ALL__ = ["StaticMemoryIndex"]
class StaticMemoryIndex:
    """
    A StaticMemoryIndex is an immutable in-memory DiskANN index.

    Wraps one of the native ``StaticMemory{Float,Int8,UInt8}Index`` pybind11 classes,
    selected from the index's vector dtype.
    """

    def __init__(
        self,
        index_directory: str,
        num_threads: int,
        initial_search_complexity: int,
        index_prefix: str = "ann",
        distance_metric: Optional[DistanceMetric] = None,
        vector_dtype: Optional[VectorDType] = None,
        dimensions: Optional[int] = None,
        enable_filters: bool = False,
    ):
        """
        ### Parameters
        - **index_directory**: The directory containing the index files. This directory must contain the following
          files:
            - `{index_prefix}.data`
            - `{index_prefix}`

          It may also include the following optional files:
            - `{index_prefix}_vectors.bin`: Optional. `diskannpy` builder functions may create this file in the
              `index_directory` if the index was created from a numpy array
            - `{index_prefix}_metadata.bin`: Optional. `diskannpy` builder functions create this file to store metadata
              about the index, such as vector dtype, distance metric, number of vectors and vector dimensionality.
              If an index is built from the `diskann` cli tools, this file will not exist.
        - **num_threads**: Number of threads to use when searching this index. (>= 0), 0 = num_threads in system
        - **initial_search_complexity**: Should be set to the most common `complexity` expected to be used during the
          life of this `diskannpy.StaticMemoryIndex` object. The working scratch memory allocated is based off of
          `initial_search_complexity` * `search_threads`. Note that it may be resized if a `search` or `batch_search`
          operation requests a space larger than can be accommodated by these values.
        - **index_prefix**: The prefix of the index files. Defaults to "ann".
        - **distance_metric**: A `str`, strictly one of {"l2", "mips", "cosine"}. `l2` and `cosine` are supported for all 3
          vector dtypes, but `mips` is only available for single precision floats. Default is `None`. **This
          value is only used if a `{index_prefix}_metadata.bin` file does not exist.** If it does not exist,
          you are required to provide it.
        - **vector_dtype**: The vector dtype this index has been built with. **This value is only used if a
          `{index_prefix}_metadata.bin` file does not exist.** If it does not exist, you are required to provide it.
        - **dimensions**: The vector dimensionality of this index. All new vectors inserted must be the same
          dimensionality. **This value is only used if a `{index_prefix}_metadata.bin` file does not exist.** If it
          does not exist, you are required to provide it.
        - **enable_filters**: Indexes built with filters can also be used for filtered search.
        """
        index_prefix_path = _valid_index_prefix(index_directory, index_prefix)

        # external(str) -> internal(np.uint32) filter label map; populated only when
        # enable_filters is True
        self._labels_map = {}
        # per-label metadata loaded from `{prefix}_label_metadata.json`; used to cap
        # k_neighbors in filtered searches
        self._labels_metadata = {}
        if enable_filters:
            try:
                with open(f"{index_prefix_path}_labels_map.txt", "r") as labels_map_if:
                    for line in labels_map_if:
                        (key, val) = line.split("\t")
                        self._labels_map[key] = int(val)
                with open(
                    f"{index_prefix_path}_label_metadata.json", "r"
                ) as labels_metadata_if:
                    self._labels_metadata = json.load(labels_metadata_if)
            except Exception as e:
                # exceptions are basically presumed to be either file not found or
                # file not formatted correctly.
                # BUG FIX: this previously raised `RuntimeException`, which is not a
                # Python builtin and would itself fail with NameError.
                raise RuntimeError(
                    "Filter labels file was unable to be processed."
                ) from e

        vector_dtype, metric, num_points, dims = _ensure_index_metadata(
            index_prefix_path,
            vector_dtype,
            distance_metric,
            1,  # it doesn't matter because we don't need it in this context anyway
            dimensions,
        )
        dap_metric = _valid_metric(metric)

        _assert_is_nonnegative_uint32(num_threads, "num_threads")
        _assert_is_positive_uint32(
            initial_search_complexity, "initial_search_complexity"
        )

        self._vector_dtype = vector_dtype
        self._dimensions = dims

        # Pick the native class matching the vector dtype (float is the fallback).
        if vector_dtype == np.uint8:
            _index = _native_dap.StaticMemoryUInt8Index
        elif vector_dtype == np.int8:
            _index = _native_dap.StaticMemoryInt8Index
        else:
            _index = _native_dap.StaticMemoryFloatIndex
        self._index = _index(
            distance_metric=dap_metric,
            num_points=num_points,
            dimensions=dims,
            index_path=index_prefix_path,
            num_threads=num_threads,
            initial_search_complexity=initial_search_complexity,
        )

    def search(
        self,
        query: VectorLike,
        k_neighbors: int,
        complexity: int,
        filter_label: str = "",
        USE_DEFERRED_FETCH: bool = False,
        skip_search_reorder: bool = False,
        recompute_beighbor_embeddings: bool = False,
        dedup_node_dis: bool = False,
        prune_ratio: float = 0,
        batch_recompute: bool = False,
        global_pruning: bool = False,
    ) -> QueryResponse:
        """
        Searches the index by a single query vector.

        ### Parameters
        - **query**: 1d numpy array of the same dimensionality and dtype of the index.
        - **k_neighbors**: Number of neighbors to be returned. If query vector exists in index, it almost definitely
          will be returned as well, so adjust your ``k_neighbors`` as appropriate. Must be > 0.
        - **complexity**: Size of distance ordered list of candidate neighbors to use while searching. List size
          increases accuracy at the cost of latency. Must be at least k_neighbors in size.
        - **filter_label**: If non-empty, restricts the search to points carrying this label. Requires the index to
          have been constructed with ``enable_filters=True``. NOTE: the extra search knobs below are only forwarded
          on the unfiltered path; the filtered native call does not accept them.
        """
        if filter_label != "":
            if len(self._labels_map) == 0:
                raise ValueError(
                    f"A filter label of {filter_label} was provided, but this class was not initialized with filters "
                    "enabled, e.g. StaticMemoryIndex(..., enable_filters=True)"
                )
            if filter_label not in self._labels_map:
                raise ValueError(
                    f"A filter label of {filter_label} was provided, but the external(str)->internal(np.uint32) labels map "
                    f"does not include that label."
                )
            # Cannot return more neighbors than there are points carrying the label.
            k_neighbors = min(k_neighbors, self._labels_metadata[filter_label])
        _query = _castable_dtype_or_raise(query, expected=self._vector_dtype)
        _assert(len(_query.shape) == 1, "query vector must be 1-d")
        _assert(
            _query.shape[0] == self._dimensions,
            f"query vector must have the same dimensionality as the index; index dimensionality: {self._dimensions}, "
            f"query dimensionality: {_query.shape[0]}",
        )
        _assert_is_positive_uint32(k_neighbors, "k_neighbors")
        _assert_is_nonnegative_uint32(complexity, "complexity")

        if k_neighbors > complexity:
            warnings.warn(
                f"k_neighbors={k_neighbors} asked for, but list_size={complexity} was smaller. Increasing {complexity} to {k_neighbors}"
            )
            complexity = k_neighbors

        if filter_label == "":
            neighbors, distances = self._index.search(
                query=_query,
                knn=k_neighbors,
                complexity=complexity,
                USE_DEFERRED_FETCH=USE_DEFERRED_FETCH,
                skip_search_reorder=skip_search_reorder,
                recompute_beighbor_embeddings=recompute_beighbor_embeddings,
                dedup_node_dis=dedup_node_dis,
                prune_ratio=prune_ratio,
                batch_recompute=batch_recompute,
                global_pruning=global_pruning,
            )
        else:
            filter_id = self._labels_map[filter_label]
            # BUG FIX: previously passed the raw, uncast `query` here instead of the
            # validated `_query`, bypassing the dtype cast performed above.
            neighbors, distances = self._index.search_with_filter(
                query=_query, knn=k_neighbors, complexity=complexity, filter=filter_id
            )
        return QueryResponse(identifiers=neighbors, distances=distances)

    def batch_search(
        self,
        queries: VectorLikeBatch,
        k_neighbors: int,
        complexity: int,
        num_threads: int,
        USE_DEFERRED_FETCH: bool = False,
        skip_search_reorder: bool = False,
        recompute_beighbor_embeddings: bool = False,
        dedup_node_dis: bool = False,
        prune_ratio: float = 0,
        batch_recompute: bool = False,
        global_pruning: bool = False,
    ) -> QueryResponseBatch:
        """
        Searches the index by a batch of query vectors.

        This search is parallelized and far more efficient than searching for each vector individually.

        ### Parameters
        - **queries**: 2d numpy array, with column dimensionality matching the index and row dimensionality being the
          number of queries intended to search for in parallel. Dtype must match dtype of the index.
        - **k_neighbors**: Number of neighbors to be returned. If query vector exists in index, it almost definitely
          will be returned as well, so adjust your ``k_neighbors`` as appropriate. Must be > 0.
        - **complexity**: Size of distance ordered list of candidate neighbors to use while searching. List size
          increases accuracy at the cost of latency. Must be at least k_neighbors in size.
        - **num_threads**: Number of threads to use when searching this index. (>= 0), 0 = num_threads in system
        """
        _queries = _castable_dtype_or_raise(queries, expected=self._vector_dtype)
        _assert(len(_queries.shape) == 2, "queries must must be 2-d np array")
        _assert(
            _queries.shape[1] == self._dimensions,
            f"query vectors must have the same dimensionality as the index; index dimensionality: {self._dimensions}, "
            f"query dimensionality: {_queries.shape[1]}",
        )
        _assert_is_positive_uint32(k_neighbors, "k_neighbors")
        _assert_is_positive_uint32(complexity, "complexity")
        _assert_is_nonnegative_uint32(num_threads, "num_threads")

        if k_neighbors > complexity:
            warnings.warn(
                f"k_neighbors={k_neighbors} asked for, but list_size={complexity} was smaller. Increasing {complexity} to {k_neighbors}"
            )
            complexity = k_neighbors

        num_queries, dim = _queries.shape
        neighbors, distances = self._index.batch_search(
            queries=_queries,
            num_queries=num_queries,
            knn=k_neighbors,
            complexity=complexity,
            num_threads=num_threads,
            USE_DEFERRED_FETCH=USE_DEFERRED_FETCH,
            skip_search_reorder=skip_search_reorder,
            recompute_beighbor_embeddings=recompute_beighbor_embeddings,
            dedup_node_dis=dedup_node_dis,
            prune_ratio=prune_ratio,
            batch_recompute=batch_recompute,
            global_pruning=global_pruning,
        )
        return QueryResponseBatch(identifiers=neighbors, distances=distances)

View File

@@ -0,0 +1,136 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
#include "builder.h"
#include "common.h"
#include "disk_utils.h"
#include "index.h"
#include "parameters.h"
namespace diskannpy
{
// Builds an SSD-resident DiskANN index by delegating to diskann::build_disk_index.
// Build knobs are serialized into a single space-separated parameter string, in
// the positional order the native builder expects:
//   "<graph_degree> <complexity> <final_index_ram_limit> <indexing_ram_budget> <num_threads> [pq_disk_bytes]"
// pq_disk_bytes is appended only when > 0; a non-empty codebook_prefix is appended
// after it and also passed as the trailing argument of the native call.
template <typename DT>
void build_disk_index(const diskann::Metric metric, const std::string &data_file_path,
                      const std::string &index_prefix_path, const uint32_t complexity, const uint32_t graph_degree,
                      const double final_index_ram_limit, const double indexing_ram_budget, const uint32_t num_threads,
                      const uint32_t pq_disk_bytes, const std::string &codebook_prefix)
{
    std::string params = std::to_string(graph_degree) + " " + std::to_string(complexity) + " " +
                         std::to_string(final_index_ram_limit) + " " + std::to_string(indexing_ram_budget) + " " +
                         std::to_string(num_threads);
    if (pq_disk_bytes > 0)
        params = params + " " + std::to_string(pq_disk_bytes);
    if (!codebook_prefix.empty())
        params = params + " " + codebook_prefix;
    diskann::build_disk_index<DT>(data_file_path.c_str(), index_prefix_path.c_str(), params.c_str(), metric, false,
                                  codebook_prefix);
}
// Explicit instantiations for the three vector dtypes exposed to Python.
template void build_disk_index<float>(diskann::Metric, const std::string &, const std::string &, uint32_t, uint32_t,
                                      double, double, uint32_t, uint32_t, const std::string &);
template void build_disk_index<uint8_t>(diskann::Metric, const std::string &, const std::string &, uint32_t, uint32_t,
                                        double, double, uint32_t, uint32_t, const std::string &);
template void build_disk_index<int8_t>(diskann::Metric, const std::string &, const std::string &, uint32_t, uint32_t,
                                       double, double, uint32_t, uint32_t, const std::string &);
// Converts a string-labelled filter file into the integer-labelled form the index
// consumes, writing `<prefix>_label_formatted.txt` (returned) and the
// string->int mapping `<prefix>_labels_map.txt` (read back by the Python wrappers).
// If a universal label is supplied, it is registered on the index as label id 0
// — presumably convert_labels_string_to_int always maps the universal label to 0;
// TODO(review): confirm that convention, since the id is hard-coded here.
template <typename T, typename TagT, typename LabelT>
std::string prepare_filtered_label_map(diskann::Index<T, TagT, LabelT> &index, const std::string &index_output_path,
                                       const std::string &filter_labels_file, const std::string &universal_label)
{
    std::string labels_file_to_use = index_output_path + "_label_formatted.txt";
    std::string mem_labels_int_map_file = index_output_path + "_labels_map.txt";
    convert_labels_string_to_int(filter_labels_file, labels_file_to_use, mem_labels_int_map_file, universal_label);
    if (!universal_label.empty())
    {
        uint32_t unv_label_as_num = 0;
        index.set_universal_label(unv_label_as_num);
    }
    return labels_file_to_use;
}
// Explicit instantiations for the three vector dtypes exposed to Python.
template std::string prepare_filtered_label_map<float>(diskann::Index<float, uint32_t, uint32_t> &, const std::string &,
                                                       const std::string &, const std::string &);
template std::string prepare_filtered_label_map<int8_t>(diskann::Index<int8_t, uint32_t, uint32_t> &,
                                                        const std::string &, const std::string &, const std::string &);
template std::string prepare_filtered_label_map<uint8_t>(diskann::Index<uint8_t, uint32_t, uint32_t> &,
                                                         const std::string &, const std::string &, const std::string &);
// Builds an in-memory DiskANN index from a binary vector file and saves it to
// index_output_path. Supports four build modes: {tagged, untagged} x {filtered,
// unfiltered}. When use_tags is set, a `<index_output_path>.tags` file must
// already exist and supply one tag per vector.
template <typename T, typename TagT, typename LabelT>
void build_memory_index(const diskann::Metric metric, const std::string &vector_bin_path,
                        const std::string &index_output_path, const uint32_t graph_degree, const uint32_t complexity,
                        const float alpha, const uint32_t num_threads, const bool use_pq_build,
                        const size_t num_pq_bytes, const bool use_opq, const bool use_tags,
                        const std::string &filter_labels_file, const std::string &universal_label,
                        const uint32_t filter_complexity)
{
    diskann::IndexWriteParameters index_build_params = diskann::IndexWriteParametersBuilder(complexity, graph_degree)
                                                           .with_filter_list_size(filter_complexity)
                                                           .with_alpha(alpha)
                                                           .with_saturate_graph(false)
                                                           .with_num_threads(num_threads)
                                                           .build();
    // Search params reuse the build-time list size as the initial search complexity.
    diskann::IndexSearchParams index_search_params =
        diskann::IndexSearchParams(index_build_params.search_list_size, num_threads);
    size_t data_num, data_dim;
    diskann::get_bin_metadata(vector_bin_path, data_num, data_dim);
    diskann::Index<T, TagT, LabelT> index(metric, data_dim, data_num,
                                          std::make_shared<diskann::IndexWriteParameters>(index_build_params),
                                          std::make_shared<diskann::IndexSearchParams>(index_search_params), 0,
                                          use_tags, use_tags, false, use_pq_build, num_pq_bytes, use_opq);
    if (use_tags)
    {
        const std::string tags_file = index_output_path + ".tags";
        if (!file_exists(tags_file))
        {
            throw std::runtime_error("tags file not found at expected path: " + tags_file);
        }
        // NOTE(review): tags_data is populated by load_bin and copied into `tags`,
        // but never freed here — possible leak; confirm load_bin's ownership
        // semantics (data_num is also reused as the expected tag count).
        TagT *tags_data;
        size_t tag_dims = 1;
        diskann::load_bin(tags_file, tags_data, data_num, tag_dims);
        std::vector<TagT> tags(tags_data, tags_data + data_num);
        if (filter_labels_file.empty())
        {
            index.build(vector_bin_path.c_str(), data_num, tags);
        }
        else
        {
            auto labels_file = prepare_filtered_label_map<T, TagT, LabelT>(index, index_output_path, filter_labels_file,
                                                                           universal_label);
            index.build_filtered_index(vector_bin_path.c_str(), labels_file, data_num, tags);
        }
    }
    else
    {
        if (filter_labels_file.empty())
        {
            index.build(vector_bin_path.c_str(), data_num);
        }
        else
        {
            auto labels_file = prepare_filtered_label_map<T, TagT, LabelT>(index, index_output_path, filter_labels_file,
                                                                           universal_label);
            index.build_filtered_index(vector_bin_path.c_str(), labels_file, data_num);
        }
    }
    index.save(index_output_path.c_str());
}
// Explicit instantiations for the three vector dtypes exposed to Python.
template void build_memory_index<float>(diskann::Metric, const std::string &, const std::string &, uint32_t, uint32_t,
                                        float, uint32_t, bool, size_t, bool, bool, const std::string &,
                                        const std::string &, uint32_t);
template void build_memory_index<int8_t>(diskann::Metric, const std::string &, const std::string &, uint32_t, uint32_t,
                                         float, uint32_t, bool, size_t, bool, bool, const std::string &,
                                         const std::string &, uint32_t);
template void build_memory_index<uint8_t>(diskann::Metric, const std::string &, const std::string &, uint32_t, uint32_t,
                                          float, uint32_t, bool, size_t, bool, bool, const std::string &,
                                          const std::string &, uint32_t);
} // namespace diskannpy

View File

@@ -0,0 +1,71 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT license.
"""
# Parameter Defaults
These parameter defaults are re-exported from the C++ extension module, and used to keep the pythonic wrapper in sync with the C++.
"""
from ._diskannpy import defaults as _defaults
ALPHA = _defaults.ALPHA
"""
Note that, as ALPHA is a `float32` (single precision float) in C++, when converted into Python it becomes a
`float64` (double precision float). The actual value is 1.2f. The alpha parameter (>=1) is used to control the nature
and number of points that are added to the graph. A higher alpha value (e.g., 1.4) will result in fewer hops (and IOs)
to convergence, but probably more distance comparisons compared to a lower alpha value.
"""
NUM_THREADS = _defaults.NUM_THREADS
""" Number of threads to use. `0` will use all available detected logical processors """
MAX_OCCLUSION_SIZE = _defaults.MAX_OCCLUSION_SIZE
"""
The maximum number of points that can be occluded by a single point. This is used to prevent a single point from
dominating the graph structure. If a point has more than `max_occlusion_size` neighbors closer to it than the current
point, it will not be added to the graph. This is a tradeoff between index build time and search quality.
"""
FILTER_COMPLEXITY = _defaults.FILTER_COMPLEXITY
"""
Complexity (a.k.a. `L`) references the size of the list we store candidate approximate neighbors in while doing a
filtered search. This value must be larger than `k_neighbors`, and larger values tend toward higher recall in the
resultant ANN search at the cost of more time.
"""
NUM_FROZEN_POINTS_STATIC = _defaults.NUM_FROZEN_POINTS_STATIC
""" Number of points frozen by default in a StaticMemoryIndex """
NUM_FROZEN_POINTS_DYNAMIC = _defaults.NUM_FROZEN_POINTS_DYNAMIC
""" Number of points frozen by default in a DynamicMemoryIndex """
SATURATE_GRAPH = _defaults.SATURATE_GRAPH
""" Whether to saturate the graph or not. Default is `True` """
GRAPH_DEGREE = _defaults.GRAPH_DEGREE
"""
Graph degree (a.k.a. `R`) is the maximum degree allowed for a node in the index's graph structure. This degree will be
pruned throughout the course of the index build, but it will never grow beyond this value. Higher R values require
longer index build times, but may result in an index showing excellent recall and latency characteristics.
"""
COMPLEXITY = _defaults.COMPLEXITY
"""
Complexity (a.k.a `L`) references the size of the list we store candidate approximate neighbors in while doing build
or search tasks. It's used during index build as part of the index optimization processes. It's used in index search
classes both to help mitigate poor latencies during cold start, as well as on subsequent queries to conduct the search.
Large values will likely increase latency but also may improve recall, and tuning these values for your particular
index is certainly a reasonable choice.
"""
PQ_DISK_BYTES = _defaults.PQ_DISK_BYTES
"""
Use `0` to store uncompressed data on SSD. This allows the index to asymptote to 100% recall. If your vectors are
too large to store in SSD, this parameter provides the option to compress the vectors using PQ for storing on SSD.
This will trade off recall. You would also want this to be greater than the number of bytes used for the PQ
compressed data stored in-memory. Default is `0`.
"""
USE_PQ_BUILD = _defaults.USE_PQ_BUILD
"""
Whether to use product quantization in the index building process. Product quantization is an approximation
technique that can vastly speed up vector computations and comparisons in a spatial neighborhood, but it is still an
approximation technique. It should be preferred when index creation times take longer than you can afford for your
use case.
"""
NUM_PQ_BYTES = _defaults.NUM_PQ_BYTES
"""
The number of product quantization bytes to use. More bytes requires more resources in both memory and time, but is
like to result in better approximations.
"""
USE_OPQ = _defaults.USE_OPQ
""" Whether to use Optimized Product Quantization or not. """

View File

@@ -0,0 +1,167 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
#include "parameters.h"
#include "dynamic_memory_index.h"
#include "pybind11/numpy.h"
namespace diskannpy
{
// Assembles the native IndexWriteParameters for a dynamic index from the knobs
// exposed to Python, via the fluent builder.
diskann::IndexWriteParameters dynamic_index_write_parameters(const uint32_t complexity, const uint32_t graph_degree,
                                                             const bool saturate_graph,
                                                             const uint32_t max_occlusion_size, const float alpha,
                                                             const uint32_t num_threads,
                                                             const uint32_t filter_complexity)
{
    return diskann::IndexWriteParametersBuilder(complexity, graph_degree)
        .with_saturate_graph(saturate_graph)
        .with_max_occlusion_size(max_occlusion_size)
        .with_alpha(alpha)
        .with_num_threads(num_threads)
        .with_filter_list_size(filter_complexity)
        .build();
}
// Constructs the underlying diskann::Index for a DynamicMemoryIndex. A value of
// 0 for initial_search_threads means "one per logical processor".
template <class DT>
diskann::Index<DT, DynamicIdType, filterT> dynamic_index_builder(
    const diskann::Metric m, const diskann::IndexWriteParameters &write_params, const size_t dimensions,
    const size_t max_vectors, const uint32_t initial_search_complexity, const uint32_t initial_search_threads,
    const bool concurrent_consolidation, const uint32_t num_frozen_points)
{
    const uint32_t _initial_search_threads = initial_search_threads != 0 ? initial_search_threads : omp_get_num_procs();
    auto index_search_params = diskann::IndexSearchParams(initial_search_complexity, _initial_search_threads);
    return diskann::Index<DT, DynamicIdType, filterT>(
        m, dimensions, max_vectors,
        std::make_shared<diskann::IndexWriteParameters>(write_params),     // index write params
        std::make_shared<diskann::IndexSearchParams>(index_search_params), // index_search_params
        num_frozen_points,                                                 // frozen_points
        true,                                                              // dynamic_index
        true,                                                              // enable_tags
        concurrent_consolidation,
        false, // pq_dist_build
        0,     // num_pq_chunks
        false); // use_opq = false
}
// Constructor. NOTE: the member-initializer order matters here —
// _initial_search_complexity and _write_parameters are computed first and then
// consumed by dynamic_index_builder when initializing _index; they must be
// declared in that order in the class. An initial_search_complexity of 0 falls
// back to the build complexity.
template <class DT>
DynamicMemoryIndex<DT>::DynamicMemoryIndex(const diskann::Metric m, const size_t dimensions, const size_t max_vectors,
                                           const uint32_t complexity, const uint32_t graph_degree,
                                           const bool saturate_graph, const uint32_t max_occlusion_size,
                                           const float alpha, const uint32_t num_threads,
                                           const uint32_t filter_complexity, const uint32_t num_frozen_points,
                                           const uint32_t initial_search_complexity,
                                           const uint32_t initial_search_threads, const bool concurrent_consolidation)
    : _initial_search_complexity(initial_search_complexity != 0 ? initial_search_complexity : complexity),
      _write_parameters(dynamic_index_write_parameters(complexity, graph_degree, saturate_graph, max_occlusion_size,
                                                       alpha, num_threads, filter_complexity)),
      _index(dynamic_index_builder<DT>(m, _write_parameters, dimensions, max_vectors, _initial_search_complexity,
                                       initial_search_threads, concurrent_consolidation, num_frozen_points))
{
}
// Loads a previously saved dynamic index from index_path. Dynamic indices are
// tagged, so the companion "<index_path>.tags" file must exist; otherwise we
// fail fast rather than let the native load produce an unusable index.
template <class DT> void DynamicMemoryIndex<DT>::load(const std::string &index_path)
{
    const std::string tag_path = index_path + ".tags";
    if (!file_exists(tag_path))
        throw std::runtime_error("tags file not found at expected path: " + tag_path);
    _index.load(index_path.c_str(), _write_parameters.num_threads, _initial_search_complexity);
}
// Inserts a single vector under the given id, returning the native index's
// insert status code.
template <class DT>
int DynamicMemoryIndex<DT>::insert(const py::array_t<DT, py::array::c_style | py::array::forcecast> &vector,
                                   const DynamicIdType id)
{
    const int status = _index.insert_point(vector.data(), id);
    return status;
}
// Inserts num_inserts (vector, id) pairs in parallel with OpenMP; returns a
// per-row array of the native insert status codes. num_threads == 0 means one
// thread per logical processor. NOTE: omp_set_num_threads changes process-wide
// OpenMP state as a side effect.
template <class DT>
py::array_t<int> DynamicMemoryIndex<DT>::batch_insert(
    py::array_t<DT, py::array::c_style | py::array::forcecast> &vectors,
    py::array_t<DynamicIdType, py::array::c_style | py::array::forcecast> &ids, const int32_t num_inserts,
    const int num_threads)
{
    if (num_threads == 0)
        omp_set_num_threads(omp_get_num_procs());
    else
        omp_set_num_threads(num_threads);
    py::array_t<int> insert_retvals(num_inserts);

#pragma omp parallel for schedule(dynamic, 1) default(none) shared(num_inserts, insert_retvals, vectors, ids)
    for (int32_t i = 0; i < num_inserts; i++)
    {
        insert_retvals.mutable_data()[i] = _index.insert_point(vectors.data(i), *(ids.data(i)));
    }

    return insert_retvals;
}
// Tombstones the point with the given id. The point is only physically removed
// when consolidate_delete() is called; returns the native lazy_delete status.
template <class DT> int DynamicMemoryIndex<DT>::mark_deleted(const DynamicIdType id)
{
    return _index.lazy_delete(id);
}
// Persists the index to save_path, optionally compacting first. Rejects an
// empty path up front instead of passing it to the native layer.
template <class DT> void DynamicMemoryIndex<DT>::save(const std::string &save_path, const bool compact_before_save)
{
    if (save_path.empty())
        throw std::runtime_error("A save_path must be provided");
    _index.save(save_path.c_str(), compact_before_save);
}
// Single-query k-NN search. Allocates knn-sized id/distance buffers, delegates
// to the tag-based native search, and returns the pair (ids, distances).
template <class DT>
NeighborsAndDistances<DynamicIdType> DynamicMemoryIndex<DT>::search(
    py::array_t<DT, py::array::c_style | py::array::forcecast> &query, const uint64_t knn, const uint64_t complexity)
{
    py::array_t<DynamicIdType> neighbor_ids(knn);
    py::array_t<float> neighbor_dists(knn);
    // No raw vector data is requested back from the index.
    std::vector<DT *> no_vectors;
    _index.search_with_tags(query.data(), knn, complexity, neighbor_ids.mutable_data(), neighbor_dists.mutable_data(),
                            no_vectors);
    return std::make_pair(neighbor_ids, neighbor_dists);
}
// Batched k-NN search: one row of ids/distances per query, searched in parallel
// with OpenMP. num_threads == 0 means one thread per logical processor.
// NOTE: omp_set_num_threads changes process-wide OpenMP state as a side effect.
template <class DT>
NeighborsAndDistances<DynamicIdType> DynamicMemoryIndex<DT>::batch_search(
    py::array_t<DT, py::array::c_style | py::array::forcecast> &queries, const uint64_t num_queries, const uint64_t knn,
    const uint64_t complexity, const uint32_t num_threads)
{
    // (num_queries x knn) output buffers.
    py::array_t<DynamicIdType> ids({num_queries, knn});
    py::array_t<float> dists({num_queries, knn});
    // No raw vector data requested back from the index.
    std::vector<DT *> empty_vector;

    if (num_threads == 0)
        omp_set_num_threads(omp_get_num_procs());
    else
        omp_set_num_threads(static_cast<int32_t>(num_threads));

#pragma omp parallel for schedule(dynamic, 1) default(none)                                                            \
    shared(num_queries, queries, knn, complexity, ids, dists, empty_vector)
    for (int64_t i = 0; i < (int64_t)num_queries; i++)
    {
        _index.search_with_tags(queries.data(i), knn, complexity, ids.mutable_data(i), dists.mutable_data(i),
                                empty_vector);
    }

    return std::make_pair(ids, dists);
}
// Physically removes all points previously tombstoned via mark_deleted().
template <class DT> void DynamicMemoryIndex<DT>::consolidate_delete()
{
    _index.consolidate_deletes(_write_parameters);
}

// Current number of points held by the underlying index.
template <class DT> size_t DynamicMemoryIndex<DT>::num_points()
{
    return _index.get_num_points();
}

// Explicit instantiations for the three vector dtypes exposed to Python.
template class DynamicMemoryIndex<float>;
template class DynamicMemoryIndex<uint8_t>;
template class DynamicMemoryIndex<int8_t>;
}; // namespace diskannpy

View File

@@ -0,0 +1,142 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
#include <string>
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include "defaults.h"
#include "distance.h"
#include "builder.h"
#include "dynamic_memory_index.h"
#include "static_disk_index.h"
#include "static_memory_index.h"
PYBIND11_MAKE_OPAQUE(std::vector<uint32_t>);
PYBIND11_MAKE_OPAQUE(std::vector<float>);
PYBIND11_MAKE_OPAQUE(std::vector<int8_t>);
PYBIND11_MAKE_OPAQUE(std::vector<uint8_t>);
namespace py = pybind11;
using namespace pybind11::literals;
// Per-dtype bundle of the Python-visible names under which the builder
// functions and index classes are registered on the extension module.
struct Variant
{
    std::string disk_builder_name;        // e.g. "build_disk_float_index"
    std::string memory_builder_name;      // e.g. "build_memory_float_index"
    std::string dynamic_memory_index_name; // e.g. "DynamicMemoryFloatIndex"
    std::string static_memory_index_name;  // e.g. "StaticMemoryFloatIndex"
    std::string static_disk_index_name;    // e.g. "StaticDiskFloatIndex"
};

// One Variant per supported vector dtype (float32, uint8, int8).
const Variant FloatVariant{"build_disk_float_index", "build_memory_float_index", "DynamicMemoryFloatIndex",
                           "StaticMemoryFloatIndex", "StaticDiskFloatIndex"};

const Variant UInt8Variant{"build_disk_uint8_index", "build_memory_uint8_index", "DynamicMemoryUInt8Index",
                           "StaticMemoryUInt8Index", "StaticDiskUInt8Index"};

const Variant Int8Variant{"build_disk_int8_index", "build_memory_int8_index", "DynamicMemoryInt8Index",
                          "StaticMemoryInt8Index", "StaticDiskInt8Index"};
// Registers every binding for one element type T under the Python names carried by
// `variant`: the disk/memory builder functions and the three index classes.
template <typename T> inline void add_variant(py::module_ &m, const Variant &variant)
{
    m.def(variant.disk_builder_name.c_str(), &diskannpy::build_disk_index<T>, "distance_metric"_a, "data_file_path"_a,
          "index_prefix_path"_a, "complexity"_a, "graph_degree"_a, "final_index_ram_limit"_a, "indexing_ram_budget"_a,
          "num_threads"_a, "pq_disk_bytes"_a, "codebook_prefix"_a = "");
    m.def(variant.memory_builder_name.c_str(), &diskannpy::build_memory_index<T>, "distance_metric"_a,
          "data_file_path"_a, "index_output_path"_a, "graph_degree"_a, "complexity"_a, "alpha"_a, "num_threads"_a,
          "use_pq_build"_a, "num_pq_bytes"_a, "use_opq"_a, "use_tags"_a = false, "filter_labels_file"_a = "",
          "universal_label"_a = "", "filter_complexity"_a = 0);
    py::class_<diskannpy::StaticMemoryIndex<T>>(m, variant.static_memory_index_name.c_str())
        .def(py::init<const diskann::Metric, const std::string &, const size_t, const size_t, const uint32_t,
                      const uint32_t>(),
             "distance_metric"_a, "index_path"_a, "num_points"_a, "dimensions"_a, "num_threads"_a,
             "initial_search_complexity"_a)
        .def("search", &diskannpy::StaticMemoryIndex<T>::search, "query"_a, "knn"_a, "complexity"_a)
        .def("search_with_filter", &diskannpy::StaticMemoryIndex<T>::search_with_filter, "query"_a, "knn"_a,
             "complexity"_a, "filter"_a)
        .def("batch_search", &diskannpy::StaticMemoryIndex<T>::batch_search, "queries"_a, "num_queries"_a, "knn"_a,
             "complexity"_a, "num_threads"_a);
    py::class_<diskannpy::DynamicMemoryIndex<T>>(m, variant.dynamic_memory_index_name.c_str())
        .def(py::init<const diskann::Metric, const size_t, const size_t, const uint32_t, const uint32_t, const bool,
                      const uint32_t, const float, const uint32_t, const uint32_t, const uint32_t, const uint32_t,
                      const uint32_t, const bool>(),
             "distance_metric"_a, "dimensions"_a, "max_vectors"_a, "complexity"_a, "graph_degree"_a,
             "saturate_graph"_a = diskann::defaults::SATURATE_GRAPH,
             "max_occlusion_size"_a = diskann::defaults::MAX_OCCLUSION_SIZE, "alpha"_a = diskann::defaults::ALPHA,
             "num_threads"_a = diskann::defaults::NUM_THREADS,
             "filter_complexity"_a = diskann::defaults::FILTER_LIST_SIZE,
             "num_frozen_points"_a = diskann::defaults::NUM_FROZEN_POINTS_DYNAMIC, "initial_search_complexity"_a = 0,
             "search_threads"_a = 0, "concurrent_consolidation"_a = true)
        .def("search", &diskannpy::DynamicMemoryIndex<T>::search, "query"_a, "knn"_a, "complexity"_a)
        .def("load", &diskannpy::DynamicMemoryIndex<T>::load, "index_path"_a)
        .def("batch_search", &diskannpy::DynamicMemoryIndex<T>::batch_search, "queries"_a, "num_queries"_a, "knn"_a,
             "complexity"_a, "num_threads"_a)
        .def("batch_insert", &diskannpy::DynamicMemoryIndex<T>::batch_insert, "vectors"_a, "ids"_a, "num_inserts"_a,
             "num_threads"_a)
        .def("save", &diskannpy::DynamicMemoryIndex<T>::save, "save_path"_a = "", "compact_before_save"_a = false)
        .def("insert", &diskannpy::DynamicMemoryIndex<T>::insert, "vector"_a, "id"_a)
        .def("mark_deleted", &diskannpy::DynamicMemoryIndex<T>::mark_deleted, "id"_a)
        .def("consolidate_delete", &diskannpy::DynamicMemoryIndex<T>::consolidate_delete)
        .def("num_points", &diskannpy::DynamicMemoryIndex<T>::num_points);
    py::class_<diskannpy::StaticDiskIndex<T>>(m, variant.static_disk_index_name.c_str())
        .def(py::init<const diskann::Metric, const std::string &, const uint32_t, const size_t, const uint32_t,
                      const std::string &, const std::string &>(),
             "distance_metric"_a, "index_path_prefix"_a, "num_threads"_a, "num_nodes_to_cache"_a,
             // FIX: pybind11 rejects a required argument after defaulted ones; give
             // partition_prefix an empty-string default (backward compatible, matches pq_prefix).
             "cache_mechanism"_a = 1, "pq_prefix"_a = "", "partition_prefix"_a = "")
        .def("cache_bfs_levels", &diskannpy::StaticDiskIndex<T>::cache_bfs_levels, "num_nodes_to_cache"_a)
        // NOTE(review): "recompute_beighbor_embeddings" is a typo, but it is the keyword
        // name exposed to Python; renaming would break callers, so it is kept as-is.
        .def("search", &diskannpy::StaticDiskIndex<T>::search, "query"_a, "knn"_a, "complexity"_a, "beam_width"_a,
             "USE_DEFERRED_FETCH"_a = false, "skip_search_reorder"_a = false, "recompute_beighbor_embeddings"_a = false,
             "dedup_node_dis"_a = false, "prune_ratio"_a = 0, "batch_recompute"_a = false, "global_pruning"_a = false)
        .def("batch_search", &diskannpy::StaticDiskIndex<T>::batch_search, "queries"_a, "num_queries"_a, "knn"_a,
             "complexity"_a, "beam_width"_a, "num_threads"_a, "USE_DEFERRED_FETCH"_a = false,
             "skip_search_reorder"_a = false, "recompute_beighbor_embeddings"_a = false, "dedup_node_dis"_a = false,
             "prune_ratio"_a = 0, "batch_recompute"_a = false, "global_pruning"_a = false);
}
PYBIND11_MODULE(_diskannpy, m)
{
    m.doc() = "DiskANN Python Bindings";
#ifdef VERSION_INFO
    m.attr("__version__") = VERSION_INFO;
#else
    m.attr("__version__") = "dev";
#endif
    // Register the distance-metric enum shared by every index variant.
    py::enum_<diskann::Metric>(m, "Metric")
        .value("L2", diskann::Metric::L2)
        .value("INNER_PRODUCT", diskann::Metric::INNER_PRODUCT)
        .value("COSINE", diskann::Metric::COSINE)
        .export_values();
    // Re-export the library's built-in defaults so Python callers can introspect them.
    py::module_ defaults_module = m.def_submodule(
        "defaults",
        "A collection of the default values used for common diskann operations. `GRAPH_DEGREE` and `COMPLEXITY` are not"
        " set as defaults, but some semi-reasonable default values are selected for your convenience. We urge you to "
        "investigate their meaning and adjust them for your use cases.");
    defaults_module.attr("ALPHA") = diskann::defaults::ALPHA;
    defaults_module.attr("NUM_THREADS") = diskann::defaults::NUM_THREADS;
    defaults_module.attr("MAX_OCCLUSION_SIZE") = diskann::defaults::MAX_OCCLUSION_SIZE;
    defaults_module.attr("FILTER_COMPLEXITY") = diskann::defaults::FILTER_LIST_SIZE;
    defaults_module.attr("NUM_FROZEN_POINTS_STATIC") = diskann::defaults::NUM_FROZEN_POINTS_STATIC;
    defaults_module.attr("NUM_FROZEN_POINTS_DYNAMIC") = diskann::defaults::NUM_FROZEN_POINTS_DYNAMIC;
    defaults_module.attr("SATURATE_GRAPH") = diskann::defaults::SATURATE_GRAPH;
    defaults_module.attr("GRAPH_DEGREE") = diskann::defaults::MAX_DEGREE;
    defaults_module.attr("COMPLEXITY") = diskann::defaults::BUILD_LIST_SIZE;
    defaults_module.attr("PQ_DISK_BYTES") = (uint32_t)0;
    defaults_module.attr("USE_PQ_BUILD") = false;
    defaults_module.attr("NUM_PQ_BYTES") = (uint32_t)0;
    defaults_module.attr("USE_OPQ") = false;
    // Bind builders and index classes for each supported element type.
    add_variant<float>(m, FloatVariant);
    add_variant<uint8_t>(m, UInt8Variant);
    add_variant<int8_t>(m, Int8Variant);
}

View File

View File

@@ -0,0 +1,123 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
#include "static_disk_index.h"
#include "pybind11/numpy.h"
namespace diskannpy
{
// Loads a disk-backed index from `index_path_prefix` and optionally warms the node cache.
// cache_mechanism == 1 warms from sample queries in "<prefix>_sample_data.bin" (skipped if
// the file is absent); == 2 caches nodes by BFS level; any other value skips caching.
// num_threads == 0 means "use all cores". Throws std::runtime_error if load fails.
template <typename DT>
StaticDiskIndex<DT>::StaticDiskIndex(const diskann::Metric metric, const std::string &index_path_prefix,
                                     const uint32_t num_threads, const size_t num_nodes_to_cache,
                                     const uint32_t cache_mechanism, const std::string &pq_prefix,
                                     const std::string &partition_prefix)
    : _reader(std::make_shared<PlatformSpecificAlignedFileReader>()),
      _graph_reader(std::make_shared<PlatformSpecificAlignedFileReader>()), _index(_reader, _graph_reader, metric)
{
    // NOTE(review): raw std::cout prints look like leftover debugging; consider the
    // project's logging facility instead.
    std::cout << "Before index load" << std::endl;
    std::cout << "After index load" << std::endl;
    const uint32_t _num_threads = num_threads != 0 ? num_threads : omp_get_num_procs();
    int load_success =
        _index.load(_num_threads, index_path_prefix.c_str(), pq_prefix.c_str(), partition_prefix.c_str());
    if (load_success != 0)
    {
        throw std::runtime_error("index load failed, " + index_path_prefix);
    }
    if (cache_mechanism == 1)
    {
        // Sample-based warmup; silently a no-op when the sample file does not exist
        // (see cache_sample_paths).
        std::string sample_file = index_path_prefix + std::string("_sample_data.bin");
        cache_sample_paths(num_nodes_to_cache, sample_file, _num_threads);
    }
    else if (cache_mechanism == 2)
    {
        cache_bfs_levels(num_nodes_to_cache);
    }
}
// Selects up to `num_nodes_to_cache` nodes by breadth-first traversal from the index's
// entry point(s) and pins them in the in-memory node cache.
template <typename DT> void StaticDiskIndex<DT>::cache_bfs_levels(const size_t num_nodes_to_cache)
{
    std::vector<uint32_t> node_list;
    _index.cache_bfs_levels(num_nodes_to_cache, node_list);
    _index.load_cache_list(node_list);
}
// Warms the node cache by replaying sample queries from `warmup_query_file` and caching
// the most frequently visited nodes. No-op (by design) when the file is missing.
template <typename DT>
void StaticDiskIndex<DT>::cache_sample_paths(const size_t num_nodes_to_cache, const std::string &warmup_query_file,
                                             const uint32_t num_threads)
{
    if (!file_exists(warmup_query_file))
    {
        return;
    }
    std::vector<uint32_t> node_list;
    // NOTE(review): 15 and 4 are hard-coded search parameters passed to the sample-query
    // cache generator (presumably search list size and beam width) -- confirm against
    // generate_cache_list_from_sample_queries and consider naming them.
    _index.generate_cache_list_from_sample_queries(warmup_query_file, 15, 4, num_nodes_to_cache, num_threads,
                                                   node_list);
    _index.load_cache_list(node_list);
}
// Single-query k-NN search. Returns a pair of 1-D numpy arrays of length `knn`:
// neighbor ids and their distances. The extra boolean/float knobs are forwarded
// verbatim to cached_beam_search.
template <typename DT>
NeighborsAndDistances<StaticIdType> StaticDiskIndex<DT>::search(
    py::array_t<DT, py::array::c_style | py::array::forcecast> &query, const uint64_t knn, const uint64_t complexity,
    const uint64_t beam_width, const bool USE_DEFERRED_FETCH, const bool skip_search_reorder,
    const bool recompute_beighbor_embeddings, const bool dedup_node_dis, const float prune_ratio,
    const bool batch_recompute, const bool global_pruning)
{
    py::array_t<StaticIdType> ids(knn);
    py::array_t<float> dists(knn);
    // cached_beam_search emits 64-bit ids; they are narrowed to the python-facing
    // id type below. (Removed an unused std::vector<uint32_t> scratch buffer.)
    std::vector<uint64_t> u64_ids(knn);
    diskann::QueryStats stats;
    _index.cached_beam_search(query.data(), knn, complexity, u64_ids.data(), dists.mutable_data(), beam_width, false,
                              &stats, USE_DEFERRED_FETCH, skip_search_reorder, recompute_beighbor_embeddings,
                              dedup_node_dis, prune_ratio, batch_recompute, global_pruning);
    auto r = ids.mutable_unchecked<1>();
    for (uint64_t i = 0; i < knn; ++i)
        r(i) = static_cast<StaticIdType>(u64_ids[i]);
    return std::make_pair(ids, dists);
}
// Parallel k-NN over `num_queries` query rows. Returns (ids, dists), each shaped
// (num_queries, knn). Each OMP iteration writes a disjoint row, so no locking is needed.
template <typename DT>
NeighborsAndDistances<StaticIdType> StaticDiskIndex<DT>::batch_search(
    py::array_t<DT, py::array::c_style | py::array::forcecast> &queries, const uint64_t num_queries, const uint64_t knn,
    const uint64_t complexity, const uint64_t beam_width, const uint32_t num_threads, const bool USE_DEFERRED_FETCH,
    const bool skip_search_reorder, const bool recompute_beighbor_embeddings, const bool dedup_node_dis,
    const float prune_ratio, const bool batch_recompute, const bool global_pruning)
{
    py::array_t<StaticIdType> ids({num_queries, knn});
    py::array_t<float> dists({num_queries, knn});
    // FIX: treat num_threads == 0 as "use all cores", matching
    // StaticMemoryIndex::batch_search; omp_set_num_threads(0) is invalid per the
    // OpenMP spec.
    const uint32_t _num_threads = num_threads != 0 ? num_threads : omp_get_num_procs();
    omp_set_num_threads(static_cast<int32_t>(_num_threads));
    std::vector<uint64_t> u64_ids(knn * num_queries);
#pragma omp parallel for schedule(dynamic, 1) default(none) \
    shared(num_queries, queries, knn, complexity, u64_ids, dists, beam_width, USE_DEFERRED_FETCH, skip_search_reorder, \
               recompute_beighbor_embeddings, dedup_node_dis, prune_ratio, batch_recompute, global_pruning)
    for (int64_t i = 0; i < (int64_t)num_queries; i++)
    {
        _index.cached_beam_search(queries.data(i), knn, complexity, u64_ids.data() + i * knn, dists.mutable_data(i),
                                  beam_width, false, nullptr, USE_DEFERRED_FETCH, skip_search_reorder,
                                  recompute_beighbor_embeddings, dedup_node_dis, prune_ratio, batch_recompute,
                                  global_pruning);
    }
    // Narrow 64-bit internal ids to the 32-bit python-facing id type.
    auto r = ids.mutable_unchecked();
    for (uint64_t i = 0; i < num_queries; ++i)
        for (uint64_t j = 0; j < knn; ++j)
            r(i, j) = (uint32_t)u64_ids[i * knn + j];
    return std::make_pair(ids, dists);
}
template class StaticDiskIndex<float>;
template class StaticDiskIndex<uint8_t>;
template class StaticDiskIndex<int8_t>;
} // namespace diskannpy

View File

@@ -0,0 +1,91 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
#include "static_memory_index.h"
#include "pybind11/numpy.h"
namespace diskannpy
{
// Constructs the underlying diskann::Index for a read-only in-memory index: no write
// params, tags, PQ, or dynamic features. Throws if initial_search_complexity is zero,
// since it sizes the search scratch space.
// NOTE(review): the declared return type spells diskann::Index<DT, StaticIdType, filterT>
// while the return expression constructs diskann::Index<DT>; presumably the template's
// default arguments are StaticIdType/filterT -- confirm against the Index declaration.
template <class DT>
diskann::Index<DT, StaticIdType, filterT> static_index_builder(const diskann::Metric m, const size_t num_points,
                                                               const size_t dimensions,
                                                               const uint32_t initial_search_complexity)
{
    if (initial_search_complexity == 0)
    {
        throw std::runtime_error("initial_search_complexity must be a positive uint32_t");
    }
    auto index_search_params = diskann::IndexSearchParams(initial_search_complexity, omp_get_num_procs());
    return diskann::Index<DT>(m, dimensions, num_points,
                              nullptr, // index write params
                              std::make_shared<diskann::IndexSearchParams>(index_search_params), // index search params
                              0,      // num frozen points
                              false,  // not a dynamic_index
                              false,  // no enable_tags/ids
                              false,  // no concurrent_consolidate,
                              false,  // pq_dist_build
                              0,      // num_pq_chunks
                              false); // use_opq = false
}
// Builds an empty static index shell (see static_index_builder) and loads the graph and
// vectors from files named by `index_prefix`. num_threads == 0 means "use all cores".
template <class DT>
StaticMemoryIndex<DT>::StaticMemoryIndex(const diskann::Metric m, const std::string &index_prefix,
                                         const size_t num_points, const size_t dimensions, const uint32_t num_threads,
                                         const uint32_t initial_search_complexity)
    : _index(static_index_builder<DT>(m, num_points, dimensions, initial_search_complexity))
{
    const uint32_t _num_threads = num_threads != 0 ? num_threads : omp_get_num_procs();
    _index.load(index_prefix.c_str(), _num_threads, initial_search_complexity);
}
// Single-query k-NN search; returns (ids, dists), each a 1-D array of length `knn`.
// (Removed an unused std::vector<DT *> local.)
template <typename DT>
NeighborsAndDistances<StaticIdType> StaticMemoryIndex<DT>::search(
    py::array_t<DT, py::array::c_style | py::array::forcecast> &query, const uint64_t knn, const uint64_t complexity)
{
    py::array_t<StaticIdType> ids(knn);
    py::array_t<float> dists(knn);
    _index.search(query.data(), knn, complexity, ids.mutable_data(), dists.mutable_data());
    return std::make_pair(ids, dists);
}
// Single-query k-NN search restricted to points matching `filter`; returns (ids, dists),
// each a 1-D array of length `knn`. (Removed an unused std::vector<DT *> local.)
template <typename DT>
NeighborsAndDistances<StaticIdType> StaticMemoryIndex<DT>::search_with_filter(
    py::array_t<DT, py::array::c_style | py::array::forcecast> &query, const uint64_t knn, const uint64_t complexity,
    const filterT filter)
{
    py::array_t<StaticIdType> ids(knn);
    py::array_t<float> dists(knn);
    _index.search_with_filters(query.data(), filter, knn, complexity, ids.mutable_data(), dists.mutable_data());
    return std::make_pair(ids, dists);
}
// Parallel k-NN over `num_queries` query rows; returns (ids, dists), each shaped
// (num_queries, knn). num_threads == 0 means "use all cores". Each OMP iteration writes
// its own disjoint row, so no locking is needed. (Removed an unused std::vector<DT *>
// local.)
template <typename DT>
NeighborsAndDistances<StaticIdType> StaticMemoryIndex<DT>::batch_search(
    py::array_t<DT, py::array::c_style | py::array::forcecast> &queries, const uint64_t num_queries, const uint64_t knn,
    const uint64_t complexity, const uint32_t num_threads)
{
    const uint32_t _num_threads = num_threads != 0 ? num_threads : omp_get_num_procs();
    py::array_t<StaticIdType> ids({num_queries, knn});
    py::array_t<float> dists({num_queries, knn});
    omp_set_num_threads(static_cast<int32_t>(_num_threads));
#pragma omp parallel for schedule(dynamic, 1) default(none) shared(num_queries, queries, knn, complexity, ids, dists)
    for (int64_t i = 0; i < (int64_t)num_queries; i++)
    {
        _index.search(queries.data(i), knn, complexity, ids.mutable_data(i), dists.mutable_data(i));
    }
    return std::make_pair(ids, dists);
}
template class StaticMemoryIndex<float>;
template class StaticMemoryIndex<uint8_t>;
template class StaticMemoryIndex<int8_t>;
} // namespace diskannpy

View File

@@ -0,0 +1,6 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT license.
from .build_memory_index import build_random_vectors_and_memory_index
from .create_test_data import random_vectors, vectors_as_temp_file, write_vectors
from .recall import calculate_recall

View File

@@ -0,0 +1,51 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT license.
import os
from tempfile import mkdtemp
import diskannpy as dap
import numpy as np
from .create_test_data import random_vectors
def build_random_vectors_and_memory_index(
    dtype, metric, with_tags: bool = False, index_prefix: str = "ann", seed: int = 12345
):
    """Build a small in-memory DiskANN index over random vectors for tests.

    Generates 1000 query vectors and 10000 index vectors (10 dims each) of the
    given dtype, builds the index in a fresh temp directory, and returns a
    fixture tuple: (metric, dtype, query_vectors, index_vectors, index_dir,
    vector_bin_path, tags). When ``with_tags`` is set, tags are a shuffled
    permutation of 1..10000; otherwise the empty string.
    """
    # NOTE(review): queries and index vectors are drawn with the same seed, so
    # their RNG streams start identically -- presumably fine for test fixtures.
    query_vectors: np.ndarray = random_vectors(1000, 10, dtype=dtype, seed=seed)
    index_vectors: np.ndarray = random_vectors(10000, 10, dtype=dtype, seed=seed)
    ann_dir = mkdtemp()
    if with_tags:
        shuffler = np.random.default_rng(seed)
        tags = np.arange(start=1, stop=10001, dtype=np.uint32)
        shuffler.shuffle(tags)
    else:
        tags = ""
    dap.build_memory_index(
        data=index_vectors,
        distance_metric=metric,
        index_directory=ann_dir,
        graph_degree=16,
        complexity=32,
        alpha=1.2,
        num_threads=0,
        use_pq_build=False,
        num_pq_bytes=8,
        use_opq=False,
        filter_complexity=32,
        tags=tags,
        index_prefix=index_prefix,
    )
    return (
        metric,
        dtype,
        query_vectors,
        index_vectors,
        ann_dir,
        os.path.join(ann_dir, "vectors.bin"),
        tags,
    )

View File

@@ -0,0 +1,40 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT license.
from contextlib import contextmanager
from pathlib import Path
from tempfile import NamedTemporaryFile
from typing import BinaryIO
import numpy as np
def random_vectors(rows: int, dimensions: int, dtype, seed: int = 12345) -> np.ndarray:
    """Return a deterministic (rows, dimensions) array of random values.

    Supported dtypes: np.float32 (uniform [0, 1)), np.uint8 (0..255), and
    np.int8 (-128..127). Any other dtype raises RuntimeError. The same seed
    always yields the same array.
    """
    generator = np.random.default_rng(seed)
    if dtype == np.float32:
        return generator.random((rows, dimensions), dtype=dtype)
    if dtype == np.uint8:
        # integers(): low inclusive, high exclusive
        return generator.integers(low=0, high=256, size=(rows, dimensions), dtype=dtype)
    if dtype == np.int8:
        # integers(): low inclusive, high exclusive
        return generator.integers(low=-128, high=128, size=(rows, dimensions), dtype=dtype)
    raise RuntimeError("Only np.float32, np.int8, and np.uint8 are supported")
def write_vectors(file_handler: BinaryIO, vectors: np.ndarray):
    """Serialize `vectors` in DiskANN binary layout to an open binary handle.

    Layout: the array's shape as int32 values (rows, dims for a 2-D array),
    immediately followed by the raw vector bytes.
    """
    shape_header = np.array(vectors.shape, dtype=np.int32)
    _ = file_handler.write(shape_header.tobytes())
    _ = file_handler.write(vectors.tobytes())
@contextmanager
def vectors_as_temp_file(vectors: np.ndarray) -> str:
    """Context manager yielding the path of a temp file holding `vectors`.

    The file is written via write_vectors, closed before the path is yielded,
    and unlinked after the with-block exits normally.
    """
    handle = NamedTemporaryFile(mode="wb", delete=False)
    write_vectors(handle, vectors)
    handle.close()
    # NOTE(review): if the with-block raises, cleanup below is skipped and the
    # temp file leaks -- presumably acceptable for test fixtures.
    yield handle.name
    Path(handle.name).unlink()

View File

@@ -0,0 +1,24 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT license.
import numpy as np
def calculate_recall(
    result_set_indices: np.ndarray, truth_set_indices: np.ndarray, recall_at: int = 5
) -> float:
    """Compute recall@k between approximate and exact nearest-neighbor results.

    Rows of ``result_set_indices`` (ANN results) and ``truth_set_indices``
    (ground truth) correspond by index; the first ``recall_at`` columns of each
    row are compared as sets.

    :param result_set_indices: 2-D array of approximate neighbor indices.
    :param truth_set_indices: 2-D array of ground-truth neighbor indices.
    :param recall_at: number of leading neighbors per row to compare.
    :return: fraction of true neighbors recovered, in [0, 1].
    """
    num_rows = result_set_indices.shape[0]
    hits = 0
    for row in range(num_rows):
        approx = set(result_set_indices[row][0:recall_at])
        exact = set(truth_set_indices[row][0:recall_at])
        hits += len(approx & exact)
    return hits / (num_rows * recall_at)