Initial commit

This commit is contained in:
yichuan520030910320
2025-06-30 09:05:05 +00:00
commit 46f6cc100b
1231 changed files with 278432 additions and 0 deletions

View File

@@ -0,0 +1,46 @@
<!-- Copyright (c) Microsoft Corporation. All rights reserved.
Licensed under the MIT license. -->
# Integration Tests
The following tests use Python to prepare, run, verify, and tear down the rest api services.
We do make use of the built-in `unittest` library, but only to take advantage of its test reporting.
These are decidedly **not** _unit_ tests. These are end to end integration tests.
## Caveats
This has only been built and tested on Linux, though we have written platform-agnostic Python for the smoke test
(e.g. using `os.path.join`).
It has been tested on Python 3.9 and 3.10, but should work on Python 3.6+.
## How to Run
First, build the DiskANN RestAPI code; see $REPOSITORY_ROOT/workflows/rest_api.md for detailed instructions.
```bash
cd tests/python
python3 -m venv venv
source venv/bin/activate
pip install -r requirements.txt
export DISKANN_BUILD_DIR=/path/to/your/diskann/build
python -m unittest
```
## Smoke Test Failed, Now What?
The smoke test written takes advantage of temporary directories that are only valid during the
lifetime of the test. The contents of these directories include:
- Randomized vectors (first in tsv, then bin form) used to build the PQFlashIndex
- The PQFlashIndex files
It is useful to keep these around. By setting some environment variables, you can control whether an ephemeral,
temporary directory is used (and deleted on test completion), or left as an exercise for the developer to
clean up.
The valid environment variables are:
- `DISKANN_REST_TEST_WORKING_DIR` (example: `$USER/DiskANNRestTest`)
- If this is specified, it **must exist** and **must be writeable**. Any existing files will be clobbered.
- `DISKANN_REST_SERVER` (example: `http://127.0.0.1:10067`)
- Note that if this is set, no data will be generated, nor will a server be started; it is presumed you have done
all the work of creating and starting the rest server prior to running the test, which then just submits requests against it.

View File

@@ -0,0 +1,67 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT license.
import numpy as np
import os
import subprocess
def output_vectors(
    diskann_build_path: str,
    temporary_file_path: str,
    vectors: np.ndarray,
    timeout: int = 60
) -> str:
    """Dump ``vectors`` as TSV and convert it to a binary file with ``tsv_to_bin``.

    Each row of ``vectors`` becomes one tab-separated line of ``vectors.tsv``
    inside ``temporary_file_path``. The ``tsv_to_bin`` executable found under
    ``<diskann_build_path>/apps/utils`` is then run to produce ``vectors.bin``
    in the same directory.

    Args:
        diskann_build_path: Directory containing the DiskANN build output.
        temporary_file_path: Directory that receives both output files.
        vectors: Two-dimensional array, one row per point.
        timeout: Seconds the conversion process is allowed to run.

    Returns:
        The path of the generated ``vectors.bin`` file.

    Raises:
        Exception: If ``tsv_to_bin`` exits with a non-zero return code.
        subprocess.TimeoutExpired: If the conversion exceeds ``timeout``.
    """
    tsv_path = os.path.join(temporary_file_path, "vectors.tsv")
    bin_path = os.path.join(temporary_file_path, "vectors.bin")
    converter = os.path.join(diskann_build_path, "apps", "utils", "tsv_to_bin")

    # Going through a text file is a stopgap: numpy can probably emit floats in
    # a form the C++ side reads directly, at which point this can be dropped.
    with open(tsv_path, "w") as tsv_file:
        for row in vectors:
            tsv_file.write("\t".join(map(str, row)) + "\n")

    number_of_points, dimensions = vectors.shape
    command = [converter, "float", tsv_path, bin_path]
    command += [str(dimensions), str(number_of_points)]

    completed = subprocess.run(command, timeout=timeout)
    if completed.returncode == 0:
        return bin_path
    raise Exception(f"Unable to convert tsv to binary using tsv_to_bin, completed_process: {completed}")
def build_ssd_index(
    diskann_build_path: str,
    temporary_file_path: str,
    vectors: np.ndarray,
    per_process_timeout: int = 60  # this may not be long enough if you're doing something larger
) -> None:
    """Build a disk index for ``vectors`` inside ``temporary_file_path``.

    The vectors are first written out through ``output_vectors``; the
    ``build_disk_index`` executable under ``<diskann_build_path>/apps`` is then
    run against the resulting binary file. Index files are written with the
    path prefix ``<temporary_file_path>/smoke_test``.

    Args:
        diskann_build_path: Directory containing the DiskANN build output.
        temporary_file_path: Working directory for the vector files and index.
        vectors: Two-dimensional array, one row per point.
        per_process_timeout: Seconds allowed for *each* child process (the
            conversion and the index build are timed separately).

    Raises:
        Exception: If the vector conversion or ``build_disk_index`` exits with
            a non-zero return code.
        subprocess.TimeoutExpired: If either child process exceeds
            ``per_process_timeout``.
    """
    vectors_as_bin_path = output_vectors(diskann_build_path, temporary_file_path, vectors, timeout=per_process_timeout)

    ssd_builder_path = os.path.join(diskann_build_path, "apps", "build_disk_index")
    args = [
        ssd_builder_path,
        "--data_type", "float",
        "--dist_fn", "l2",
        "--data_path", vectors_as_bin_path,
        "--index_path_prefix", os.path.join(temporary_file_path, "smoke_test"),
        "-R", "64",
        "-L", "100",
        "--search_DRAM_budget", "1",
        "--build_DRAM_budget", "1",
        "--num_threads", "1",
        "--PQ_disk_bytes", "0",
    ]

    # The builder's output must be captured for the failure message below to
    # have anything to report; without it stdout/stderr are always None.
    # Keyword spelling is kept compatible with Python 3.6 (no capture_output).
    completed = subprocess.run(
        args,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        universal_newlines=True,
        errors="replace",  # never let undecodable tool output mask the real failure
        timeout=per_process_timeout,
    )
    if completed.returncode != 0:
        command_run = " ".join(args)
        raise Exception(
            f"Unable to build a disk index with the command: '{command_run}'\n"
            f"return code: {completed.returncode}\n"
            f"stdout: {completed.stdout}\n"
            f"stderr: {completed.stderr}"
        )
    # index is now built inside of temporary_file_path