Initial commit

This commit is contained in:
yichuan520030910320
2025-06-30 09:05:05 +00:00
commit 46f6cc100b
1231 changed files with 278432 additions and 0 deletions

View File

@@ -0,0 +1,110 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT license.
# Build rules for the stand-alone command-line utilities.
# Every tool is compiled as C++17.
set(CMAKE_CXX_STANDARD 17)
# Treat compiler warnings as errors (this variable is honored by CMake 3.24 and newer).
set(CMAKE_COMPILE_WARNING_AS_ERROR ON)
# --- Format-conversion and data-generation tools ---
# Targets with no target_link_libraries() call are built from their single source file alone.
# The others link ${PROJECT_NAME} (a target defined outside this file), plus
# Boost::program_options where listed.
add_executable(fvecs_to_bin fvecs_to_bin.cpp)
add_executable(fvecs_to_bvecs fvecs_to_bvecs.cpp)
add_executable(rand_data_gen rand_data_gen.cpp)
target_link_libraries(rand_data_gen ${PROJECT_NAME} Boost::program_options)
add_executable(float_bin_to_int8 float_bin_to_int8.cpp)
add_executable(ivecs_to_bin ivecs_to_bin.cpp)
add_executable(count_bfs_levels count_bfs_levels.cpp)
target_link_libraries(count_bfs_levels ${PROJECT_NAME} Boost::program_options)
add_executable(tsv_to_bin tsv_to_bin.cpp)
add_executable(bin_to_tsv bin_to_tsv.cpp)
add_executable(int8_to_float int8_to_float.cpp)
target_link_libraries(int8_to_float ${PROJECT_NAME})
add_executable(int8_to_float_scale int8_to_float_scale.cpp)
target_link_libraries(int8_to_float_scale ${PROJECT_NAME})
add_executable(uint8_to_float uint8_to_float.cpp)
target_link_libraries(uint8_to_float ${PROJECT_NAME})
add_executable(uint32_to_uint8 uint32_to_uint8.cpp)
target_link_libraries(uint32_to_uint8 ${PROJECT_NAME})
# --- Analysis, recall and partitioning tools ---
# Several of these additionally receive DISKANN_TOOLS_TCMALLOC_LINK_OPTIONS and/or
# DISKANN_ASYNC_LIB; both variables are set outside this file.
add_executable(vector_analysis vector_analysis.cpp)
target_link_libraries(vector_analysis ${PROJECT_NAME} ${DISKANN_TOOLS_TCMALLOC_LINK_OPTIONS})
add_executable(gen_random_slice gen_random_slice.cpp)
target_link_libraries(gen_random_slice ${PROJECT_NAME} ${DISKANN_TOOLS_TCMALLOC_LINK_OPTIONS})
add_executable(simulate_aggregate_recall simulate_aggregate_recall.cpp)
add_executable(calculate_recall calculate_recall.cpp)
target_link_libraries(calculate_recall ${PROJECT_NAME} ${DISKANN_ASYNC_LIB} ${DISKANN_TOOLS_TCMALLOC_LINK_OPTIONS})
# The ground-truth tools are built here, outside the main DiskANN sources, because they depend on MKL.
# They are the only targets that receive the MKL include directories and link libraries.
add_executable(compute_groundtruth compute_groundtruth.cpp)
target_include_directories(compute_groundtruth PRIVATE ${DISKANN_MKL_INCLUDE_DIRECTORIES})
target_link_libraries(compute_groundtruth ${PROJECT_NAME} ${DISKANN_MKL_LINK_LIBRARIES} ${DISKANN_ASYNC_LIB} Boost::program_options)
add_executable(compute_groundtruth_for_filters compute_groundtruth_for_filters.cpp)
target_include_directories(compute_groundtruth_for_filters PRIVATE ${DISKANN_MKL_INCLUDE_DIRECTORIES})
target_link_libraries(compute_groundtruth_for_filters ${PROJECT_NAME} ${DISKANN_MKL_LINK_LIBRARIES} ${DISKANN_ASYNC_LIB} Boost::program_options)
# --- Index-construction helpers and label utilities ---
add_executable(generate_pq generate_pq.cpp)
target_link_libraries(generate_pq ${PROJECT_NAME} ${DISKANN_TOOLS_TCMALLOC_LINK_OPTIONS})
add_executable(partition_data partition_data.cpp)
target_link_libraries(partition_data ${PROJECT_NAME} ${DISKANN_TOOLS_TCMALLOC_LINK_OPTIONS})
add_executable(partition_with_ram_budget partition_with_ram_budget.cpp)
target_link_libraries(partition_with_ram_budget ${PROJECT_NAME} ${DISKANN_TOOLS_TCMALLOC_LINK_OPTIONS})
add_executable(merge_shards merge_shards.cpp)
target_link_libraries(merge_shards ${PROJECT_NAME} ${DISKANN_TOOLS_TCMALLOC_LINK_OPTIONS} ${DISKANN_ASYNC_LIB})
add_executable(create_disk_layout create_disk_layout.cpp)
target_link_libraries(create_disk_layout ${PROJECT_NAME} ${DISKANN_ASYNC_LIB} ${DISKANN_TOOLS_TCMALLOC_LINK_OPTIONS})
add_executable(generate_synthetic_labels generate_synthetic_labels.cpp)
target_link_libraries(generate_synthetic_labels ${PROJECT_NAME} Boost::program_options)
add_executable(stats_label_data stats_label_data.cpp)
target_link_libraries(stats_label_data ${PROJECT_NAME} Boost::program_options)
# Installation is skipped for MSVC builds. RUNTIME is given without a DESTINATION, so the
# install location is left to CMake's default for runtime artifacts.
# Keep this list in sync with the add_executable() calls above.
if (NOT MSVC)
include(GNUInstallDirs)
install(TARGETS fvecs_to_bin
fvecs_to_bvecs
rand_data_gen
float_bin_to_int8
ivecs_to_bin
count_bfs_levels
tsv_to_bin
bin_to_tsv
int8_to_float
int8_to_float_scale
uint8_to_float
uint32_to_uint8
vector_analysis
gen_random_slice
simulate_aggregate_recall
calculate_recall
compute_groundtruth
compute_groundtruth_for_filters
generate_pq
partition_data
partition_with_ram_budget
merge_shards
create_disk_layout
generate_synthetic_labels
stats_label_data
RUNTIME
)
endif()

View File

@@ -0,0 +1,63 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
#include <algorithm>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <vector>
#include "util.h"
// Converts one block of vectors from the flat .bin payload layout to the .fvecs layout.
//
// Reads `npts * ndims` floats from `reader` at its current position (the caller must already
// have consumed the .bin header) and appends `npts` fvecs records to `writer`. Each record is
// a 4-byte dimension count followed by `ndims` floats.
//
//   reader    - input stream over the .bin file
//   writer    - output stream for the .fvecs file
//   read_buf  - scratch space for at least npts * ndims floats
//   write_buf - scratch space for at least npts * (ndims + 1) floats
//   npts      - number of vectors in this block
//   ndims     - dimensionality of every vector
//
// The previous body called write() on the input stream and read() on the output stream and
// copied in the fvecs -> bin direction, so it could not compile or convert anything.
void block_convert(std::ifstream &reader, std::ofstream &writer, float *read_buf, float *write_buf, uint64_t npts,
                   uint64_t ndims)
{
    reader.read((char *)read_buf, npts * ndims * sizeof(float));

    // The per-record dimension prefix occupies one float-sized slot in write_buf.
    const unsigned ndims_u32 = (unsigned)ndims;
#pragma omp parallel for
    for (uint64_t i = 0; i < npts; i++)
    {
        float *record = write_buf + i * (ndims + 1);
        memcpy(record, &ndims_u32, sizeof(unsigned));
        memcpy(record + 1, read_buf + i * ndims, ndims * sizeof(float));
    }

    writer.write((char *)write_buf, npts * (ndims * sizeof(float) + sizeof(unsigned)));
}
// Entry point: converts a .bin file (int32 #points, int32 #dims, then row-major floats) into
// an .fvecs file, one block of up to 131072 vectors at a time.
//
// Usage: <prog> input_bin output_fvecs
//
// The previous body redeclared npts/ndims/ndims_u32, used the output stream before declaring
// it, referenced an undefined `fsize`, and rewound the input over the header it had just read.
int main(int argc, char **argv)
{
    if (argc != 3)
    {
        std::cout << argv[0] << " input_bin output_fvecs" << std::endl;
        exit(-1);
    }

    std::ifstream readr(argv[1], std::ios::binary);
    if (!readr.is_open())
    {
        std::cerr << "Error: could not open input file " << argv[1] << std::endl;
        exit(-1);
    }

    // .bin header: number of points followed by number of dimensions, both 32-bit.
    int npts_s32 = 0;
    int ndims_s32 = 0;
    readr.read((char *)&npts_s32, sizeof(int32_t));
    readr.read((char *)&ndims_s32, sizeof(int32_t));
    if (!readr || npts_s32 < 0 || ndims_s32 <= 0)
    {
        std::cerr << "Error: invalid .bin header in " << argv[1] << std::endl;
        exit(-1);
    }
    const uint64_t npts = (uint64_t)npts_s32;
    const uint64_t ndims = (uint64_t)ndims_s32;
    std::cout << "Dataset: #pts = " << npts << ", # dims = " << ndims << std::endl;

    const uint64_t blk_size = 131072;
    const uint64_t nblks = (npts + blk_size - 1) / blk_size; // ceiling division
    std::cout << "# blks: " << nblks << std::endl;

    std::ofstream writr(argv[2], std::ios::binary);
    if (!writr.is_open())
    {
        std::cerr << "Error: could not open output file " << argv[2] << std::endl;
        exit(-1);
    }

    // Only one block is resident at a time. Both buffers are sized for the larger (fvecs)
    // layout so they are big enough whichever role block_convert assigns them.
    const uint64_t max_blk_pts = std::min(npts, blk_size);
    std::vector<float> read_buf(max_blk_pts * (ndims + 1));
    std::vector<float> write_buf(max_blk_pts * (ndims + 1));

    for (uint64_t i = 0; i < nblks; i++)
    {
        const uint64_t cblk_size = std::min(npts - i * blk_size, blk_size);
        block_convert(readr, writr, read_buf.data(), write_buf.data(), cblk_size, ndims);
        if (!readr || !writr)
        {
            std::cerr << "Error: I/O failure while converting block #" << i << std::endl;
            exit(-1);
        }
        std::cout << "Block #" << i << " written" << std::endl;
    }

    writr.close();
    readr.close();
    return 0;
}

View File

@@ -0,0 +1,69 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
#include <iostream>
#include "utils.h"
// Reads `npts` vectors of `ndims` elements of type T from `reader` and writes them to
// `writer` as tab-separated text, one vector per line.
//
//   writer   - text output stream
//   reader   - binary input stream positioned at the first element of the block
//   read_buf - scratch space for at least npts * ndims elements of T
//
// Fixes: the read size now uses sizeof(T) (it was hard-coded to sizeof(float), so int8/uint8
// blocks consumed four times too many bytes), and 8-bit values are printed as numbers
// instead of raw characters.
template <class T>
void block_convert(std::ofstream &writer, std::ifstream &reader, T *read_buf, size_t npts, size_t ndims)
{
    reader.read((char *)read_buf, npts * ndims * sizeof(T));
    for (size_t i = 0; i < npts; i++)
    {
        const T *row = read_buf + i * ndims;
        for (size_t d = 0; d < ndims; d++)
        {
            // Unary plus promotes int8_t/uint8_t to int so the stream prints the numeric
            // value; float values are unaffected.
            writer << +row[d];
            if (d + 1 < ndims)
                writer << "\t";
            else
                writer << "\n";
        }
    }
}
// Entry point: dumps a .bin file (uint32 #points, uint32 #dims, then row-major elements) as
// tab-separated text.
//
// Usage: <prog> <float/int8/uint8> input_bin output_tsv
//
// Fixes: the type check compared against the misspelling "uin8" (so "uint8" was reported as
// unsupported), and the program carried on after reporting an unsupported type.
int main(int argc, char **argv)
{
    if (argc != 4)
    {
        std::cout << argv[0] << " <float/int8/uint8> input_bin output_tsv" << std::endl;
        exit(-1);
    }

    const std::string type_string(argv[1]);
    const bool is_float = (type_string == std::string("float"));
    const bool is_int8 = (type_string == std::string("int8"));
    const bool is_uint8 = (type_string == std::string("uint8"));
    if (!is_float && !is_int8 && !is_uint8)
    {
        std::cerr << "Error: type not supported. Use float/int8/uint8" << std::endl;
        exit(-1);
    }

    std::ifstream reader(argv[2], std::ios::binary);
    if (!reader.is_open())
    {
        std::cerr << "Error: could not open input file " << argv[2] << std::endl;
        exit(-1);
    }

    uint32_t npts_u32 = 0;
    uint32_t ndims_u32 = 0;
    reader.read((char *)&npts_u32, sizeof(uint32_t));
    reader.read((char *)&ndims_u32, sizeof(uint32_t));
    if (!reader)
    {
        std::cerr << "Error: could not read the header of " << argv[2] << std::endl;
        exit(-1);
    }
    size_t npts = npts_u32;
    size_t ndims = ndims_u32;
    std::cout << "Dataset: #pts = " << npts << ", # dims = " << ndims << std::endl;

    size_t blk_size = 131072;
    size_t nblks = ROUND_UP(npts, blk_size) / blk_size;

    std::ofstream writer(argv[3]);
    if (!writer.is_open())
    {
        std::cerr << "Error: could not open output file " << argv[3] << std::endl;
        exit(-1);
    }

    // One block of the widest supported element type (4 bytes per element).
    char *read_buf = new char[blk_size * ndims * 4];
    for (size_t i = 0; i < nblks; i++)
    {
        size_t cblk_size = std::min(npts - i * blk_size, blk_size);
        if (is_float)
            block_convert<float>(writer, reader, (float *)read_buf, cblk_size, ndims);
        else if (is_int8)
            block_convert<int8_t>(writer, reader, (int8_t *)read_buf, cblk_size, ndims);
        else if (is_uint8)
            block_convert<uint8_t>(writer, reader, (uint8_t *)read_buf, cblk_size, ndims);
        std::cout << "Block #" << i << " written" << std::endl;
    }
    delete[] read_buf;

    writer.close();
    reader.close();
    return 0;
}

View File

@@ -0,0 +1,55 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
#include <cstddef>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <set>
#include <string>
#include <vector>
#include "utils.h"
#include "disk_utils.h"
// Entry point: loads a ground-truth file and a results file and prints the average recall@r
// as computed by diskann::calculate_recall.
//
// Usage: <prog> <ground_truth_bin> <our_results_bin> <r>
//
// Returns 0 on success and -1 on bad arguments or mismatched inputs.
// Fix: <r> is now validated. Previously a non-numeric, zero or negative value was passed
// through std::atoi straight into an unsigned, giving recall@0 or a wrapped-around huge value.
int main(int argc, char **argv)
{
    if (argc != 4)
    {
        std::cout << argv[0] << " <ground_truth_bin> <our_results_bin> <r> " << std::endl;
        return -1;
    }

    const int recall_arg = std::atoi(argv[3]);
    if (recall_arg <= 0)
    {
        std::cout << "Error. <r> must be a positive integer; got " << argv[3] << std::endl;
        return -1;
    }
    const uint32_t recall_at = (uint32_t)recall_arg;

    // Buffers filled by load_truthset. They are not released here; the process exits right
    // after printing the result.
    uint32_t *gold_std = nullptr;
    float *gs_dist = nullptr;
    uint32_t *our_results = nullptr;
    float *or_dist = nullptr;
    size_t points_num_gs, points_num_or;
    size_t dim_gs;
    size_t dim_or;
    diskann::load_truthset(argv[1], gold_std, gs_dist, points_num_gs, dim_gs);
    diskann::load_truthset(argv[2], our_results, or_dist, points_num_or, dim_or);

    if (points_num_gs != points_num_or)
    {
        std::cout << "Error. Number of queries mismatch in ground truth and "
                     "our results"
                  << std::endl;
        return -1;
    }
    const size_t points_num = points_num_gs;

    // Both files must hold at least r entries per query.
    if ((dim_or < recall_at) || (recall_at > dim_gs))
    {
        std::cout << "ground truth has size " << dim_gs << "; our set has " << dim_or << " points. Asking for recall "
                  << recall_at << std::endl;
        return -1;
    }

    std::cout << "Calculating recall@" << recall_at << std::endl;
    double recall_val = diskann::calculate_recall((uint32_t)points_num, gold_std, gs_dist, (uint32_t)dim_gs,
                                                  our_results, (uint32_t)dim_or, recall_at);

    std::cout << "Avg. recall@" << recall_at << " is " << recall_val << "\n";
    return 0;
}

View File

@@ -0,0 +1,574 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
#include <string>
#include <iostream>
#include <fstream>
#include <cassert>
#include <vector>
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <random>
#include <limits>
#include <cstring>
#include <queue>
#include <omp.h>
#include <mkl.h>
#include <boost/program_options.hpp>
#include <unordered_map>
#include <tsl/robin_map.h>
#include <tsl/robin_set.h>
#ifdef _WINDOWS
#include <malloc.h>
#else
#include <stdlib.h>
#endif
#include "filter_utils.h"
#include "utils.h"
// Number of base points handled per part when the base file is processed in pieces.
// Point counts are read from file headers as signed 32-bit ints (not unsigned), so the tool
// works for up to about 2 billion points.
#define PARTSIZE 10000000
// Byte alignment requested from aligned_malloc for the float data buffers.
#define ALIGNMENT 512

// Aliases kept for readability.
using label_set = tsl::robin_set<std::string>;
using path = std::string;
namespace po = boost::program_options;
// Integer division rounded up: the smallest q such that q * denominator >= numerator
// (for non-negative operands). The denominator must be non-zero.
template <class T> T div_round_up(const T numerator, const T denominator)
{
    const T quotient = numerator / denominator;
    if (numerator % denominator == 0)
    {
        return quotient;
    }
    return 1 + quotient;
}
// (point index, distance) pair used while selecting nearest neighbours.
using pairIF = std::pair<size_t, float>;

// Orders pairs by distance so that std::priority_queue keeps the LARGEST distance on top.
struct cmpmaxstruct
{
    // const-qualified so the comparator can be invoked through a const object, as standard
    // containers and adaptors are allowed to do.
    bool operator()(const pairIF &l, const pairIF &r) const
    {
        return l.second < r.second;
    }
};

// Max-heap of (index, distance) pairs keyed on distance.
using maxPQIFCS = std::priority_queue<pairIF, std::vector<pairIF>, cmpmaxstruct>;
// Allocates uninitialized storage for `n` objects of type T, aligned to `alignment` bytes.
//
// `alignment` must be non-zero and a value the platform allocator accepts (a power of two).
// Returns whatever the underlying allocator returns, which may be null on failure; the
// caller owns the memory and must release it with the matching aligned-free routine.
//
// Fix: the byte count is rounded up to a multiple of `alignment`. aligned_alloc is only
// specified for sizes that are an integral multiple of the alignment, and the previous code
// passed sizeof(T) * n unmodified.
template <class T> T *aligned_malloc(const size_t n, const size_t alignment)
{
    const size_t requested_bytes = sizeof(T) * n;
    const size_t padded_bytes = ((requested_bytes + alignment - 1) / alignment) * alignment;
#ifdef _WINDOWS
    return (T *)_aligned_malloc(padded_bytes, alignment);
#else
    return static_cast<T *>(aligned_alloc(alignment, padded_bytes));
#endif
}
// Strict-weak ordering on (id, distance) pairs: true when `a` is closer than `b`.
// The id component is ignored.
inline bool custom_dist(const std::pair<uint32_t, float> &a, const std::pair<uint32_t, float> &b)
{
    const float lhs_dist = a.second;
    const float rhs_dist = b.second;
    return lhs_dist < rhs_dist;
}
// Stores the squared L2 norm of each of the `num_points` vectors in `matrix` into
// `points_l2sq`. Vector i occupies matrix[i * dim, (i + 1) * dim); its norm is computed as
// the BLAS dot product of the vector with itself.
void compute_l2sq(float *const points_l2sq, const float *const matrix, const int64_t num_points, const uint64_t dim)
{
    assert(points_l2sq != NULL);
#pragma omp parallel for schedule(static, 65536)
    for (int64_t row = 0; row < num_points; ++row)
    {
        const float *const vec = matrix + (ptrdiff_t)row * (ptrdiff_t)dim;
        points_l2sq[row] = cblas_sdot((int64_t)dim, vec, 1, vec, 1);
    }
}
// Fills `dist_matrix` (npoints x nqueries, column major) with squared L2 distances using
// the expansion |p - q|^2 = -2 p.q + |p|^2 + |q|^2, evaluated as three GEMM calls.
//
// `ones_vec`, when supplied, must hold at least max(npoints, nqueries) floats all equal to
// 1.0; when it is NULL a temporary vector is created and released here.
void distsq_to_points(const size_t dim,
                      float *dist_matrix, // Col Major, cols are queries, rows are points
                      size_t npoints, const float *const points,
                      const float *const points_l2sq, // points in Col major
                      size_t nqueries, const float *const queries,
                      const float *const queries_l2sq, // queries in Col major
                      float *ones_vec = NULL)          // Scratchspace of num_data size and init to 1.0
{
    const size_t ones_len = (nqueries > npoints) ? nqueries : npoints;

    // Non-null only when this call had to create its own vector of ones.
    float *owned_ones = NULL;
    if (ones_vec == NULL)
    {
        owned_ones = new float[ones_len];
        std::fill_n(owned_ones, ones_len, (float)1.0);
        ones_vec = owned_ones;
    }

    // dist = -2 * points^T * queries
    cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans, npoints, nqueries, dim, (float)-2.0, points, dim, queries, dim,
                (float)0.0, dist_matrix, npoints);
    // dist += points_l2sq * ones^T   (adds |p|^2 along every column)
    cblas_sgemm(CblasColMajor, CblasNoTrans, CblasTrans, npoints, nqueries, 1, (float)1.0, points_l2sq, npoints,
                ones_vec, nqueries, (float)1.0, dist_matrix, npoints);
    // dist += ones * queries_l2sq^T  (adds |q|^2 along every row)
    cblas_sgemm(CblasColMajor, CblasNoTrans, CblasTrans, npoints, nqueries, 1, (float)1.0, ones_vec, npoints,
                queries_l2sq, nqueries, (float)1.0, dist_matrix, npoints);

    // delete[] on a null pointer is a no-op, so a caller-supplied vector is left alone.
    delete[] owned_ones;
}
// Fills `dist_matrix` (npoints x nqueries, column major) with the NEGATED inner product of
// every point with every query, so that smaller values mean more similar vectors.
//
// `ones_vec` is kept only so the signature mirrors distsq_to_points; it is not read. The
// previous body allocated and filled a max(npoints, nqueries)-element vector of ones on
// every call when it was NULL and then freed it unused; that dead work has been removed.
void inner_prod_to_points(const size_t dim,
                          float *dist_matrix, // Col Major, cols are queries, rows are points
                          size_t npoints, const float *const points, size_t nqueries, const float *const queries,
                          float *ones_vec = NULL) // Scratchspace of num_data size and init to 1.0
{
    (void)ones_vec; // intentionally unused

    // dist = -1 * points^T * queries
    cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans, npoints, nqueries, dim, (float)-1.0, points, dim, queries, dim,
                (float)0.0, dist_matrix, npoints);
}
// Brute-force k-nearest-neighbour search of every query against every point.
//
// Distances are produced one batch of 512 queries at a time into a dense
// (batch x npoints) matrix via distsq_to_points / inner_prod_to_points, and the k smallest
// entries per query are then selected with a max-heap.
//
// Output: for query q, slots [q * k, (q + 1) * k) of closest_points / dist_closest_points
// receive the point indices / distances in ascending distance order. For INNER_PRODUCT the
// stored value is the negated inner product (see inner_prod_to_points).
//
// NOTE(review): the selection loop seeds the heap from the first k points and then calls
// top() unconditionally, so this appears to require 1 <= k <= npoints - confirm at callers.
void exact_knn(const size_t dim, const size_t k,
size_t *const closest_points, // k * num_queries preallocated, col
// major, queries columns
float *const dist_closest_points, // k * num_queries
// preallocated, Dist to
// corresponding closes_points
size_t npoints,
float *points_in, // points in Col major
size_t nqueries, float *queries_in,
diskann::Metric metric = diskann::Metric::L2) // queries in Col major
{
// Squared norms of the raw inputs; recomputed below if the inputs get normalized.
float *points_l2sq = new float[npoints];
float *queries_l2sq = new float[nqueries];
compute_l2sq(points_l2sq, points_in, npoints, dim);
compute_l2sq(queries_l2sq, queries_in, nqueries, dim);
// By default operate directly on the caller's buffers.
float *points = points_in;
float *queries = queries_in;
if (metric == diskann::Metric::COSINE)
{ // we convert cosine distance as
// normalized L2 distnace
// Work on normalized copies so the caller's data is left untouched.
points = new float[npoints * dim];
queries = new float[nqueries * dim];
#pragma omp parallel for schedule(static, 4096)
for (int64_t i = 0; i < (int64_t)npoints; i++)
{
float norm = std::sqrt(points_l2sq[i]);
// Avoid dividing by zero for all-zero vectors.
if (norm == 0)
{
norm = std::numeric_limits<float>::epsilon();
}
for (uint32_t j = 0; j < dim; j++)
{
points[i * dim + j] = points_in[i * dim + j] / norm;
}
}
#pragma omp parallel for schedule(static, 4096)
for (int64_t i = 0; i < (int64_t)nqueries; i++)
{
float norm = std::sqrt(queries_l2sq[i]);
// Avoid dividing by zero for all-zero vectors.
if (norm == 0)
{
norm = std::numeric_limits<float>::epsilon();
}
for (uint32_t j = 0; j < dim; j++)
{
queries[i * dim + j] = queries_in[i * dim + j] / norm;
}
}
// recalculate norms after normalizing, they should all be one.
compute_l2sq(points_l2sq, points, npoints, dim);
compute_l2sq(queries_l2sq, queries, nqueries, dim);
}
std::cout << "Going to compute " << k << " NNs for " << nqueries << " queries over " << npoints << " points in "
<< dim << " dimensions using";
if (metric == diskann::Metric::INNER_PRODUCT)
std::cout << " MIPS ";
else if (metric == diskann::Metric::COSINE)
std::cout << " Cosine ";
else
std::cout << " L2 ";
std::cout << "distance fn. " << std::endl;
// Queries are processed 512 at a time; dist_matrix holds one batch against all points.
size_t q_batch_size = (1 << 9);
float *dist_matrix = new float[(size_t)q_batch_size * (size_t)npoints];
for (size_t b = 0; b < div_round_up(nqueries, q_batch_size); ++b)
{
// Half-open query range [q_b, q_e) covered by this batch; the last batch may be short.
int64_t q_b = b * q_batch_size;
int64_t q_e = ((b + 1) * q_batch_size > nqueries) ? nqueries : (b + 1) * q_batch_size;
if (metric == diskann::Metric::L2 || metric == diskann::Metric::COSINE)
{
distsq_to_points(dim, dist_matrix, npoints, points, points_l2sq, q_e - q_b,
queries + (ptrdiff_t)q_b * (ptrdiff_t)dim, queries_l2sq + q_b);
}
else
{
inner_prod_to_points(dim, dist_matrix, npoints, points, q_e - q_b,
queries + (ptrdiff_t)q_b * (ptrdiff_t)dim);
}
std::cout << "Computed distances for queries: [" << q_b << "," << q_e << ")" << std::endl;
#pragma omp parallel for schedule(dynamic, 16)
for (long long q = q_b; q < q_e; q++)
{
// Max-heap holding the k best candidates seen so far for this query.
maxPQIFCS point_dist;
// Seed the heap with the first k points.
for (size_t p = 0; p < k; p++)
point_dist.emplace(p, dist_matrix[(ptrdiff_t)p + (ptrdiff_t)(q - q_b) * (ptrdiff_t)npoints]);
// Admit a later point only if it beats the current worst, then trim back to k entries.
for (size_t p = k; p < npoints; p++)
{
if (point_dist.top().second > dist_matrix[(ptrdiff_t)p + (ptrdiff_t)(q - q_b) * (ptrdiff_t)npoints])
point_dist.emplace(p, dist_matrix[(ptrdiff_t)p + (ptrdiff_t)(q - q_b) * (ptrdiff_t)npoints]);
if (point_dist.size() > k)
point_dist.pop();
}
// The heap pops the largest distance first, so fill this query's output slots back to front.
for (ptrdiff_t l = 0; l < (ptrdiff_t)k; ++l)
{
closest_points[(ptrdiff_t)(k - 1 - l) + (ptrdiff_t)q * (ptrdiff_t)k] = point_dist.top().first;
dist_closest_points[(ptrdiff_t)(k - 1 - l) + (ptrdiff_t)q * (ptrdiff_t)k] = point_dist.top().second;
point_dist.pop();
}
assert(std::is_sorted(dist_closest_points + (ptrdiff_t)q * (ptrdiff_t)k,
dist_closest_points + (ptrdiff_t)(q + 1) * (ptrdiff_t)k));
}
std::cout << "Computed exact k-NN for queries: [" << q_b << "," << q_e << ")" << std::endl;
}
delete[] dist_matrix;
delete[] points_l2sq;
delete[] queries_l2sq;
// The normalized copies exist only for the cosine metric.
if (metric == diskann::Metric::COSINE)
{
delete[] points;
delete[] queries;
}
}
// Reads the header of a .bin file and returns how many PARTSIZE-point parts are needed to
// cover all of its points (ceiling of #points / PARTSIZE).
//
// The stream is opened with failbit/badbit exceptions enabled, so a missing or truncated
// file raises std::ios_base::failure. The template parameter is not used in the body.
//
// Change: the part count is computed with integer arithmetic only. The previous expression
// sent an already-truncated integer quotient through std::floor and a double -> uint32_t
// cast, which added nothing and relied on <cmath> being pulled in indirectly.
template <typename T> inline int get_num_parts(const char *filename)
{
    std::ifstream reader;
    reader.exceptions(std::ios::failbit | std::ios::badbit);
    reader.open(filename, std::ios::binary);
    std::cout << "Reading bin file " << filename << " ...\n";

    int npts_i32, ndims_i32;
    reader.read((char *)&npts_i32, sizeof(int));
    reader.read((char *)&ndims_i32, sizeof(int));
    std::cout << "#pts = " << npts_i32 << ", #dims = " << ndims_i32 << std::endl;
    reader.close();

    const int full_parts = npts_i32 / PARTSIZE;
    const int num_parts = (npts_i32 % PARTSIZE) == 0 ? full_parts : full_parts + 1;
    std::cout << "Number of parts: " << num_parts << std::endl;
    return num_parts;
}
// Loads part `part_num` of a .bin file whose elements have type T and converts it to float.
//
// Parts are PARTSIZE points each; the last part may be shorter. On return `npts` is the
// number of points in the part, `ndims` the dimensionality from the header, and `data`
// points at a freshly allocated buffer (aligned_malloc with ALIGNMENT) of npts * ndims
// floats which the caller must release.
// The stream has failbit/badbit exceptions enabled, so I/O errors throw.
template <typename T>
inline void load_bin_as_float(const char *filename, float *&data, size_t &npts, size_t &ndims, int part_num)
{
    std::ifstream reader;
    reader.exceptions(std::ios::failbit | std::ios::badbit);
    reader.open(filename, std::ios::binary);
    std::cout << "Reading bin file " << filename << " ...\n";

    int npts_i32, ndims_i32;
    reader.read((char *)&npts_i32, sizeof(int));
    reader.read((char *)&ndims_i32, sizeof(int));

    // Half-open range of point ids [first_id, last_id) covered by this part.
    const uint64_t first_id = part_num * PARTSIZE;
    const uint64_t last_id = (std::min)(first_id + PARTSIZE, (uint64_t)npts_i32);
    npts = last_id - first_id;
    ndims = (uint64_t)ndims_i32;
    std::cout << "#pts in part = " << npts << ", #dims = " << ndims << ", size = " << npts * ndims * sizeof(T) << "B"
              << std::endl;

    // Skip the two-word header plus every point that belongs to an earlier part.
    reader.seekg(first_id * ndims * sizeof(T) + 2 * sizeof(uint32_t), std::ios::beg);
    T *raw = new T[npts * ndims];
    reader.read((char *)raw, sizeof(T) * npts * ndims);
    std::cout << "Finished reading part of the bin file." << std::endl;
    reader.close();

    data = aligned_malloc<float>(npts * ndims, ALIGNMENT);
#pragma omp parallel for schedule(dynamic, 32768)
    for (int64_t i = 0; i < (int64_t)npts; i++)
    {
        for (int64_t j = 0; j < (int64_t)ndims; j++)
        {
            data[i * ndims + j] = (float)raw[i * ndims + j];
        }
    }
    delete[] raw;
    std::cout << "Finished converting part data to float." << std::endl;
}
// Writes `data` (npts x ndims elements of T, row after row) to `filename` in .bin format:
// two ints (npts, ndims) followed by the raw element bytes.
// The stream has failbit/badbit exceptions enabled, so I/O errors throw.
template <typename T> inline void save_bin(const std::string filename, T *data, size_t npts, size_t ndims)
{
    std::ofstream writer;
    writer.exceptions(std::ios::failbit | std::ios::badbit);
    writer.open(filename, std::ios::binary | std::ios::out);
    std::cout << "Writing bin: " << filename << "\n";

    // Header: point count then dimension count, each narrowed to int.
    int header[2] = {(int)npts, (int)ndims};
    writer.write((char *)&header[0], sizeof(int));
    writer.write((char *)&header[1], sizeof(int));

    const size_t payload_bytes = npts * ndims * sizeof(T);
    std::cout << "bin: #pts = " << npts << ", #dims = " << ndims << ", size = " << payload_bytes + 2 * sizeof(int)
              << "B" << std::endl;

    writer.write((char *)data, payload_bytes);
    writer.close();
    std::cout << "Finished writing bin" << std::endl;
}
// Writes a truthset to `filename` as a single file laid out as:
//   int npts, int ndims, npts * ndims 32-bit ids, npts * ndims float distances.
//
// Change: the stream now has failbit/badbit exceptions enabled, matching save_bin.
// Previously a failed open or write went unnoticed and "Finished writing truthset" was
// printed even though no usable file had been produced. Failures now surface as
// std::ios_base::failure.
inline void save_groundtruth_as_one_file(const std::string filename, int32_t *data, float *distances, size_t npts,
                                         size_t ndims)
{
    std::ofstream writer;
    writer.exceptions(std::ios::failbit | std::ios::badbit);
    writer.open(filename, std::ios::binary | std::ios::out);

    int npts_i32 = (int)npts, ndims_i32 = (int)ndims;
    writer.write((char *)&npts_i32, sizeof(int));
    writer.write((char *)&ndims_i32, sizeof(int));
    std::cout << "Saving truthset in one file (npts, dim, npts*dim id-matrix, "
                 "npts*dim dist-matrix) with npts = "
              << npts << ", dim = " << ndims << ", size = " << 2 * npts * ndims * sizeof(uint32_t) + 2 * sizeof(int)
              << "B" << std::endl;

    writer.write((char *)data, npts * ndims * sizeof(uint32_t));
    writer.write((char *)distances, npts * ndims * sizeof(float));
    writer.close();
    std::cout << "Finished writing truthset" << std::endl;
}
// Runs exact k-NN of all queries against the base file one PARTSIZE-point part at a time
// and returns, per query, the concatenated candidates from every part as
// (global point id, distance) pairs. The per-query lists are NOT sorted or truncated here.
//
// `npoints` and `dim` are overwritten with the values of the last part loaded. When
// `location_to_tag` is non-empty, candidates whose entry in it equals 0 are skipped.
//
// Fixes:
//  * The tag filter indexed closest_points_part with stride k while exact_knn fills it
//    with stride part_k; the two differ whenever a part has fewer than k points, so the
//    filter then looked at the wrong candidate. Both lookups now use part_k.
//  * start_id is computed in size_t rather than int arithmetic.
//  * The per-part result buffers are vectors, so they are released even if exact_knn throws.
template <typename T>
std::vector<std::vector<std::pair<uint32_t, float>>> processUnfilteredParts(const std::string &base_file,
                                                                            size_t &nqueries, size_t &npoints,
                                                                            size_t &dim, size_t &k, float *query_data,
                                                                            const diskann::Metric &metric,
                                                                            std::vector<uint32_t> &location_to_tag)
{
    float *base_data = nullptr;
    int num_parts = get_num_parts<T>(base_file.c_str());
    std::vector<std::vector<std::pair<uint32_t, float>>> res(nqueries);

    for (int p = 0; p < num_parts; p++)
    {
        // Global id of the first point in this part.
        const size_t start_id = (size_t)p * PARTSIZE;
        load_bin_as_float<T>(base_file.c_str(), base_data, npoints, dim, p);

        // A part cannot yield more neighbours than it has points.
        const size_t part_k = k < npoints ? k : npoints;
        std::vector<size_t> closest_points_part(nqueries * part_k);
        std::vector<float> dist_closest_points_part(nqueries * part_k);

        exact_knn(dim, part_k, closest_points_part.data(), dist_closest_points_part.data(), npoints, base_data,
                  nqueries, query_data, metric);

        for (size_t i = 0; i < nqueries; i++)
        {
            for (size_t j = 0; j < part_k; j++)
            {
                const size_t global_id = closest_points_part[i * part_k + j] + start_id;
                if (!location_to_tag.empty() && location_to_tag[global_id] == 0)
                    continue;

                res[i].push_back(std::make_pair((uint32_t)global_id, dist_closest_points_part[i * part_k + j]));
            }
        }

        diskann::aligned_free(base_data);
    }
    return res;
}
// Computes the exact k nearest neighbours of every query in `query_file` against
// `base_file` (element type T) and writes them to `gt_file` with
// save_groundtruth_as_one_file. Returns 0; I/O failures propagate as exceptions.
//
// When `tags_file` is non-empty the written ids are the tags looked up through
// diskann::loadTags instead of raw locations. For INNER_PRODUCT the stored distance is
// negated back so the file holds the actual inner product.
//
// Fix: the output buffers are zero-initialised. They used to come from plain new[], so for
// any query with fewer than k results the trailing slots were written to disk as
// uninitialised memory; they are now written as id 0 / distance 0 (still flagged by the
// warning below).
template <typename T>
int aux_main(const std::string &base_file, const std::string &query_file, const std::string &gt_file, size_t k,
             const diskann::Metric &metric, const std::string &tags_file = std::string(""))
{
    size_t npoints, nqueries, dim;

    float *query_data;
    load_bin_as_float<T>(query_file.c_str(), query_data, nqueries, dim, 0);
    // NOTE(review): load_bin_as_float(part 0) already caps nqueries at PARTSIZE, so this
    // warning looks unreachable as written - confirm.
    if (nqueries > PARTSIZE)
        std::cerr << "WARNING: #Queries provided (" << nqueries << ") is greater than " << PARTSIZE
                  << ". Computing GT only for the first " << PARTSIZE << " queries." << std::endl;

    // load tags
    const bool tags_enabled = !tags_file.empty();
    std::vector<uint32_t> location_to_tag = diskann::loadTags(tags_file, base_file);

    std::vector<int32_t> closest_points(nqueries * k, 0);
    std::vector<float> dist_closest_points(nqueries * k, 0.0f);

    std::vector<std::vector<std::pair<uint32_t, float>>> results =
        processUnfilteredParts<T>(base_file, nqueries, npoints, dim, k, query_data, metric, location_to_tag);

    for (size_t i = 0; i < nqueries; i++)
    {
        // Merge the per-part candidates: sort by distance and keep the best k.
        std::vector<std::pair<uint32_t, float>> &cur_res = results[i];
        std::sort(cur_res.begin(), cur_res.end(), custom_dist);

        const size_t found = (std::min)(k, cur_res.size());
        for (size_t j = 0; j < found; j++)
        {
            const std::pair<uint32_t, float> &entry = cur_res[j];
            if (tags_enabled)
            {
                std::uint32_t index_with_tag = location_to_tag[entry.first];
                closest_points[i * k + j] = (int32_t)index_with_tag;
            }
            else
            {
                closest_points[i * k + j] = (int32_t)entry.first;
            }

            // Inner products were negated for the search; undo that for the output.
            if (metric == diskann::Metric::INNER_PRODUCT)
                dist_closest_points[i * k + j] = -entry.second;
            else
                dist_closest_points[i * k + j] = entry.second;
        }
        if (found < k)
            std::cout << "WARNING: found less than k GT entries for query " << i << std::endl;
    }

    save_groundtruth_as_one_file(gt_file, closest_points.data(), dist_closest_points.data(), nqueries, k);
    diskann::aligned_free(query_data);
    return 0;
}
// Loads a truthset file: int npts, int dim, npts * dim 32-bit ids, optionally followed by
// npts * dim float distances. Which of the two layouts is present is decided purely from
// the file size.
//
// On success `ids` (and `dists`, when distances are present) point at new[]-allocated
// arrays owned by the caller; `dists` is left untouched for an ids-only file.
// Throws diskann::ANNException when the file size matches neither layout.
void load_truthset(const std::string &bin_file, uint32_t *&ids, float *&dists, size_t &npts, size_t &dim)
{
    const size_t read_blk_size = 64 * 1024 * 1024;
    cached_ifstream reader(bin_file, read_blk_size);
    diskann::cout << "Reading truthset file " << bin_file.c_str() << " ..." << std::endl;
    const size_t actual_file_size = reader.get_file_size();

    int npts_i32, dim_i32;
    reader.read((char *)&npts_i32, sizeof(int));
    reader.read((char *)&dim_i32, sizeof(int));
    npts = (uint32_t)npts_i32;
    dim = (uint32_t)dim_i32;

    diskann::cout << "Metadata: #pts = " << npts << ", #dims = " << dim << "... " << std::endl;

    const size_t num_entries = npts * dim;
    const size_t header_bytes = 2 * sizeof(uint32_t);
    const size_t expected_file_size_with_dists = 2 * num_entries * sizeof(uint32_t) + header_bytes;
    const size_t expected_file_size_just_ids = num_entries * sizeof(uint32_t) + header_bytes;

    // 1 means truthset has ids and distances, 2 means only ids, -1 is error.
    // The ids-only layout wins if both sizes happen to coincide.
    int truthset_type = -1;
    if (actual_file_size == expected_file_size_just_ids)
        truthset_type = 2;
    else if (actual_file_size == expected_file_size_with_dists)
        truthset_type = 1;

    if (truthset_type == -1)
    {
        std::stringstream stream;
        stream << "Error. File size mismatch. File should have bin format, with "
                  "npts followed by ngt followed by npts*ngt ids and optionally "
                  "followed by npts*ngt distance values; actual size: "
               << actual_file_size << ", expected: " << expected_file_size_with_dists << " or "
               << expected_file_size_just_ids;
        diskann::cout << stream.str();
        throw diskann::ANNException(stream.str(), -1, __FUNCSIG__, __FILE__, __LINE__);
    }

    ids = new uint32_t[num_entries];
    reader.read((char *)ids, num_entries * sizeof(uint32_t));

    if (truthset_type == 1)
    {
        dists = new float[num_entries];
        reader.read((char *)dists, num_entries * sizeof(float));
    }
}
int main(int argc, char **argv)
{
std::string data_type, dist_fn, base_file, query_file, gt_file, tags_file;
uint64_t K;
try
{
po::options_description desc{"Arguments"};
desc.add_options()("help,h", "Print information on arguments");
desc.add_options()("data_type", po::value<std::string>(&data_type)->required(), "data type <int8/uint8/float>");
desc.add_options()("dist_fn", po::value<std::string>(&dist_fn)->required(),
"distance function <l2/mips/cosine>");
desc.add_options()("base_file", po::value<std::string>(&base_file)->required(),
"File containing the base vectors in binary format");
desc.add_options()("query_file", po::value<std::string>(&query_file)->required(),
"File containing the query vectors in binary format");
desc.add_options()("gt_file", po::value<std::string>(&gt_file)->required(),
"File name for the writing ground truth in binary "
"format, please don' append .bin at end if "
"no filter_label or filter_label_file is provided it "
"will save the file with '.bin' at end."
"else it will save the file as filename_label.bin");
desc.add_options()("K", po::value<uint64_t>(&K)->required(),
"Number of ground truth nearest neighbors to compute");
desc.add_options()("tags_file", po::value<std::string>(&tags_file)->default_value(std::string()),
"File containing the tags in binary format");
po::variables_map vm;
po::store(po::parse_command_line(argc, argv, desc), vm);
if (vm.count("help"))
{
std::cout << desc;
return 0;
}
po::notify(vm);
}
catch (const std::exception &ex)
{
std::cerr << ex.what() << '\n';
return -1;
}
if (data_type != std::string("float") && data_type != std::string("int8") && data_type != std::string("uint8"))
{
std::cout << "Unsupported type. float, int8 and uint8 types are supported." << std::endl;
return -1;
}
diskann::Metric metric;
if (dist_fn == std::string("l2"))
{
metric = diskann::Metric::L2;
}
else if (dist_fn == std::string("mips"))
{
metric = diskann::Metric::INNER_PRODUCT;
}
else if (dist_fn == std::string("cosine"))
{
metric = diskann::Metric::COSINE;
}
else
{
std::cerr << "Unsupported distance function. Use l2/mips/cosine." << std::endl;
return -1;
}
try
{
if (data_type == std::string("float"))
aux_main<float>(base_file, query_file, gt_file, K, metric, tags_file);
if (data_type == std::string("int8"))
aux_main<int8_t>(base_file, query_file, gt_file, K, metric, tags_file);
if (data_type == std::string("uint8"))
aux_main<uint8_t>(base_file, query_file, gt_file, K, metric, tags_file);
}
catch (const std::exception &e)
{
std::cout << std::string(e.what()) << std::endl;
diskann::cerr << "Compute GT failed." << std::endl;
return -1;
}
}

View File

@@ -0,0 +1,919 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
#include <string>
#include <iostream>
#include <fstream>
#include <cassert>
#include <vector>
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <random>
#include <limits>
#include <cstring>
#include <queue>
#include <omp.h>
#include <mkl.h>
#include <boost/program_options.hpp>
#include <unordered_map>
#include <tsl/robin_map.h>
#include <tsl/robin_set.h>
#ifdef _WINDOWS
#include <malloc.h>
#else
#include <stdlib.h>
#endif
#include "filter_utils.h"
#include "utils.h"
// Part size, in points. Point counts are held in signed ints (not unsigned), so the tool
// works for up to about 2 billion points.
#define PARTSIZE 10000000
// Alignment constant; presumably passed to aligned_malloc later in this file - confirm.
#define ALIGNMENT 512

// Aliases kept for readability.
using label_set = tsl::robin_set<std::string>;
using path = std::string;
namespace po = boost::program_options;
// Integer division rounded up: the smallest q such that q * denominator >= numerator
// (for non-negative operands). The denominator must be non-zero.
template <class T> T div_round_up(const T numerator, const T denominator)
{
    const T quotient = numerator / denominator;
    if (numerator % denominator == 0)
    {
        return quotient;
    }
    return 1 + quotient;
}
// (point index, distance) pair used while selecting nearest neighbours.
using pairIF = std::pair<size_t, float>;

// Orders pairs by distance so that std::priority_queue keeps the LARGEST distance on top.
struct cmpmaxstruct
{
    // const-qualified so the comparator can be invoked through a const object, as standard
    // containers and adaptors are allowed to do.
    bool operator()(const pairIF &l, const pairIF &r) const
    {
        return l.second < r.second;
    }
};

// Max-heap of (index, distance) pairs keyed on distance.
using maxPQIFCS = std::priority_queue<pairIF, std::vector<pairIF>, cmpmaxstruct>;
// Allocates uninitialized storage for `n` objects of type T, aligned to `alignment` bytes.
//
// `alignment` must be non-zero and a value the platform allocator accepts (a power of two).
// Returns whatever the underlying allocator returns, which may be null on failure; the
// caller owns the memory and must release it with the matching aligned-free routine.
//
// Fix: the byte count is rounded up to a multiple of `alignment`. aligned_alloc is only
// specified for sizes that are an integral multiple of the alignment, and the previous code
// passed sizeof(T) * n unmodified.
template <class T> T *aligned_malloc(const size_t n, const size_t alignment)
{
    const size_t requested_bytes = sizeof(T) * n;
    const size_t padded_bytes = ((requested_bytes + alignment - 1) / alignment) * alignment;
#ifdef _WINDOWS
    return (T *)_aligned_malloc(padded_bytes, alignment);
#else
    return static_cast<T *>(aligned_alloc(alignment, padded_bytes));
#endif
}
// Strict-weak ordering on (id, distance) pairs: true when `a` is closer than `b`.
// The id component is ignored.
inline bool custom_dist(const std::pair<uint32_t, float> &a, const std::pair<uint32_t, float> &b)
{
    const float lhs_dist = a.second;
    const float rhs_dist = b.second;
    return lhs_dist < rhs_dist;
}
// Stores the squared L2 norm of each of the `num_points` vectors in `matrix` into
// `points_l2sq`. Vector i occupies matrix[i * dim, (i + 1) * dim); its norm is computed as
// the BLAS dot product of the vector with itself.
void compute_l2sq(float *const points_l2sq, const float *const matrix, const int64_t num_points, const uint64_t dim)
{
    assert(points_l2sq != NULL);
#pragma omp parallel for schedule(static, 65536)
    for (int64_t row = 0; row < num_points; ++row)
    {
        const float *const vec = matrix + (ptrdiff_t)row * (ptrdiff_t)dim;
        points_l2sq[row] = cblas_sdot((int64_t)dim, vec, 1, vec, 1);
    }
}
// Fills `dist_matrix` (npoints x nqueries, column major) with squared L2 distances using
// the expansion |p - q|^2 = -2 p.q + |p|^2 + |q|^2, evaluated as three GEMM calls.
//
// `ones_vec`, when supplied, must hold at least max(npoints, nqueries) floats all equal to
// 1.0; when it is NULL a temporary vector is created and released here.
void distsq_to_points(const size_t dim,
                      float *dist_matrix, // Col Major, cols are queries, rows are points
                      size_t npoints, const float *const points,
                      const float *const points_l2sq, // points in Col major
                      size_t nqueries, const float *const queries,
                      const float *const queries_l2sq, // queries in Col major
                      float *ones_vec = NULL)          // Scratchspace of num_data size and init to 1.0
{
    const size_t ones_len = (nqueries > npoints) ? nqueries : npoints;

    // Non-null only when this call had to create its own vector of ones.
    float *owned_ones = NULL;
    if (ones_vec == NULL)
    {
        owned_ones = new float[ones_len];
        std::fill_n(owned_ones, ones_len, (float)1.0);
        ones_vec = owned_ones;
    }

    // dist = -2 * points^T * queries
    cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans, npoints, nqueries, dim, (float)-2.0, points, dim, queries, dim,
                (float)0.0, dist_matrix, npoints);
    // dist += points_l2sq * ones^T   (adds |p|^2 along every column)
    cblas_sgemm(CblasColMajor, CblasNoTrans, CblasTrans, npoints, nqueries, 1, (float)1.0, points_l2sq, npoints,
                ones_vec, nqueries, (float)1.0, dist_matrix, npoints);
    // dist += ones * queries_l2sq^T  (adds |q|^2 along every row)
    cblas_sgemm(CblasColMajor, CblasNoTrans, CblasTrans, npoints, nqueries, 1, (float)1.0, ones_vec, npoints,
                queries_l2sq, nqueries, (float)1.0, dist_matrix, npoints);

    // delete[] on a null pointer is a no-op, so a caller-supplied vector is left alone.
    delete[] owned_ones;
}
// Fills `dist_matrix` (npoints x nqueries, column major) with the NEGATED inner product of
// every point with every query, so that smaller values mean more similar vectors.
//
// `ones_vec` is kept only so the signature mirrors distsq_to_points; it is not read. The
// previous body allocated and filled a max(npoints, nqueries)-element vector of ones on
// every call when it was NULL and then freed it unused; that dead work has been removed.
void inner_prod_to_points(const size_t dim,
                          float *dist_matrix, // Col Major, cols are queries, rows are points
                          size_t npoints, const float *const points, size_t nqueries, const float *const queries,
                          float *ones_vec = NULL) // Scratchspace of num_data size and init to 1.0
{
    (void)ones_vec; // intentionally unused

    // dist = -1 * points^T * queries
    cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans, npoints, nqueries, dim, (float)-1.0, points, dim, queries, dim,
                (float)0.0, dist_matrix, npoints);
}
// Brute-force k nearest neighbours of each query among npoints points.
//   dim                 - coordinates per vector
//   k                   - neighbours to keep per query
//   closest_points      - output ids, preallocated k * nqueries; query q owns
//                         slots [q * k, (q + 1) * k)
//   dist_closest_points - output distances, same layout as closest_points
//   points_in           - npoints vectors, dim floats each
//   queries_in          - nqueries vectors, dim floats each
//   metric              - L2 (default), COSINE or INNER_PRODUCT
// COSINE is handled by normalising copies of both inputs and then using the
// squared-L2 path; INNER_PRODUCT uses inner_prod_to_points.
void exact_knn(const size_t dim, const size_t k,
               size_t *const closest_points,
               float *const dist_closest_points,
               size_t npoints,
               float *points_in,
               size_t nqueries, float *queries_in,
               diskann::Metric metric = diskann::Metric::L2)
{
    const bool is_cosine = (metric == diskann::Metric::COSINE);
    const bool is_mips = (metric == diskann::Metric::INNER_PRODUCT);

    float *points_l2sq = new float[npoints];
    float *queries_l2sq = new float[nqueries];
    compute_l2sq(points_l2sq, points_in, npoints, dim);
    compute_l2sq(queries_l2sq, queries_in, nqueries, dim);

    float *points = points_in;
    float *queries = queries_in;

    if (is_cosine)
    {
        // Cosine distance is evaluated as L2 distance between unit vectors.
        points = new float[npoints * dim];
        queries = new float[nqueries * dim];
#pragma omp parallel for schedule(static, 4096)
        for (int64_t i = 0; i < (int64_t)npoints; i++)
        {
            float norm = std::sqrt(points_l2sq[i]);
            if (norm == 0)
            {
                // Avoid dividing an all-zero vector by zero.
                norm = std::numeric_limits<float>::epsilon();
            }
            const float *src_row = points_in + i * dim;
            float *dst_row = points + i * dim;
            for (uint32_t j = 0; j < dim; j++)
            {
                dst_row[j] = src_row[j] / norm;
            }
        }
#pragma omp parallel for schedule(static, 4096)
        for (int64_t i = 0; i < (int64_t)nqueries; i++)
        {
            float norm = std::sqrt(queries_l2sq[i]);
            if (norm == 0)
            {
                norm = std::numeric_limits<float>::epsilon();
            }
            const float *src_row = queries_in + i * dim;
            float *dst_row = queries + i * dim;
            for (uint32_t j = 0; j < dim; j++)
            {
                dst_row[j] = src_row[j] / norm;
            }
        }
        // The squared norms must describe the normalised copies from here on.
        compute_l2sq(points_l2sq, points, npoints, dim);
        compute_l2sq(queries_l2sq, queries, nqueries, dim);
    }

    const char *metric_label = " L2 ";
    if (is_mips)
        metric_label = " MIPS ";
    else if (is_cosine)
        metric_label = " Cosine ";
    std::cout << "Going to compute " << k << " NNs for " << nqueries << " queries over " << npoints << " points in "
              << dim << " dimensions using" << metric_label << "distance fn. " << std::endl;

    // Queries are processed in batches so the distance matrix stays bounded.
    size_t q_batch_size = (1 << 9);
    float *dist_matrix = new float[(size_t)q_batch_size * (size_t)npoints];

    for (uint64_t b = 0; b < div_round_up(nqueries, q_batch_size); ++b)
    {
        int64_t q_b = b * q_batch_size;
        int64_t q_e = ((b + 1) * q_batch_size > nqueries) ? nqueries : (b + 1) * q_batch_size;
        float *batch_queries = queries + (ptrdiff_t)q_b * (ptrdiff_t)dim;

        if (is_mips)
        {
            inner_prod_to_points(dim, dist_matrix, npoints, points, q_e - q_b, batch_queries);
        }
        else
        {
            distsq_to_points(dim, dist_matrix, npoints, points, points_l2sq, q_e - q_b, batch_queries,
                             queries_l2sq + q_b);
        }
        std::cout << "Computed distances for queries: [" << q_b << "," << q_e << ")" << std::endl;

#pragma omp parallel for schedule(dynamic, 16)
        for (long long q = q_b; q < q_e; q++)
        {
            // Column of the distance matrix that belongs to query q.
            const float *dist_col = dist_matrix + (ptrdiff_t)(q - q_b) * (ptrdiff_t)npoints;
            size_t *ids_out = closest_points + (ptrdiff_t)q * (ptrdiff_t)k;
            float *dists_out = dist_closest_points + (ptrdiff_t)q * (ptrdiff_t)k;

            maxPQIFCS point_dist;
            for (size_t p = 0; p < k; p++)
                point_dist.emplace(p, dist_col[p]);
            for (size_t p = k; p < npoints; p++)
            {
                // Only candidates closer than the current top are inserted.
                if (point_dist.top().second > dist_col[p])
                    point_dist.emplace(p, dist_col[p]);
                if (point_dist.size() > k)
                    point_dist.pop();
            }
            // Drain the queue back to front; the assert below expects the
            // resulting distances to be in non-decreasing order.
            for (ptrdiff_t l = 0; l < (ptrdiff_t)k; ++l)
            {
                ids_out[k - 1 - l] = point_dist.top().first;
                dists_out[k - 1 - l] = point_dist.top().second;
                point_dist.pop();
            }
            assert(std::is_sorted(dists_out, dists_out + k));
        }
        std::cout << "Computed exact k-NN for queries: [" << q_b << "," << q_e << ")" << std::endl;
    }

    delete[] dist_matrix;
    delete[] points_l2sq;
    delete[] queries_l2sq;
    if (is_cosine)
    {
        // Only the cosine path owns normalised copies of the inputs.
        delete[] points;
        delete[] queries;
    }
}
// Reads the two-int header (#points, #dims) of a bin file and returns how many
// PARTSIZE-sized parts are needed to cover all points. Stream errors raise
// exceptions because failbit/badbit are enabled on the reader.
template <typename T> inline int get_num_parts(const char *filename)
{
    std::ifstream reader;
    reader.exceptions(std::ios::failbit | std::ios::badbit);
    reader.open(filename, std::ios::binary);
    std::cout << "Reading bin file " << filename << " ...\n";

    int npts_i32, ndims_i32;
    reader.read((char *)&npts_i32, sizeof(int));
    reader.read((char *)&ndims_i32, sizeof(int));
    std::cout << "#pts = " << npts_i32 << ", #dims = " << ndims_i32 << std::endl;
    reader.close();

    int num_parts;
    if ((npts_i32 % PARTSIZE) == 0)
    {
        // Points split evenly into parts.
        num_parts = npts_i32 / PARTSIZE;
    }
    else
    {
        // One extra, partially filled part holds the remainder.
        num_parts = (uint32_t)std::floor(npts_i32 / PARTSIZE) + 1;
    }
    std::cout << "Number of parts: " << num_parts << std::endl;
    return num_parts;
}
// Loads part `part_num` of a bin file whose elements have type T and converts
// it to floats.
//   filename  - bin file: int #points, int #dims, then the vectors
//   data      - receives a buffer from aligned_malloc holding the converted part
//   npts_u64  - receives the number of points in this part
//   ndims_u64 - receives the dimensionality read from the header
//   part_num  - zero-based part index; a part spans up to PARTSIZE points
// Stream errors raise exceptions because failbit/badbit are enabled.
template <typename T>
inline void load_bin_as_float(const char *filename, float *&data, size_t &npts_u64, size_t &ndims_u64, int part_num)
{
    std::ifstream reader;
    reader.exceptions(std::ios::failbit | std::ios::badbit);
    reader.open(filename, std::ios::binary);
    std::cout << "Reading bin file " << filename << " ...\n";

    int npts_i32, ndims_i32;
    reader.read((char *)&npts_i32, sizeof(int));
    reader.read((char *)&ndims_i32, sizeof(int));

    // Point id range [start_id, end_id) covered by this part.
    uint64_t start_id = part_num * PARTSIZE;
    uint64_t end_id = (std::min)(start_id + PARTSIZE, (uint64_t)npts_i32);
    npts_u64 = end_id - start_id;
    ndims_u64 = (uint64_t)ndims_i32;
    std::cout << "#pts in part = " << npts_u64 << ", #dims = " << ndims_u64
              << ", size = " << npts_u64 * ndims_u64 * sizeof(T) << "B" << std::endl;

    // Skip the header and all earlier parts, then read this part in one go.
    reader.seekg(start_id * ndims_u64 * sizeof(T) + 2 * sizeof(uint32_t), std::ios::beg);
    T *raw_values = new T[npts_u64 * ndims_u64];
    reader.read((char *)raw_values, sizeof(T) * npts_u64 * ndims_u64);
    std::cout << "Finished reading part of the bin file." << std::endl;
    reader.close();

    data = aligned_malloc<float>(npts_u64 * ndims_u64, ALIGNMENT);
#pragma omp parallel for schedule(dynamic, 32768)
    for (int64_t i = 0; i < (int64_t)npts_u64; i++)
    {
        const T *src_row = raw_values + i * ndims_u64;
        float *dst_row = data + i * ndims_u64;
        for (int64_t j = 0; j < (int64_t)ndims_u64; j++)
        {
            dst_row[j] = (float)src_row[j];
        }
    }
    delete[] raw_values;
    std::cout << "Finished converting part data to float." << std::endl;
}
// Loads part `part_num` of a bin file and keeps, converted to float, only the
// points whose label list contains filter_label or universal_label.
//   data          - receives a buffer from aligned_malloc sized for the whole
//                   part; only the first npoints_filt rows are filled
//   npts, ndims   - receive the part's point count and the dimensionality
//   label_file    - not read here; labels come from pts_to_labels
//   npoints_filt  - receives the number of points that matched
//   pts_to_labels - labels of every point, indexed by the point's id in the file
// Returns, for each kept row, the id of the point in the full file.
// Throws diskann::ANNException when the file cannot be opened.
template <typename T>
inline std::vector<size_t> load_filtered_bin_as_float(const char *filename, float *&data, size_t &npts, size_t &ndims,
                                                      int part_num, const char *label_file,
                                                      const std::string &filter_label,
                                                      const std::string &universal_label, size_t &npoints_filt,
                                                      std::vector<std::vector<std::string>> &pts_to_labels)
{
    std::ifstream reader(filename, std::ios::binary);
    if (reader.fail())
    {
        throw diskann::ANNException(std::string("Failed to open file ") + filename, -1);
    }
    std::cout << "Reading bin file " << filename << " ...\n";

    int npts_i32, ndims_i32;
    std::vector<size_t> rev_map;
    reader.read((char *)&npts_i32, sizeof(int));
    reader.read((char *)&ndims_i32, sizeof(int));

    // Point id range [start_id, end_id) covered by this part.
    uint64_t start_id = part_num * PARTSIZE;
    uint64_t end_id = (std::min)(start_id + PARTSIZE, (uint64_t)npts_i32);
    npts = end_id - start_id;
    ndims = (uint32_t)ndims_i32;
    uint64_t part_points = (uint64_t)npts;
    uint64_t vec_dims = (uint64_t)ndims;
    npoints_filt = 0;
    std::cout << "#pts in part = " << npts << ", #dims = " << ndims
              << ", size = " << part_points * vec_dims * sizeof(T) << "B" << std::endl;
    std::cout << "start and end ids: " << start_id << ", " << end_id << std::endl;

    // Skip the header and all earlier parts, then read this part in one go.
    reader.seekg(start_id * ndims * sizeof(T) + 2 * sizeof(uint32_t), std::ios::beg);
    T *raw_values = new T[part_points * vec_dims];
    reader.read((char *)raw_values, sizeof(T) * part_points * vec_dims);
    std::cout << "Finished reading part of the bin file." << std::endl;
    reader.close();

    data = aligned_malloc<float>(part_points * vec_dims, ALIGNMENT);
    for (int64_t i = 0; i < (int64_t)part_points; i++)
    {
        const std::vector<std::string> &point_labels = pts_to_labels[start_id + i];
        const bool keep_point =
            std::find(point_labels.begin(), point_labels.end(), filter_label) != point_labels.end() ||
            std::find(point_labels.begin(), point_labels.end(), universal_label) != point_labels.end();
        if (!keep_point)
        {
            continue;
        }

        rev_map.push_back(start_id + i);
        // Kept rows are packed contiguously at the front of `data`.
        const T *src_row = raw_values + i * vec_dims;
        float *dst_row = data + npoints_filt * vec_dims;
        for (int64_t j = 0; j < (int64_t)vec_dims; j++)
        {
            dst_row[j] = (float)src_row[j];
        }
        npoints_filt++;
    }
    delete[] raw_values;
    std::cout << "Finished converting part data to float.. identified " << npoints_filt
              << " points matching the filter." << std::endl;
    return rev_map;
}
// Writes `data` as a bin file: int #points, int #dims, then npts * ndims
// elements of type T. Stream errors raise exceptions because failbit/badbit
// are enabled on the writer.
template <typename T> inline void save_bin(const std::string filename, T *data, size_t npts, size_t ndims)
{
    const size_t payload_bytes = npts * ndims * sizeof(T);

    std::ofstream writer;
    writer.exceptions(std::ios::failbit | std::ios::badbit);
    writer.open(filename, std::ios::binary | std::ios::out);
    std::cout << "Writing bin: " << filename << "\n";

    // The header stores both counts as plain ints.
    int header_npts = (int)npts;
    int header_ndims = (int)ndims;
    writer.write((char *)&header_npts, sizeof(int));
    writer.write((char *)&header_ndims, sizeof(int));
    std::cout << "bin: #pts = " << npts << ", #dims = " << ndims << ", size = " << payload_bytes + 2 * sizeof(int)
              << "B" << std::endl;

    writer.write((char *)data, payload_bytes);
    writer.close();
    std::cout << "Finished writing bin" << std::endl;
}
// Writes a truthset file: int npts, int ndims, then npts * ndims ids followed
// by npts * ndims float distances.
inline void save_groundtruth_as_one_file(const std::string filename, int32_t *data, float *distances, size_t npts,
                                         size_t ndims)
{
    const size_t num_entries = npts * ndims;

    std::ofstream writer(filename, std::ios::binary | std::ios::out);
    int header_npts = (int)npts;
    int header_ndims = (int)ndims;
    writer.write((char *)&header_npts, sizeof(int));
    writer.write((char *)&header_ndims, sizeof(int));
    std::cout << "Saving truthset in one file (npts, dim, npts*dim id-matrix, "
                 "npts*dim dist-matrix) with npts = "
              << npts << ", dim = " << ndims << ", size = " << 2 * num_entries * sizeof(uint32_t) + 2 * sizeof(int)
              << "B" << std::endl;

    // Id matrix first, distance matrix second.
    writer.write((char *)data, num_entries * sizeof(uint32_t));
    writer.write((char *)distances, num_entries * sizeof(float));
    writer.close();
    std::cout << "Finished writing truthset" << std::endl;
}
// Parses a label file with one line per point. Only the text before the first
// tab on a line is used; it is split on ',' into labels, with '\n' and '\r'
// characters stripped from each label.
//   line_cnt      - receives the number of lines read by this call
//                   (previously this out-parameter was never assigned)
//   map_file      - path of the label file
//   pts_to_labels - one sorted label list is appended per line
// A file that cannot be opened yields zero lines.
inline void parse_label_file_into_vec(size_t &line_cnt, const std::string &map_file,
                                      std::vector<std::vector<std::string>> &pts_to_labels)
{
    std::ifstream infile(map_file);
    std::string line, token;
    std::set<std::string> labels; // distinct labels, only used for the summary below
    size_t lines_read = 0;

    while (std::getline(infile, line))
    {
        ++lines_read;
        std::istringstream iss(line);
        std::vector<std::string> lbls;
        // Keep only the first tab-separated field of the line.
        std::getline(iss, token, '\t');
        std::istringstream new_iss(token);
        while (std::getline(new_iss, token, ','))
        {
            token.erase(std::remove(token.begin(), token.end(), '\n'), token.end());
            token.erase(std::remove(token.begin(), token.end(), '\r'), token.end());
            lbls.push_back(token);
            labels.insert(token);
        }
        std::sort(lbls.begin(), lbls.end());
        pts_to_labels.push_back(lbls);
    }
    line_cnt = lines_read;

    std::cout << "Identified " << labels.size() << " distinct label(s), and populated labels for "
              << pts_to_labels.size() << " points" << std::endl;
}
// Computes, part by part, the nearest base points of every query.
//   base_file       - bin file with the base vectors (element type T)
//   nqueries        - number of query vectors in query_data
//   npoints, dim    - overwritten with the size of each loaded part
//   k               - neighbours requested per query
//   query_data      - query vectors already converted to float
//   metric          - distance function forwarded to exact_knn
//   location_to_tag - when non-empty, points whose entry is 0 are skipped
// Returns, for each query, the (point id in the full file, distance) pairs
// gathered from all parts; the caller merges and truncates them.
template <typename T>
std::vector<std::vector<std::pair<uint32_t, float>>> processUnfilteredParts(const std::string &base_file,
                                                                            size_t &nqueries, size_t &npoints,
                                                                            size_t &dim, size_t &k, float *query_data,
                                                                            const diskann::Metric &metric,
                                                                            std::vector<uint32_t> &location_to_tag)
{
    float *base_data = nullptr;
    int num_parts = get_num_parts<T>(base_file.c_str());
    std::vector<std::vector<std::pair<uint32_t, float>>> res(nqueries);

    for (int p = 0; p < num_parts; p++)
    {
        size_t start_id = p * PARTSIZE;
        load_bin_as_float<T>(base_file.c_str(), base_data, npoints, dim, p);

        // A part may hold fewer than k points; exact_knn then returns part_k
        // results per query, laid out with stride part_k (not k).
        const size_t part_k = k < npoints ? k : npoints;
        std::vector<size_t> closest_points_part(nqueries * part_k);
        std::vector<float> dist_closest_points_part(nqueries * part_k);
        exact_knn(dim, part_k, closest_points_part.data(), dist_closest_points_part.data(), npoints, base_data,
                  nqueries, query_data, metric);

        for (size_t i = 0; i < nqueries; i++)
        {
            for (uint64_t j = 0; j < part_k; j++)
            {
                // Fix: the tag check used stride k while the buffers are filled
                // with stride part_k, so it inspected the wrong neighbour
                // whenever part_k < k.
                const size_t slot = i * part_k + j;
                const size_t global_id = closest_points_part[slot] + start_id;
                if (!location_to_tag.empty() && location_to_tag[global_id] == 0)
                    continue;
                res[i].push_back(std::make_pair((uint32_t)global_id, dist_closest_points_part[slot]));
            }
        }

        diskann::aligned_free(base_data);
        base_data = nullptr;
    }
    return res;
}
// Computes, part by part, the nearest base points of every query among the
// points that carry filter_label or universal_label.
//   base_file       - bin file with the base vectors (element type T)
//   label_file      - label file with one line per base point
//   nqueries        - number of query vectors in query_data
//   npoints, dim    - overwritten while loading each part
//   k               - neighbours requested per query
//   query_data      - query vectors already converted to float
//   metric          - distance function forwarded to exact_knn
//   location_to_tag - when non-empty, points whose entry is 0 are skipped
// Returns, for each query, the (point id in the full file, distance) pairs
// gathered from all parts; the caller merges and truncates them.
template <typename T>
std::vector<std::vector<std::pair<uint32_t, float>>> processFilteredParts(
    const std::string &base_file, const std::string &label_file, const std::string &filter_label,
    const std::string &universal_label, size_t &nqueries, size_t &npoints, size_t &dim, size_t &k, float *query_data,
    const diskann::Metric &metric, std::vector<uint32_t> &location_to_tag)
{
    size_t npoints_filt = 0;
    float *base_data = nullptr;
    std::vector<std::vector<std::pair<uint32_t, float>>> res(nqueries);
    int num_parts = get_num_parts<T>(base_file.c_str());

    std::vector<std::vector<std::string>> pts_to_labels;
    if (filter_label != "")
        parse_label_file_into_vec(npoints, label_file, pts_to_labels);

    for (int p = 0; p < num_parts; p++)
    {
        // rev_map[r] is the id in the full file of row r of the filtered part.
        std::vector<size_t> rev_map;
        if (filter_label != "")
            rev_map = load_filtered_bin_as_float<T>(base_file.c_str(), base_data, npoints, dim, p, label_file.c_str(),
                                                    filter_label, universal_label, npoints_filt, pts_to_labels);

        // Fewer than k points may match; exact_knn then returns part_k results
        // per query, laid out with stride part_k (not k).
        const size_t part_k = k < npoints_filt ? k : npoints_filt;
        std::vector<size_t> closest_points_part(nqueries * part_k);
        std::vector<float> dist_closest_points_part(nqueries * part_k);
        if (npoints_filt > 0)
        {
            exact_knn(dim, part_k, closest_points_part.data(), dist_closest_points_part.data(), npoints_filt,
                      base_data, nqueries, query_data, metric);
        }

        for (size_t i = 0; i < nqueries; i++)
        {
            for (uint64_t j = 0; j < part_k; j++)
            {
                // Fix: the tag check used stride k and added start_id to an
                // index into the filtered rows. The id in the full file is
                // rev_map[row], which is also what gets reported below.
                const size_t slot = i * part_k + j;
                const size_t orig_id = rev_map[closest_points_part[slot]];
                if (!location_to_tag.empty() && location_to_tag[orig_id] == 0)
                    continue;
                res[i].push_back(std::make_pair((uint32_t)orig_id, dist_closest_points_part[slot]));
            }
        }

        diskann::aligned_free(base_data);
        // Never hand the same pointer to aligned_free twice.
        base_data = nullptr;
    }
    return res;
}
// Computes the ground truth for all queries and writes it to gt_file.
//   base_file       - bin file with the base vectors (element type T)
//   label_file      - label file, used only when filter_label is non-empty
//   query_file      - bin file with the query vectors (element type T)
//   gt_file         - output truthset file
//   k               - neighbours to store per query
//   universal_label - label that matches every filter
//   metric          - distance function; INNER_PRODUCT distances are negated
//                     again before they are written
//   filter_label    - empty for unfiltered ground truth
//   tags_file       - when non-empty, ids are translated through the tags
//                     returned by diskann::loadTags
// Returns 0. Slots for which fewer than k neighbours were found are left at 0
// (they used to contain uninitialized memory).
template <typename T>
int aux_main(const std::string &base_file, const std::string &label_file, const std::string &query_file,
             const std::string &gt_file, size_t k, const std::string &universal_label, const diskann::Metric &metric,
             const std::string &filter_label, const std::string &tags_file = std::string(""))
{
    size_t npoints = 0, nqueries = 0, dim = 0;
    float *query_data = nullptr;
    // Only part 0 of the query file is loaded, hence the warning below.
    load_bin_as_float<T>(query_file.c_str(), query_data, nqueries, dim, 0);
    if (nqueries > PARTSIZE)
        std::cerr << "WARNING: #Queries provided (" << nqueries << ") is greater than " << PARTSIZE
                  << ". Computing GT only for the first " << PARTSIZE << " queries." << std::endl;

    // load tags
    const bool tags_enabled = !tags_file.empty();
    std::vector<uint32_t> location_to_tag = diskann::loadTags(tags_file, base_file);

    // Vectors value-initialize their elements and release the memory even if
    // one of the calls below throws.
    std::vector<int32_t> closest_points(nqueries * k);
    std::vector<float> dist_closest_points(nqueries * k);

    std::vector<std::vector<std::pair<uint32_t, float>>> results;
    if (filter_label == "")
    {
        results = processUnfilteredParts<T>(base_file, nqueries, npoints, dim, k, query_data, metric, location_to_tag);
    }
    else
    {
        results = processFilteredParts<T>(base_file, label_file, filter_label, universal_label, nqueries, npoints, dim,
                                          k, query_data, metric, location_to_tag);
    }

    for (size_t i = 0; i < nqueries; i++)
    {
        // Merge the per-part candidates and keep the k best.
        std::vector<std::pair<uint32_t, float>> &cur_res = results[i];
        std::sort(cur_res.begin(), cur_res.end(), custom_dist);
        size_t j = 0;
        for (const auto &entry : cur_res)
        {
            if (j == k)
                break;
            const size_t slot = i * k + j;
            if (tags_enabled)
            {
                closest_points[slot] = (int32_t)location_to_tag[entry.first];
            }
            else
            {
                closest_points[slot] = (int32_t)entry.first;
            }
            // Inner-product distances were computed negated; undo that here.
            if (metric == diskann::Metric::INNER_PRODUCT)
                dist_closest_points[slot] = -entry.second;
            else
                dist_closest_points[slot] = entry.second;
            ++j;
        }
        if (j < k)
            std::cout << "WARNING: found less than k GT entries for query " << i << std::endl;
    }

    save_groundtruth_as_one_file(gt_file, closest_points.data(), dist_closest_points.data(), nqueries, k);
    diskann::aligned_free(query_data);
    return 0;
}
// Loads a truthset file: int npts, int dim, npts * dim ids, optionally
// followed by npts * dim float distances.
//   ids   - receives a new[]-allocated id array (caller frees with delete[])
//   dists - receives a new[]-allocated distance array only when the file size
//           shows that distances are present; otherwise it is left untouched
//   npts, dim - receive the header values
// Throws diskann::ANNException when the file size matches neither layout.
void load_truthset(const std::string &bin_file, uint32_t *&ids, float *&dists, size_t &npts, size_t &dim)
{
    size_t read_blk_size = 64 * 1024 * 1024;
    cached_ifstream reader(bin_file, read_blk_size);
    diskann::cout << "Reading truthset file " << bin_file.c_str() << " ..." << std::endl;
    size_t actual_file_size = reader.get_file_size();

    int npts_i32, dim_i32;
    reader.read((char *)&npts_i32, sizeof(int));
    reader.read((char *)&dim_i32, sizeof(int));
    npts = (uint32_t)npts_i32;
    dim = (uint32_t)dim_i32;

    diskann::cout << "Metadata: #pts = " << npts << ", #dims = " << dim << "... " << std::endl;

    // The layout is inferred purely from the file size.
    const size_t header_bytes = 2 * sizeof(uint32_t);
    size_t expected_file_size_with_dists = 2 * npts * dim * sizeof(uint32_t) + header_bytes;
    size_t expected_file_size_just_ids = npts * dim * sizeof(uint32_t) + header_bytes;
    const bool ids_only = (actual_file_size == expected_file_size_just_ids);
    const bool ids_and_dists = (actual_file_size == expected_file_size_with_dists);

    if (!ids_only && !ids_and_dists)
    {
        std::stringstream stream;
        stream << "Error. File size mismatch. File should have bin format, with "
                  "npts followed by ngt followed by npts*ngt ids and optionally "
                  "followed by npts*ngt distance values; actual size: "
               << actual_file_size << ", expected: " << expected_file_size_with_dists << " or "
               << expected_file_size_just_ids;
        diskann::cout << stream.str();
        throw diskann::ANNException(stream.str(), -1, __FUNCSIG__, __FILE__, __LINE__);
    }

    ids = new uint32_t[npts * dim];
    reader.read((char *)ids, npts * dim * sizeof(uint32_t));

    // When both sizes coincide the ids-only layout wins, as before.
    if (!ids_only)
    {
        dists = new float[npts * dim];
        reader.read((char *)dists, npts * dim * sizeof(float));
    }
}
// Command-line entry point of the ground-truth tool.
// Parses the options, then either
//   * runs aux_main once, when there is no filter label or a single filter
//     label shared by all queries, or
//   * splits base and query files per label, computes one ground truth per
//     label and merges them into a single truthset file, when the filter
//     label file supplies more than one label.
// Returns 0 on success and -1 on invalid arguments or failure.
int main(int argc, char **argv)
{
    std::string data_type, dist_fn, base_file, query_file, gt_file, tags_file, label_file, filter_label,
        universal_label, filter_label_file;
    uint64_t K;

    try
    {
        po::options_description desc{"Arguments"};

        desc.add_options()("help,h", "Print information on arguments");
        desc.add_options()("data_type", po::value<std::string>(&data_type)->required(), "data type <int8/uint8/float>");
        desc.add_options()("dist_fn", po::value<std::string>(&dist_fn)->required(), "distance function <l2/mips>");
        desc.add_options()("base_file", po::value<std::string>(&base_file)->required(),
                           "File containing the base vectors in binary format");
        desc.add_options()("query_file", po::value<std::string>(&query_file)->required(),
                           "File containing the query vectors in binary format");
        desc.add_options()("label_file", po::value<std::string>(&label_file)->default_value(""),
                           "Input labels file in txt format if present");
        desc.add_options()("filter_label", po::value<std::string>(&filter_label)->default_value(""),
                           "Input filter label if doing filtered groundtruth");
        desc.add_options()("universal_label", po::value<std::string>(&universal_label)->default_value(""),
                           "Universal label, if using it, only in conjunction with label_file");
        desc.add_options()("gt_file", po::value<std::string>(&gt_file)->required(),
                           "File name for the writing ground truth in binary "
                           "format, please don' append .bin at end if "
                           "no filter_label or filter_label_file is provided it "
                           "will save the file with '.bin' at end."
                           "else it will save the file as filename_label.bin");
        desc.add_options()("K", po::value<uint64_t>(&K)->required(),
                           "Number of ground truth nearest neighbors to compute");
        desc.add_options()("tags_file", po::value<std::string>(&tags_file)->default_value(std::string()),
                           "File containing the tags in binary format");
        desc.add_options()("filter_label_file",
                           po::value<std::string>(&filter_label_file)->default_value(std::string("")),
                           "Filter file for Queries for Filtered Search ");

        po::variables_map vm;
        po::store(po::parse_command_line(argc, argv, desc), vm);
        if (vm.count("help"))
        {
            std::cout << desc;
            return 0;
        }
        po::notify(vm);
    }
    catch (const std::exception &ex)
    {
        std::cerr << ex.what() << '\n';
        return -1;
    }

    if (data_type != std::string("float") && data_type != std::string("int8") && data_type != std::string("uint8"))
    {
        std::cout << "Unsupported type. float, int8 and uint8 types are supported." << std::endl;
        return -1;
    }

    if (filter_label != "" && filter_label_file != "")
    {
        std::cerr << "Only one of filter_label and query_filters_file should be provided" << std::endl;
        return -1;
    }

    diskann::Metric metric;
    if (dist_fn == std::string("l2"))
    {
        metric = diskann::Metric::L2;
    }
    else if (dist_fn == std::string("mips"))
    {
        metric = diskann::Metric::INNER_PRODUCT;
    }
    else if (dist_fn == std::string("cosine"))
    {
        metric = diskann::Metric::COSINE;
    }
    else
    {
        std::cerr << "Unsupported distance function. Use l2/mips/cosine." << std::endl;
        return -1;
    }

    // Runs aux_main for the element type selected on the command line.
    auto run_aux_main = [&](const std::string &base, const std::string &labels, const std::string &queries,
                            const std::string &gt, const std::string &universal, const std::string &filter,
                            const std::string &tags) {
        if (data_type == std::string("float"))
            aux_main<float>(base, labels, queries, gt, K, universal, metric, filter, tags);
        else if (data_type == std::string("int8"))
            aux_main<int8_t>(base, labels, queries, gt, K, universal, metric, filter, tags);
        else if (data_type == std::string("uint8"))
            aux_main<uint8_t>(base, labels, queries, gt, K, universal, metric, filter, tags);
    };

    std::vector<std::string> filter_labels;
    if (filter_label != "")
    {
        filter_labels.push_back(filter_label);
    }
    else if (filter_label_file != "")
    {
        filter_labels = read_file_to_vector_of_strings(filter_label_file, false);
    }

    // only when there is no filter label or 1 filter label for all queries
    // Fix: the test used to be size() == 1, so a run without any filter fell
    // into the per-query-label branch below even though aux_main handles an
    // empty filter label itself.
    if (filter_labels.size() <= 1)
    {
        const std::string single_filter = filter_labels.empty() ? std::string("") : filter_labels[0];
        try
        {
            run_aux_main(base_file, label_file, query_file, gt_file, universal_label, single_filter, tags_file);
        }
        catch (const std::exception &e)
        {
            std::cout << std::string(e.what()) << std::endl;
            diskann::cerr << "Compute GT failed." << std::endl;
            return -1;
        }
    }
    else
    { // Each query has its own filter label
        // Split up data and query bins into label specific ones
        tsl::robin_map<std::string, uint32_t> labels_to_number_of_points;
        tsl::robin_map<std::string, uint32_t> labels_to_number_of_queries;

        label_set all_labels;
        for (size_t i = 0; i < filter_labels.size(); i++)
        {
            std::string label = filter_labels[i];
            all_labels.insert(label);

            if (labels_to_number_of_queries.find(label) == labels_to_number_of_queries.end())
            {
                labels_to_number_of_queries[label] = 0;
            }
            labels_to_number_of_queries[label] += 1;
        }

        size_t npoints = 0;
        std::vector<std::vector<std::string>> point_to_labels;
        parse_label_file_into_vec(npoints, label_file, point_to_labels);
        std::vector<label_set> point_ids_to_labels(point_to_labels.size());
        std::vector<label_set> query_ids_to_labels(filter_labels.size());

        // Keep, per point, only the labels that some query actually asks for.
        for (size_t i = 0; i < point_to_labels.size(); i++)
        {
            for (size_t j = 0; j < point_to_labels[i].size(); j++)
            {
                std::string label = point_to_labels[i][j];
                if (all_labels.find(label) != all_labels.end())
                {
                    point_ids_to_labels[i].insert(point_to_labels[i][j]);
                    if (labels_to_number_of_points.find(label) == labels_to_number_of_points.end())
                    {
                        labels_to_number_of_points[label] = 0;
                    }
                    labels_to_number_of_points[label] += 1;
                }
            }
        }

        for (size_t i = 0; i < filter_labels.size(); i++)
        {
            query_ids_to_labels[i].insert(filter_labels[i]);
        }

        tsl::robin_map<std::string, std::vector<uint32_t>> label_id_to_orig_id;
        tsl::robin_map<std::string, std::vector<uint32_t>> label_query_id_to_orig_id;

        if (data_type == std::string("float"))
        {
            label_id_to_orig_id = diskann::generate_label_specific_vector_files_compat<float>(
                base_file, labels_to_number_of_points, point_ids_to_labels, all_labels);

            label_query_id_to_orig_id = diskann::generate_label_specific_vector_files_compat<float>(
                query_file, labels_to_number_of_queries, query_ids_to_labels,
                all_labels); // query_filters acts like query_ids_to_labels
        }
        else if (data_type == std::string("int8"))
        {
            label_id_to_orig_id = diskann::generate_label_specific_vector_files_compat<int8_t>(
                base_file, labels_to_number_of_points, point_ids_to_labels, all_labels);

            label_query_id_to_orig_id = diskann::generate_label_specific_vector_files_compat<int8_t>(
                query_file, labels_to_number_of_queries, query_ids_to_labels,
                all_labels); // query_filters acts like query_ids_to_labels
        }
        else if (data_type == std::string("uint8"))
        {
            label_id_to_orig_id = diskann::generate_label_specific_vector_files_compat<uint8_t>(
                base_file, labels_to_number_of_points, point_ids_to_labels, all_labels);

            label_query_id_to_orig_id = diskann::generate_label_specific_vector_files_compat<uint8_t>(
                query_file, labels_to_number_of_queries, query_ids_to_labels,
                all_labels); // query_filters acts like query_ids_to_labels
        }
        else
        {
            diskann::cerr << "Invalid data type" << std::endl;
            return -1;
        }

        // Generate label specific ground truths
        try
        {
            for (const auto &label : all_labels)
            {
                std::string filtered_base_file = base_file + "_" + label;
                std::string filtered_query_file = query_file + "_" + label;
                std::string filtered_gt_file = gt_file + "_" + label;
                // The per-label files are already filtered, so no labels,
                // filter or tags are passed on.
                run_aux_main(filtered_base_file, "", filtered_query_file, filtered_gt_file, "", "", "");
            }
        }
        catch (const std::exception &e)
        {
            std::cout << std::string(e.what()) << std::endl;
            diskann::cerr << "Compute GT failed." << std::endl;
            return -1;
        }

        // Combine the label specific ground truths to produce a single GT file
        uint32_t query_num = 0;
        for (const auto &lbl : all_labels)
        {
            query_num += labels_to_number_of_queries[lbl];
        }

        std::vector<std::vector<int32_t>> final_gt_ids(query_num, std::vector<int32_t>(K));
        std::vector<std::vector<float>> final_gt_dists(query_num, std::vector<float>(K));

        for (const auto &lbl : all_labels)
        {
            std::string filtered_gt_file = gt_file + "_" + lbl;
            uint32_t *gt_ids = nullptr;
            float *gt_dists = nullptr;
            size_t gt_num, gt_dim;
            load_truthset(filtered_gt_file, gt_ids, gt_dists, gt_num, gt_dim);

            for (uint32_t i = 0; i < labels_to_number_of_queries[lbl]; i++)
            {
                uint32_t orig_query_id = label_query_id_to_orig_id[lbl][i];
                for (uint64_t j = 0; j < K; j++)
                {
                    // Map the id within the label-specific file back to the
                    // id in the original base file.
                    final_gt_ids[orig_query_id][j] = label_id_to_orig_id[lbl][gt_ids[i * K + j]];
                    final_gt_dists[orig_query_id][j] = gt_dists[i * K + j];
                }
            }

            // load_truthset allocates with new[]; release before the next label.
            delete[] gt_ids;
            delete[] gt_dists;
        }

        std::vector<int32_t> closest_points(query_num * K);
        std::vector<float> dist_closest_points(query_num * K);
        for (uint32_t i = 0; i < query_num; i++)
        {
            for (uint32_t j = 0; j < K; j++)
            {
                closest_points[i * K + j] = final_gt_ids[i][j];
                dist_closest_points[i * K + j] = final_gt_dists[i][j];
            }
        }

        save_groundtruth_as_one_file(gt_file, closest_points.data(), dist_closest_points.data(), query_num, K);

        // cleanup artifacts
        std::cout << "Cleaning up artifacts..." << std::endl;
        tsl::robin_set<std::string> paths_to_clean{gt_file, base_file, query_file};
        clean_up_artifacts(paths_to_clean, all_labels);
    }
    return 0;
}

View File

@@ -0,0 +1,82 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
#include <cstring>
#include <iomanip>
#include <algorithm>
#include <numeric>
#include <omp.h>
#include <set>
#include <string.h>
#include <boost/program_options.hpp>
#ifndef _WINDOWS
#include <sys/mman.h>
#include <sys/stat.h>
#include <time.h>
#include <unistd.h>
#endif
#include "utils.h"
#include "index.h"
#include "memory_mapper.h"
namespace po = boost::program_options;
template <typename T> void bfs_count(const std::string &index_path, uint32_t data_dims)
{
using TagT = uint32_t;
using LabelT = uint32_t;
diskann::Index<T, TagT, LabelT> index(diskann::Metric::L2, data_dims, 0, nullptr, nullptr, 0, false, false, false,
false, 0, false);
std::cout << "Index class instantiated" << std::endl;
index.load(index_path.c_str(), 1, 100);
std::cout << "Index loaded" << std::endl;
index.count_nodes_at_bfs_levels();
}
// Command-line entry point: parses --data_type, --index_path_prefix and
// --data_dims, then runs bfs_count for the requested element type.
// Returns 0 on success and -1 on bad arguments, an unsupported data type or
// a failure while processing the index.
int main(int argc, char **argv)
{
    std::string data_type, index_path_prefix;
    uint32_t data_dims;

    po::options_description desc{"Arguments"};
    try
    {
        desc.add_options()("help,h", "Print information on arguments");
        desc.add_options()("data_type", po::value<std::string>(&data_type)->required(), "data type <int8/uint8/float>");
        desc.add_options()("index_path_prefix", po::value<std::string>(&index_path_prefix)->required(),
                           "Path prefix to the index");
        desc.add_options()("data_dims", po::value<uint32_t>(&data_dims)->required(), "Dimensionality of the data");

        po::variables_map vm;
        po::store(po::parse_command_line(argc, argv, desc), vm);
        if (vm.count("help"))
        {
            std::cout << desc;
            return 0;
        }
        po::notify(vm);
    }
    catch (const std::exception &ex)
    {
        std::cerr << ex.what() << '\n';
        return -1;
    }

    try
    {
        if (data_type == std::string("int8"))
            bfs_count<int8_t>(index_path_prefix, data_dims);
        else if (data_type == std::string("uint8"))
            bfs_count<uint8_t>(index_path_prefix, data_dims);
        else if (data_type == std::string("float"))
            bfs_count<float>(index_path_prefix, data_dims);
        else
        {
            // Previously an unknown type did nothing and still exited with 0.
            std::cerr << "Unsupported type. float, int8 and uint8 types are supported." << std::endl;
            return -1;
        }
    }
    catch (const std::exception &e)
    {
        std::cout << std::string(e.what()) << std::endl;
        diskann::cerr << "Index BFS failed." << std::endl;
        return -1;
    }
    return 0;
}

View File

@@ -0,0 +1,48 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
#include <cmath>
#include <cstring>
#include <fstream>
#include <iostream>
#include <limits>
#include <vector>
#include "utils.h"
#include "disk_utils.h"
#include "cached_io.h"
// Forwards argv[2] (data bin), argv[3] (vamana index file) and argv[4]
// (output file) to diskann::create_disk_layout<T>. Always returns 0.
template <typename T> int create_disk_layout(char **argv)
{
    std::string data_path = argv[2];
    std::string index_path = argv[3];
    std::string layout_path = argv[4];

    diskann::create_disk_layout<T>(data_path, index_path, layout_path);
    return 0;
}
int main(int argc, char **argv)
{
if (argc != 5)
{
std::cout << argv[0]
<< " data_type <float/int8/uint8> data_bin "
"vamana_index_file output_diskann_index_file"
<< std::endl;
exit(-1);
}
int ret_val = -1;
if (std::string(argv[1]) == std::string("float"))
ret_val = create_disk_layout<float>(argv);
else if (std::string(argv[1]) == std::string("int8"))
ret_val = create_disk_layout<int8_t>(argv);
else if (std::string(argv[1]) == std::string("uint8"))
ret_val = create_disk_layout<uint8_t>(argv);
else
{
std::cout << "unsupported type. use int8/uint8/float " << std::endl;
ret_val = -2;
}
return ret_val;
}

View File

@@ -0,0 +1,63 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
#include <iostream>
#include "utils.h"
// Reads npts * ndims floats from reader into read_buf, converts each value to
// int8 as (value - bias) * (254.0 / scale) with a plain cast, stores the
// results in write_buf and writes them to writer.
void block_convert(std::ofstream &writer, int8_t *write_buf, std::ifstream &reader, float *read_buf, size_t npts,
                   size_t ndims, float bias, float scale)
{
    const size_t value_count = npts * ndims;
    reader.read((char *)read_buf, value_count * sizeof(float));

    // Both buffers are dense row-major blocks, so one flat pass visits the
    // same elements as a points-by-dimensions double loop.
    for (size_t idx = 0; idx < value_count; ++idx)
    {
        write_buf[idx] = (int8_t)((read_buf[idx] - bias) * (254.0 / scale));
    }

    writer.write((char *)write_buf, value_count);
}
// Command-line entry point: converts a float bin file into an int8 bin file.
// Arguments: input_bin output_bin bias scale. Each value is converted by
// block_convert as (value - bias) * (254.0 / scale).
// Returns 0 on success and -1 when a file cannot be opened, the header cannot
// be read or scale is zero; a wrong argument count exits with -1.
int main(int argc, char **argv)
{
    if (argc != 5)
    {
        // The output is a binary file with the same header layout as the
        // input, so the usage text names it output_bin.
        std::cout << "Usage: " << argv[0] << " input_bin output_bin bias scale" << std::endl;
        exit(-1);
    }

    std::ifstream reader(argv[1], std::ios::binary);
    if (!reader)
    {
        std::cerr << "Error: could not open input file " << argv[1] << std::endl;
        return -1;
    }

    uint32_t npts_u32 = 0;
    uint32_t ndims_u32 = 0;
    reader.read((char *)&npts_u32, sizeof(uint32_t));
    reader.read((char *)&ndims_u32, sizeof(uint32_t));
    if (!reader)
    {
        // Without this check the buffer sizes below came from unset values.
        std::cerr << "Error: could not read the header of " << argv[1] << std::endl;
        return -1;
    }
    size_t npts = npts_u32;
    size_t ndims = ndims_u32;
    std::cout << "Dataset: #pts = " << npts << ", # dims = " << ndims << std::endl;

    float bias = (float)atof(argv[3]);
    float scale = (float)atof(argv[4]);
    if (scale == 0)
    {
        // The conversion divides by scale.
        std::cerr << "Error: scale must be non-zero" << std::endl;
        return -1;
    }

    std::ofstream writer(argv[2], std::ios::binary);
    if (!writer)
    {
        std::cerr << "Error: could not open output file " << argv[2] << std::endl;
        return -1;
    }

    // Convert in fixed-size blocks of points so memory use stays bounded.
    size_t blk_size = 131072;
    size_t nblks = ROUND_UP(npts, blk_size) / blk_size;

    auto read_buf = new float[blk_size * ndims];
    auto write_buf = new int8_t[blk_size * ndims];

    writer.write((char *)(&npts_u32), sizeof(uint32_t));
    writer.write((char *)(&ndims_u32), sizeof(uint32_t));

    for (size_t i = 0; i < nblks; i++)
    {
        // The last block may hold fewer than blk_size points.
        size_t cblk_size = std::min(npts - i * blk_size, blk_size);
        block_convert(writer, write_buf, reader, read_buf, cblk_size, ndims, bias, scale);
        std::cout << "Block #" << i << " written" << std::endl;
    }

    delete[] read_buf;
    delete[] write_buf;

    writer.close();
    reader.close();
    return 0;
}

View File

@@ -0,0 +1,95 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
#include <iostream>
#include "utils.h"
// Converts a block of npts float records. Each input record is one 4-byte
// field followed by ndims floats; the leading field is dropped and the ndims
// floats are packed back to back into write_buf, which is then written out.
void block_convert_float(std::ifstream &reader, std::ofstream &writer, float *read_buf, float *write_buf, size_t npts,
                         size_t ndims)
{
    const size_t payload_bytes = ndims * sizeof(float);
    // A record occupies ndims + 1 float-sized slots in read_buf.
    const size_t record_floats = ndims + 1;

    reader.read((char *)read_buf, npts * (payload_bytes + sizeof(uint32_t)));

    for (size_t rec = 0; rec < npts; ++rec)
    {
        const float *record = read_buf + rec * record_floats;
        // Skip the leading 4-byte field of the record.
        memcpy(write_buf + rec * ndims, record + 1, payload_bytes);
    }

    writer.write((char *)write_buf, npts * payload_bytes);
}
// Convert byte types.
// Reads `npts` records from `reader`, each consisting of a 4-byte header
// followed by `ndims` single-byte values, strips the header of every record,
// and writes the packed bytes to `writer`.
// `read_buf` must hold npts * (ndims + 4) bytes, `write_buf` npts * ndims.
void block_convert_byte(std::ifstream &reader, std::ofstream &writer, uint8_t *read_buf, uint8_t *write_buf,
                        size_t npts, size_t ndims)
{
    const size_t header_bytes = sizeof(uint32_t);
    const size_t record_bytes = ndims + header_bytes;

    reader.read((char *)read_buf, npts * (ndims * sizeof(uint8_t) + sizeof(uint32_t)));

    for (size_t rec = 0; rec < npts; ++rec)
    {
        const uint8_t *payload = read_buf + rec * record_bytes + header_bytes;
        memcpy(write_buf + rec * ndims, payload, ndims * sizeof(uint8_t));
    }

    writer.write((char *)write_buf, npts * ndims * sizeof(uint8_t));
}
// Converts a *vecs file (every record is a uint32 dimension followed by
// `ndims` values of the chosen element type) into a .bin file consisting of
// an (npts, ndims) int32 header followed by the packed values.
// Usage: <prog> <float/int8/uint8> input_vecs output_bin
// Returns 0 on success, -1 on bad arguments or I/O failure.
int main(int argc, char **argv)
{
    if (argc != 4)
    {
        std::cout << argv[0] << " <float/int8/uint8> input_vecs output_bin" << std::endl;
        exit(-1);
    }

    int datasize = sizeof(float);

    if (strcmp(argv[1], "uint8") == 0 || strcmp(argv[1], "int8") == 0)
    {
        datasize = sizeof(uint8_t);
    }
    else if (strcmp(argv[1], "float") != 0)
    {
        std::cout << "Error: type not supported. Use float/int8/uint8" << std::endl;
        exit(-1);
    }

    // Open at the end so tellg() yields the file size.
    std::ifstream reader(argv[2], std::ios::binary | std::ios::ate);
    if (!reader.is_open())
    {
        // Without this check tellg() returns -1 and the sizes below are garbage.
        std::cerr << "Error: could not open input file " << argv[2] << std::endl;
        return -1;
    }
    size_t fsize = reader.tellg();
    reader.seekg(0, std::ios::beg);

    // The first record's header gives the dimensionality for the whole file.
    uint32_t ndims_u32 = 0;
    reader.read((char *)&ndims_u32, sizeof(uint32_t));
    if (!reader)
    {
        std::cerr << "Error: could not read dimension header from " << argv[2] << std::endl;
        return -1;
    }
    reader.seekg(0, std::ios::beg);
    size_t ndims = (size_t)ndims_u32;
    size_t npts = fsize / ((ndims * datasize) + sizeof(uint32_t));
    std::cout << "Dataset: #pts = " << npts << ", # dims = " << ndims << std::endl;

    size_t blk_size = 131072;
    size_t nblks = ROUND_UP(npts, blk_size) / blk_size;
    std::cout << "# blks: " << nblks << std::endl;

    std::ofstream writer(argv[3], std::ios::binary);
    if (!writer.is_open())
    {
        std::cerr << "Error: could not open output file " << argv[3] << std::endl;
        return -1;
    }
    int32_t npts_s32 = (int32_t)npts;
    int32_t ndims_s32 = (int32_t)ndims;
    writer.write((char *)&npts_s32, sizeof(int32_t));
    writer.write((char *)&ndims_s32, sizeof(int32_t));

    // Scratch buffers only need to hold one block at a time.
    size_t chunknpts = std::min(npts, blk_size);
    uint8_t *read_buf = new uint8_t[chunknpts * ((ndims * datasize) + sizeof(uint32_t))];
    uint8_t *write_buf = new uint8_t[chunknpts * ndims * datasize];

    for (size_t i = 0; i < nblks; i++)
    {
        // The last block may be shorter than blk_size.
        size_t cblk_size = std::min(npts - i * blk_size, blk_size);
        if (datasize == sizeof(float))
        {
            block_convert_float(reader, writer, (float *)read_buf, (float *)write_buf, cblk_size, ndims);
        }
        else
        {
            block_convert_byte(reader, writer, read_buf, write_buf, cblk_size, ndims);
        }
        std::cout << "Block #" << i << " written" << std::endl;
    }

    delete[] read_buf;
    delete[] write_buf;

    reader.close();
    writer.close();
    return 0;
}

View File

@@ -0,0 +1,56 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
#include <iostream>
#include "utils.h"
// Converts one block of fvecs records to bvecs records.
// Each input record is a 4-byte header slot followed by `ndims` floats; each
// output record is the same 4 header bytes followed by `ndims` bytes, where
// every float has been cast to uint8_t.
// `read_buf` must hold npts * (ndims + 1) floats, `write_buf` npts * (ndims + 4) bytes.
void block_convert(std::ifstream &reader, std::ofstream &writer, float *read_buf, uint8_t *write_buf, size_t npts,
                   size_t ndims)
{
    const size_t in_stride = ndims + 1;  // floats per input record
    const size_t out_stride = ndims + 4; // bytes per output record

    reader.read((char *)read_buf, npts * (ndims * sizeof(float) + sizeof(uint32_t)));

    for (size_t rec = 0; rec < npts; ++rec)
    {
        const float *in_rec = read_buf + rec * in_stride;
        uint8_t *out_rec = write_buf + rec * out_stride;

        // Carry the header over unchanged.
        memcpy(out_rec, in_rec, sizeof(uint32_t));

        for (size_t d = 0; d < ndims; ++d)
        {
            out_rec[4 + d] = (uint8_t)in_rec[1 + d];
        }
    }

    writer.write((char *)write_buf, npts * (ndims * 1 + 4));
}
// Converts an fvecs file (records of a uint32 dimension followed by `ndims`
// floats) into a bvecs file (same header, values cast to uint8), processing
// the input in blocks of 131072 points.
// Usage: <prog> input_fvecs output_bvecs(uint8)
// Returns 0 on success, -1 on bad arguments or I/O failure.
int main(int argc, char **argv)
{
    if (argc != 3)
    {
        std::cout << argv[0] << " input_fvecs output_bvecs(uint8)" << std::endl;
        exit(-1);
    }

    // Open at the end so tellg() yields the file size.
    std::ifstream reader(argv[1], std::ios::binary | std::ios::ate);
    if (!reader.is_open())
    {
        // Without this check tellg() returns -1 and the sizes below are garbage.
        std::cerr << "Error: could not open input file " << argv[1] << std::endl;
        return -1;
    }
    size_t fsize = reader.tellg();
    reader.seekg(0, std::ios::beg);

    // The first record's header gives the dimensionality for the whole file.
    uint32_t ndims_u32 = 0;
    reader.read((char *)&ndims_u32, sizeof(uint32_t));
    if (!reader)
    {
        std::cerr << "Error: could not read dimension header from " << argv[1] << std::endl;
        return -1;
    }
    reader.seekg(0, std::ios::beg);
    size_t ndims = (size_t)ndims_u32;
    size_t npts = fsize / ((ndims + 1) * sizeof(float));
    std::cout << "Dataset: #pts = " << npts << ", # dims = " << ndims << std::endl;

    size_t blk_size = 131072;
    size_t nblks = ROUND_UP(npts, blk_size) / blk_size;
    std::cout << "# blks: " << nblks << std::endl;

    std::ofstream writer(argv[2], std::ios::binary);
    if (!writer.is_open())
    {
        std::cerr << "Error: could not open output file " << argv[2] << std::endl;
        return -1;
    }

    // Only one block is resident at a time, so the scratch buffers are sized
    // for a single block rather than for the whole dataset.
    size_t chunknpts = std::min(npts, blk_size);
    auto read_buf = new float[chunknpts * (ndims + 1)];
    auto write_buf = new uint8_t[chunknpts * (ndims + 4)];

    for (size_t i = 0; i < nblks; i++)
    {
        // The last block may be shorter than blk_size.
        size_t cblk_size = std::min(npts - i * blk_size, blk_size);
        block_convert(reader, writer, read_buf, write_buf, cblk_size, ndims);
        std::cout << "Block #" << i << " written" << std::endl;
    }

    delete[] read_buf;
    delete[] write_buf;

    reader.close();
    writer.close();
    return 0;
}

View File

@@ -0,0 +1,58 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
#include <omp.h>
#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstdio>
#include <ctime>
#include <iostream>
#include <iterator>
#include <map>
#include <sstream>
#include <string>
#include "partition.h"
#include "utils.h"
#include <fcntl.h>
#include <sys/stat.h>
#include <time.h>
#include <typeinfo>
// Runs gen_random_slice<T> with the command-line arguments:
// argv[2] = base file, argv[3] = output prefix, argv[4] = sampling rate.
// Always returns 0.
template <typename T> int aux_main(char **argv)
{
    std::string input_path = argv[2];
    std::string sample_prefix = argv[3];
    float rate = (float)(std::atof(argv[4]));

    gen_random_slice<T>(input_path, sample_prefix, rate);
    return 0;
}
int main(int argc, char **argv)
{
if (argc != 5)
{
std::cout << argv[0]
<< " data_type [float/int8/uint8] base_bin_file "
"sample_output_prefix sampling_probability"
<< std::endl;
exit(-1);
}
if (std::string(argv[1]) == std::string("float"))
{
aux_main<float>(argv);
}
else if (std::string(argv[1]) == std::string("int8"))
{
aux_main<int8_t>(argv);
}
else if (std::string(argv[1]) == std::string("uint8"))
{
aux_main<uint8_t>(argv);
}
else
std::cout << "Unsupported type. Use float/int8/uint8." << std::endl;
return 0;
}

View File

@@ -0,0 +1,70 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
#include "math_utils.h"
#include "pq.h"
#include "partition.h"
#define KMEANS_ITERS_FOR_PQ 15
// Trains PQ (or OPQ) pivots on a random sample of the data and then writes
// the compressed vectors for the whole dataset.
//   data_path         - input data file
//   index_prefix_path - outputs go to <prefix>_pq_pivots.bin and
//                       <prefix>_pq_compressed.bin
//   num_pq_centers    - number of centers passed to the pivot generators
//   num_pq_chunks     - number of PQ chunks
//   sampling_rate     - sampling rate passed to gen_random_slice
//   opq               - use generate_opq_pivots instead of generate_pq_pivots
// Returns true once all steps have been run.
template <typename T>
bool generate_pq(const std::string &data_path, const std::string &index_prefix_path, const size_t num_pq_centers,
                 const size_t num_pq_chunks, const float sampling_rate, const bool opq)
{
    std::string pq_pivots_path = index_prefix_path + "_pq_pivots.bin";
    std::string pq_compressed_vectors_path = index_prefix_path + "_pq_compressed.bin";

    // generates random sample and sets it to train_data and updates train_size
    size_t train_size = 0, train_dim = 0;
    float *train_data = nullptr;
    gen_random_slice<T>(data_path, sampling_rate, train_data, train_size, train_dim);
    std::cout << "For computing pivots, loaded sample data of size " << train_size << std::endl;

    if (opq)
    {
        diskann::generate_opq_pivots(train_data, train_size, (uint32_t)train_dim, (uint32_t)num_pq_centers,
                                     (uint32_t)num_pq_chunks, pq_pivots_path, true);
    }
    else
    {
        diskann::generate_pq_pivots(train_data, train_size, (uint32_t)train_dim, (uint32_t)num_pq_centers,
                                    (uint32_t)num_pq_chunks, KMEANS_ITERS_FOR_PQ, pq_pivots_path);
    }
    diskann::generate_pq_data_from_pivots<T>(data_path, (uint32_t)num_pq_centers, (uint32_t)num_pq_chunks,
                                             pq_pivots_path, pq_compressed_vectors_path, true);

    // train_data is released here with delete[]; NOTE(review): presumably
    // gen_random_slice allocates it with new[] - confirm against partition.h.
    delete[] train_data;

    // The function is declared bool, so report success as true.
    return true;
}
// Entry point for PQ generation.
// Usage: <prog> <data_type> <data_file> <PQ_prefix_path>
//               <target-bytes/data-point> <sampling_rate> <PQ(0)/OPQ(1)>
// Returns 0 on success and -1 on a usage error, a non-positive chunk count
// or an unsupported data type.
int main(int argc, char **argv)
{
    if (argc != 7)
    {
        std::cout << "Usage: \n"
                  << argv[0]
                  << "  <data_type[float/uint8/int8]>   <data_file[.bin]>"
                     "  <PQ_prefix_path>  <target-bytes/data-point>  "
                     "<sampling_rate> <PQ(0)/OPQ(1)>"
                  << std::endl;
        return -1;
    }

    const std::string data_type(argv[1]);
    const std::string data_path(argv[2]);
    const std::string index_prefix_path(argv[3]);
    const size_t num_pq_centers = 256;
    const int chunks_arg = atoi(argv[4]);
    if (chunks_arg <= 0)
    {
        // A negative value would wrap to a huge size_t; zero chunks is meaningless.
        std::cout << "Error. target-bytes/data-point must be a positive integer" << std::endl;
        return -1;
    }
    const size_t num_pq_chunks = (size_t)chunks_arg;
    const float sampling_rate = (float)atof(argv[5]);
    const bool opq = atoi(argv[6]) == 0 ? false : true;

    if (data_type == "float")
        generate_pq<float>(data_path, index_prefix_path, num_pq_centers, num_pq_chunks, sampling_rate, opq);
    else if (data_type == "int8")
        generate_pq<int8_t>(data_path, index_prefix_path, num_pq_centers, num_pq_chunks, sampling_rate, opq);
    else if (data_type == "uint8")
        generate_pq<uint8_t>(data_path, index_prefix_path, num_pq_centers, num_pq_chunks, sampling_rate, opq);
    else
    {
        std::cout << "Error. wrong file type" << std::endl;
        return -1;
    }
    return 0;
}

View File

@@ -0,0 +1,204 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
#include <iostream>
#include <random>
#include <boost/program_options.hpp>
#include <math.h>
#include <cmath>
#include "utils.h"
namespace po = boost::program_options;
// Writes synthetic label assignments with Zipf-like label frequencies.
// Label i (1-based) gets a budget of ceil(primary_label_freq / i) uses, where
// primary_label_freq = ceil(num_points * distribution_factor), and is offered
// to every point with probability distribution_factor / i until the budget
// runs out. Points that receive no label are written as 0.
class ZipfDistribution
{
  public:
    ZipfDistribution(uint64_t num_points, uint32_t num_labels)
        : num_labels(num_labels), num_points(num_points),
          uniform_zero_to_one(std::uniform_real_distribution<>(0.0, 1.0))
    {
    }

    // Returns a map from label (1..num_labels) to the number of times that
    // label may still be assigned.
    std::unordered_map<uint32_t, uint32_t> createDistributionMap()
    {
        std::unordered_map<uint32_t, uint32_t> map;
        uint32_t primary_label_freq = (uint32_t)ceil(num_points * distribution_factor);
        for (uint32_t i{1}; i < num_labels + 1; i++)
        {
            // Integer ceiling division. Applying ceil() to the integer
            // quotient would be a no-op because the division truncates first.
            map[i] = (uint32_t)(((uint64_t)primary_label_freq + i - 1) / i);
        }
        return map;
    }

    // Writes one line per point to `outfile`: a comma-separated list of the
    // labels drawn for that point, or 0 when none was drawn. Lines are
    // separated by '\n' with no trailing newline. Always returns 0.
    int writeDistribution(std::ofstream &outfile)
    {
        auto distribution_map = createDistributionMap();
        // 64-bit counter: num_points is uint64_t and may exceed the uint32_t range.
        for (uint64_t i{0}; i < num_points; i++)
        {
            bool label_written = false;
            for (auto &entry : distribution_map)
            {
                auto label_selection_probability = std::bernoulli_distribution(distribution_factor / (double)entry.first);
                // The draw happens before the budget check so that the random
                // sequence does not depend on which budgets are exhausted.
                if (label_selection_probability(rand_engine) && entry.second > 0)
                {
                    if (label_written)
                    {
                        outfile << ',';
                    }
                    outfile << entry.first;
                    label_written = true;
                    // consume one use of this label's budget
                    entry.second -= 1;
                }
            }
            if (!label_written)
            {
                outfile << 0;
            }
            if (i < num_points - 1)
            {
                outfile << '\n';
            }
        }
        return 0;
    }

    // Opens `filename` and writes the distribution to it.
    // Returns -1 when the file cannot be opened, otherwise the result of the
    // stream overload.
    int writeDistribution(std::string filename)
    {
        std::ofstream outfile(filename);
        if (!outfile.is_open())
        {
            std::cerr << "Error: could not open output file " << filename << '\n';
            return -1;
        }
        const int result = writeDistribution(outfile);
        outfile.close();
        return result;
    }

  private:
    const uint32_t num_labels;
    const uint64_t num_points;
    const double distribution_factor = 0.7;
    std::knuth_b rand_engine;
    const std::uniform_real_distribution<double> uniform_zero_to_one;
};
// Generates a synthetic label file with one line per point.
// Options: --output_file/-O, --num_points/-N, --num_labels/-L (at most 5000)
// and --distribution_type (random, zipf or one_per_point; default random).
// Returns 0 on success and -1 on invalid arguments or I/O failure.
int main(int argc, char **argv)
{
    std::string output_file, distribution_type;
    uint32_t num_labels;
    uint64_t num_points;

    try
    {
        po::options_description desc{"Arguments"};

        desc.add_options()("help,h", "Print information on arguments");
        desc.add_options()("output_file,O", po::value<std::string>(&output_file)->required(),
                           "Filename for saving the label file");
        desc.add_options()("num_points,N", po::value<uint64_t>(&num_points)->required(), "Number of points in dataset");
        desc.add_options()("num_labels,L", po::value<uint32_t>(&num_labels)->required(),
                           "Number of unique labels, up to 5000");
        desc.add_options()("distribution_type,DT", po::value<std::string>(&distribution_type)->default_value("random"),
                           "Distribution function for labels <random/zipf/one_per_point> defaults "
                           "to random");

        po::variables_map vm;
        po::store(po::parse_command_line(argc, argv, desc), vm);
        if (vm.count("help"))
        {
            std::cout << desc;
            return 0;
        }
        po::notify(vm);
    }
    catch (const std::exception &ex)
    {
        std::cerr << ex.what() << '\n';
        return -1;
    }

    if (num_labels > 5000)
    {
        std::cerr << "Error: num_labels must be 5000 or less" << '\n';
        return -1;
    }

    if (num_points <= 0)
    {
        std::cerr << "Error: num_points must be greater than 0" << '\n';
        return -1;
    }

    // Reject unknown distribution names up front; otherwise none of the
    // branches below would run and an empty file would be reported as success.
    if (distribution_type != "zipf" && distribution_type != "random" && distribution_type != "one_per_point")
    {
        std::cerr << "Error: unknown distribution_type '" << distribution_type
                  << "'. Use random, zipf or one_per_point" << '\n';
        return -1;
    }

    std::cout << "Generating synthetic labels for " << num_points << " points with " << num_labels << " unique labels"
              << '\n';

    try
    {
        std::ofstream outfile(output_file);
        if (!outfile.is_open())
        {
            std::cerr << "Error: could not open output file " << output_file << '\n';
            return -1;
        }

        if (distribution_type == "zipf")
        {
            ZipfDistribution zipf(num_points, num_labels);
            zipf.writeDistribution(outfile);
        }
        else if (distribution_type == "random")
        {
            for (size_t i = 0; i < num_points; i++)
            {
                bool label_written = false;
                for (size_t j = 1; j <= num_labels; j++)
                {
                    // 50% chance to assign each label
                    if (rand() > (RAND_MAX / 2))
                    {
                        if (label_written)
                        {
                            outfile << ',';
                        }
                        outfile << j;
                        label_written = true;
                    }
                }
                // A point without any label is written as 0.
                if (!label_written)
                {
                    outfile << 0;
                }
                // No trailing newline after the last point.
                if (i < num_points - 1)
                {
                    outfile << '\n';
                }
            }
        }
        else if (distribution_type == "one_per_point")
        {
            std::random_device rd;                                // obtain a random number from hardware
            std::mt19937 gen(rd());                               // seed the generator
            std::uniform_int_distribution<> distr(0, num_labels); // define the range

            for (size_t i = 0; i < num_points; i++)
            {
                outfile << distr(gen);
                if (i != num_points - 1)
                    outfile << '\n';
            }
        }
        if (outfile.is_open())
        {
            outfile.close();
        }

        std::cout << "Labels written to " << output_file << '\n';
    }
    catch (const std::exception &ex)
    {
        std::cerr << "Label generation failed: " << ex.what() << '\n';
        return -1;
    }

    return 0;
}

View File

@@ -0,0 +1,23 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
#include <iostream>
#include "utils.h"
// Loads an int8 .bin file, converts every value to float and saves the
// result as a float .bin file.
// Usage: <prog> input_int8_bin output_float_bin
int main(int argc, char **argv)
{
    const bool bad_usage = (argc != 3);
    if (bad_usage)
    {
        std::cout << argv[0] << " input_int8_bin output_float_bin" << std::endl;
        exit(-1);
    }

    size_t num_points, dim;
    int8_t *src;
    diskann::load_bin<int8_t>(argv[1], src, num_points, dim);

    float *dst = new float[num_points * dim];
    diskann::convert_types<int8_t, float>(src, dst, num_points, dim);
    diskann::save_bin<float>(argv[2], dst, num_points, dim);

    delete[] dst;
    delete[] src;
}

View File

@@ -0,0 +1,63 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
#include <iostream>
#include "utils.h"
// De-quantizes one block of int8 data to float.
// Reads npts * ndims bytes from `reader`, maps every value to
// ((float)value - bias) * scale, and writes the resulting floats to `writer`.
// `read_buf` and `write_buf` are caller-owned scratch buffers that must hold
// at least npts * ndims elements each.
void block_convert(std::ofstream &writer, float *write_buf, std::ifstream &reader, int8_t *read_buf, size_t npts,
                   size_t ndims, float bias, float scale)
{
    const size_t total = npts * ndims;
    reader.read((char *)read_buf, total * sizeof(int8_t));

    // Row-major data: a flat pass touches the same elements in the same order
    // as a nested (point, dimension) loop.
    for (size_t idx = 0; idx < total; ++idx)
    {
        const float shifted = (float)read_buf[idx] - bias;
        write_buf[idx] = (shifted * scale);
    }

    writer.write((char *)write_buf, total * sizeof(float));
}
// Converts an int8 .bin file into a float .bin file.
// Both files start with two uint32 values (npts, ndims). The payload is
// converted in blocks of 131072 points via block_convert using the given
// bias and scale.
// Usage: <prog> input-int8.bin output-float.bin bias scale
// Returns 0 on success, -1 on I/O failure.
int main(int argc, char **argv)
{
    if (argc != 5)
    {
        std::cout << "Usage: " << argv[0] << "  input-int8.bin  output-float.bin  bias  scale" << std::endl;
        exit(-1);
    }

    std::ifstream reader(argv[1], std::ios::binary);
    if (!reader.is_open())
    {
        std::cerr << "Error: could not open input file " << argv[1] << std::endl;
        return -1;
    }

    uint32_t npts_u32 = 0;
    uint32_t ndims_u32 = 0;
    reader.read((char *)&npts_u32, sizeof(uint32_t));
    reader.read((char *)&ndims_u32, sizeof(uint32_t));
    if (!reader)
    {
        // Without this check the sizes below would come from unread values.
        std::cerr << "Error: could not read header from " << argv[1] << std::endl;
        return -1;
    }
    size_t npts = npts_u32;
    size_t ndims = ndims_u32;
    std::cout << "Dataset: #pts = " << npts << ", # dims = " << ndims << std::endl;

    std::ofstream writer(argv[2], std::ios::binary);
    if (!writer.is_open())
    {
        std::cerr << "Error: could not open output file " << argv[2] << std::endl;
        return -1;
    }

    size_t blk_size = 131072;
    size_t nblks = ROUND_UP(npts, blk_size) / blk_size;

    // Buffers are allocated only after all validation so that the early
    // returns above cannot leak them.
    auto read_buf = new int8_t[blk_size * ndims];
    auto write_buf = new float[blk_size * ndims];
    float bias = (float)atof(argv[3]);
    float scale = (float)atof(argv[4]);

    writer.write((char *)(&npts_u32), sizeof(uint32_t));
    writer.write((char *)(&ndims_u32), sizeof(uint32_t));

    for (size_t i = 0; i < nblks; i++)
    {
        // The last block may be shorter than blk_size.
        size_t cblk_size = std::min(npts - i * blk_size, blk_size);
        block_convert(writer, write_buf, reader, read_buf, cblk_size, ndims, bias, scale);
        std::cout << "Block #" << i << " written" << std::endl;
    }

    delete[] read_buf;
    delete[] write_buf;

    writer.close();
    reader.close();
    return 0;
}

View File

@@ -0,0 +1,58 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
#include <iostream>
#include "utils.h"
// Strips the per-record header from one block of ivecs data.
// Each input record is one uint32 header followed by `ndims` uint32 values;
// the output is the `ndims` values of every record, tightly packed.
// `read_buf` must hold npts * (ndims + 1) values, `write_buf` npts * ndims.
void block_convert(std::ifstream &reader, std::ofstream &writer, uint32_t *read_buf, uint32_t *write_buf, size_t npts,
                   size_t ndims)
{
    const size_t record_len = ndims + 1; // header + payload, counted in uint32 values
    const size_t payload_bytes = ndims * sizeof(uint32_t);

    reader.read((char *)read_buf, npts * (ndims * sizeof(uint32_t) + sizeof(uint32_t)));

    for (size_t rec = 0; rec < npts; ++rec)
    {
        const uint32_t *payload = read_buf + rec * record_len + 1; // skip the header
        memcpy(write_buf + rec * ndims, payload, payload_bytes);
    }

    writer.write((char *)write_buf, npts * ndims * sizeof(uint32_t));
}
// Converts an ivecs file (records of a uint32 dimension followed by `ndims`
// uint32 values) into a .bin file consisting of an (npts, ndims) int header
// followed by the packed values, processing the input in blocks of 131072
// points.
// Usage: <prog> input_ivecs output_bin
// Returns 0 on success, -1 on I/O failure.
int main(int argc, char **argv)
{
    if (argc != 3)
    {
        std::cout << argv[0] << " input_ivecs output_bin" << std::endl;
        exit(-1);
    }

    // Open at the end so tellg() yields the file size.
    std::ifstream reader(argv[1], std::ios::binary | std::ios::ate);
    if (!reader.is_open())
    {
        // Without this check tellg() returns -1 and the sizes below are garbage.
        std::cerr << "Error: could not open input file " << argv[1] << std::endl;
        return -1;
    }
    size_t fsize = reader.tellg();
    reader.seekg(0, std::ios::beg);

    // The first record's header gives the dimensionality for the whole file.
    uint32_t ndims_u32 = 0;
    reader.read((char *)&ndims_u32, sizeof(uint32_t));
    if (!reader)
    {
        std::cerr << "Error: could not read dimension header from " << argv[1] << std::endl;
        return -1;
    }
    reader.seekg(0, std::ios::beg);
    size_t ndims = (size_t)ndims_u32;
    size_t npts = fsize / ((ndims + 1) * sizeof(uint32_t));
    std::cout << "Dataset: #pts = " << npts << ", # dims = " << ndims << std::endl;

    size_t blk_size = 131072;
    size_t nblks = ROUND_UP(npts, blk_size) / blk_size;
    std::cout << "# blks: " << nblks << std::endl;

    std::ofstream writer(argv[2], std::ios::binary);
    if (!writer.is_open())
    {
        std::cerr << "Error: could not open output file " << argv[2] << std::endl;
        return -1;
    }
    int npts_s32 = (int)npts;
    int ndims_s32 = (int)ndims;
    writer.write((char *)&npts_s32, sizeof(int));
    writer.write((char *)&ndims_s32, sizeof(int));

    // Only one block is resident at a time, so the scratch buffers are sized
    // for a single block rather than for the whole dataset.
    size_t chunknpts = std::min(npts, blk_size);
    uint32_t *read_buf = new uint32_t[chunknpts * (ndims + 1)];
    uint32_t *write_buf = new uint32_t[chunknpts * ndims];

    for (size_t i = 0; i < nblks; i++)
    {
        // The last block may be shorter than blk_size.
        size_t cblk_size = std::min(npts - i * blk_size, blk_size);
        block_convert(reader, writer, read_buf, write_buf, cblk_size, ndims);
        std::cout << "Block #" << i << " written" << std::endl;
    }

    delete[] read_buf;
    delete[] write_buf;

    reader.close();
    writer.close();
    return 0;
}

View File

@@ -0,0 +1,42 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
#include <algorithm>
#include <atomic>
#include <cassert>
#include <fstream>
#include <iostream>
#include <set>
#include <string>
#include <vector>
#include "disk_utils.h"
#include "cached_io.h"
#include "utils.h"
// Command-line wrapper around diskann::merge_shards.
// Arguments (see the usage string): index prefix/suffix, id-map
// prefix/suffix, number of shards, max degree, output index path and output
// medoids path. Returns whatever merge_shards returns.
int main(int argc, char **argv)
{
    if (argc != 9)
    {
        std::cout << argv[0]
                  << " vamana_index_prefix[1] vamana_index_suffix[2] "
                     "idmaps_prefix[3] "
                     "idmaps_suffix[4] n_shards[5] max_degree[6] "
                     "output_vamana_path[7] "
                     "output_medoids_path[8]"
                  << std::endl;
        exit(-1);
    }

    std::string vamana_prefix(argv[1]);
    std::string vamana_suffix(argv[2]);
    std::string idmaps_prefix(argv[3]);
    std::string idmaps_suffix(argv[4]);
    uint64_t nshards = (uint64_t)std::atoi(argv[5]);
    // Cast to the variable's own type instead of widening to 64 bits and
    // then implicitly narrowing.
    uint32_t max_degree = (uint32_t)std::atoi(argv[6]);
    std::string output_index(argv[7]);
    std::string output_medoids(argv[8]);

    return diskann::merge_shards(vamana_prefix, vamana_suffix, idmaps_prefix, idmaps_suffix, nshards, max_degree,
                                 output_index, output_medoids);
}

View File

@@ -0,0 +1,39 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
#include <index.h>
#include <math_utils.h>
#include "cached_io.h"
#include "partition.h"
// DEPRECATED: NEED TO REPROGRAM
// Command-line wrapper around partition<T>.
// Usage: <prog> datatype <data_path> <prefix_path> <sampling_rate>
//               <num_partitions> <k_index>
// Returns 0 after partitioning and -1 when the data type is not supported,
// so that calling scripts can detect the failure.
int main(int argc, char **argv)
{
    if (argc != 7)
    {
        std::cout << "Usage:\n"
                  << argv[0]
                  << "  datatype<int8/uint8/float>  <data_path>"
                     "  <prefix_path>  <sampling_rate>  "
                     "  <num_partitions>  <k_index>"
                  << std::endl;
        exit(-1);
    }

    const std::string data_type(argv[1]);
    const std::string data_path(argv[2]);
    const std::string prefix_path(argv[3]);
    const float sampling_rate = (float)atof(argv[4]);
    const size_t num_partitions = (size_t)std::atoi(argv[5]);
    const size_t max_reps = 15;
    const size_t k_index = (size_t)std::atoi(argv[6]);

    if (data_type == "float")
        partition<float>(data_path, sampling_rate, num_partitions, max_reps, prefix_path, k_index);
    else if (data_type == "int8")
        partition<int8_t>(data_path, sampling_rate, num_partitions, max_reps, prefix_path, k_index);
    else if (data_type == "uint8")
        partition<uint8_t>(data_path, sampling_rate, num_partitions, max_reps, prefix_path, k_index);
    else
    {
        std::cout << "unsupported data format. use float/int8/uint8" << std::endl;
        return -1;
    }
    return 0;
}

View File

@@ -0,0 +1,39 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
#include <index.h>
#include <math_utils.h>
#include "cached_io.h"
#include "partition.h"
// DEPRECATED: NEED TO REPROGRAM
// Command-line wrapper around partition_with_ram_budget<T>.
// Usage: <prog> datatype <data_path> <prefix_path> <sampling_rate>
//               <ram_budget(GB)> <graph_degree> <k_index>
// Returns 0 after partitioning and -1 when the data type is not supported,
// so that calling scripts can detect the failure.
int main(int argc, char **argv)
{
    if (argc != 8)
    {
        std::cout << "Usage:\n"
                  << argv[0]
                  << "  datatype<int8/uint8/float>  <data_path>"
                     "  <prefix_path>  <sampling_rate>  "
                     "  <ram_budget(GB)> <graph_degree>  <k_index>"
                  << std::endl;
        exit(-1);
    }

    const std::string data_type(argv[1]);
    const std::string data_path(argv[2]);
    const std::string prefix_path(argv[3]);
    const float sampling_rate = (float)atof(argv[4]);
    const double ram_budget = (double)std::atof(argv[5]);
    const size_t graph_degree = (size_t)std::atoi(argv[6]);
    const size_t k_index = (size_t)std::atoi(argv[7]);

    if (data_type == "float")
        partition_with_ram_budget<float>(data_path, sampling_rate, ram_budget, graph_degree, prefix_path, k_index);
    else if (data_type == "int8")
        partition_with_ram_budget<int8_t>(data_path, sampling_rate, ram_budget, graph_degree, prefix_path, k_index);
    else if (data_type == "uint8")
        partition_with_ram_budget<uint8_t>(data_path, sampling_rate, ram_budget, graph_degree, prefix_path, k_index);
    else
    {
        std::cout << "unsupported data format. use float/int8/uint8" << std::endl;
        return -1;
    }
    return 0;
}

View File

@@ -0,0 +1,237 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
#include <iostream>
#include <cstdlib>
#include <random>
#include <cmath>
#include <boost/program_options.hpp>
#include "utils.h"
namespace po = boost::program_options;
// Writes `npts` random float vectors of dimension `ndims` to `writer`.
// Every coordinate is drawn from a standard normal distribution. When
// rand_scale > 1 each vector is multiplied by a factor drawn uniformly from
// [1, rand_scale); when `normalization` is set each vector is rescaled to
// have L2 norm `norm`. Always returns 0.
int block_write_float(std::ofstream &writer, size_t ndims, size_t npts, bool normalization, float norm,
                      float rand_scale)
{
    std::random_device rd{};
    std::mt19937 gen{rd()};
    std::normal_distribution<> normal_rand{0, 1};
    std::uniform_real_distribution<> unif_dis(1.0, rand_scale);

    auto point = new float[ndims];

    for (size_t pt = 0; pt < npts; ++pt)
    {
        // The scale factor is drawn before the coordinates of the point.
        const float factor = (rand_scale > 1.0f) ? (float)unif_dis(gen) : 1.0f;

        for (size_t d = 0; d < ndims; ++d)
        {
            point[d] = factor * (float)normal_rand(gen);
        }

        if (normalization)
        {
            float sq_norm = 0;
            for (size_t d = 0; d < ndims; ++d)
            {
                sq_norm += point[d] * point[d];
            }
            const float length = std::sqrt(sq_norm);
            for (size_t d = 0; d < ndims; ++d)
            {
                point[d] = point[d] * norm / length;
            }
        }

        writer.write((char *)point, ndims * sizeof(float));
    }

    delete[] point;
    return 0;
}
// Writes `npts` random int8 vectors of dimension `ndims` to `writer`.
// Every vector is drawn from a standard normal distribution, rescaled to L2
// norm `norm`, and rounded coordinate-wise to int8_t. Always returns 0.
int block_write_int8(std::ofstream &writer, size_t ndims, size_t npts, float norm)
{
    std::random_device rd{};
    std::mt19937 gen{rd()};
    std::normal_distribution<> normal_rand{0, 1};

    auto real_vec = new float[ndims];
    auto quantized = new int8_t[ndims];

    for (size_t pt = 0; pt < npts; ++pt)
    {
        // Draw the coordinates and accumulate the squared length in one pass.
        float sq_norm = 0;
        for (size_t d = 0; d < ndims; ++d)
        {
            real_vec[d] = (float)normal_rand(gen);
            sq_norm += real_vec[d] * real_vec[d];
        }

        const float length = std::sqrt(sq_norm);
        for (size_t d = 0; d < ndims; ++d)
        {
            real_vec[d] = real_vec[d] * norm / length;
            quantized[d] = (int8_t)std::round(real_vec[d]);
        }

        writer.write((char *)quantized, ndims * sizeof(int8_t));
    }

    delete[] real_vec;
    delete[] quantized;
    return 0;
}
// Writes `npts` random uint8 vectors of dimension `ndims` to `writer`.
// Every vector is drawn from a standard normal distribution, rescaled to L2
// norm `norm`, rounded coordinate-wise and shifted by +128 so that the
// values are centred on 128. Always returns 0.
int block_write_uint8(std::ofstream &writer, size_t ndims, size_t npts, float norm)
{
    auto vec = new float[ndims];
    // The output is unsigned, so use an unsigned buffer; storing 128 + x in
    // an int8_t relies on implementation-defined narrowing.
    auto vec_T = new uint8_t[ndims];

    std::random_device rd{};
    std::mt19937 gen{rd()};
    std::normal_distribution<> normal_rand{0, 1};

    for (size_t i = 0; i < npts; i++)
    {
        float sum = 0;
        for (size_t d = 0; d < ndims; ++d)
            vec[d] = (float)normal_rand(gen);
        for (size_t d = 0; d < ndims; ++d)
            sum += vec[d] * vec[d];
        for (size_t d = 0; d < ndims; ++d)
            vec[d] = vec[d] * norm / std::sqrt(sum);

        for (size_t d = 0; d < ndims; ++d)
        {
            // Round to the signed range first, then shift into the unsigned range.
            vec_T[d] = (uint8_t)(128 + (int8_t)std::round(vec[d]));
        }

        writer.write((char *)vec_T, ndims * sizeof(uint8_t));
    }

    delete[] vec;
    delete[] vec_T;
    return 0;
}
// Generates a .bin file of random vectors: a (npts, ndims) uint32 header
// followed by the vectors, written in blocks of 131072 points by the
// block_write_* helper matching --data_type.
// Returns 0 on success and -1 on invalid arguments or a write failure.
int main(int argc, char **argv)
{
    std::string data_type, output_file;
    size_t ndims, npts;
    float norm, rand_scaling;
    bool normalization = false;
    try
    {
        po::options_description desc{"Arguments"};

        desc.add_options()("help,h", "Print information on arguments");

        desc.add_options()("data_type", po::value<std::string>(&data_type)->required(), "data type <int8/uint8/float>");
        desc.add_options()("output_file", po::value<std::string>(&output_file)->required(),
                           "File name for saving the random vectors");
        desc.add_options()("ndims,D", po::value<uint64_t>(&ndims)->required(), "Dimensoinality of the vector");
        desc.add_options()("npts,N", po::value<uint64_t>(&npts)->required(), "Number of vectors");
        desc.add_options()("norm", po::value<float>(&norm)->default_value(-1.0f),
                           "Norm of the vectors (if not specified, vectors are not normalized)");
        desc.add_options()("rand_scaling", po::value<float>(&rand_scaling)->default_value(1.0f),
                           "Each vector will be scaled (if not explicitly normalized) by a factor randomly chosen from "
                           "[1, rand_scale]. Only applicable for floating point data");
        po::variables_map vm;
        po::store(po::parse_command_line(argc, argv, desc), vm);
        if (vm.count("help"))
        {
            std::cout << desc;
            return 0;
        }
        po::notify(vm);
    }
    catch (const std::exception &ex)
    {
        std::cerr << ex.what() << '\n';
        return -1;
    }

    if (data_type != std::string("float") && data_type != std::string("int8") && data_type != std::string("uint8"))
    {
        std::cout << "Unsupported type. float, int8 and uint8 types are supported." << std::endl;
        return -1;
    }

    // A positive norm switches normalization on.
    if (norm > 0.0)
    {
        normalization = true;
    }

    if (rand_scaling < 1.0)
    {
        std::cout << "We will only scale the vector norms randomly in [1, value], so value must be >= 1." << std::endl;
        return -1;
    }

    if ((rand_scaling > 1.0) && (normalization == true))
    {
        std::cout << "Data cannot be normalized and randomly scaled at same time. Use one or the other." << std::endl;
        return -1;
    }

    if (data_type == std::string("int8") || data_type == std::string("uint8"))
    {
        if (norm > 127)
        {
            std::cerr << "Error: for int8/uint8 datatypes, L2 norm can not be "
                         "greater "
                         "than 127"
                      << std::endl;
            return -1;
        }
        if (rand_scaling > 1.0)
        {
            std::cout << "Data scaling only supported for floating point data." << std::endl;
            return -1;
        }
    }

    try
    {
        std::ofstream writer;
        // Make open/write failures throw so they are reported by the catch below.
        writer.exceptions(std::ofstream::failbit | std::ofstream::badbit);
        writer.open(output_file, std::ios::binary);
        auto npts_u32 = (uint32_t)npts;
        auto ndims_u32 = (uint32_t)ndims;
        writer.write((char *)&npts_u32, sizeof(uint32_t));
        writer.write((char *)&ndims_u32, sizeof(uint32_t));

        size_t blk_size = 131072;
        size_t nblks = ROUND_UP(npts, blk_size) / blk_size;
        std::cout << "# blks: " << nblks << std::endl;

        int ret = 0;
        for (size_t i = 0; i < nblks; i++)
        {
            // The last block may be shorter than blk_size.
            size_t cblk_size = std::min(npts - i * blk_size, blk_size);
            if (data_type == std::string("float"))
            {
                ret = block_write_float(writer, ndims, cblk_size, normalization, norm, rand_scaling);
            }
            else if (data_type == std::string("int8"))
            {
                ret = block_write_int8(writer, ndims, cblk_size, norm);
            }
            else if (data_type == std::string("uint8"))
            {
                ret = block_write_uint8(writer, ndims, cblk_size, norm);
            }
            if (ret == 0)
                std::cout << "Block #" << i << " written" << std::endl;
            else
            {
                writer.close();
                std::cout << "failed to write" << std::endl;
                return -1;
            }
        }
        writer.close();
    }
    catch (const std::exception &e)
    {
        std::cout << std::string(e.what()) << std::endl;
        // This tool generates data; it does not build an index.
        diskann::cerr << "Random data generation failed." << std::endl;
        return -1;
    }

    return 0;
}

View File

@@ -0,0 +1,85 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
#include <iostream>
#include <cstdlib>
#include <random>
#include <cmath>
// Computes the aggregate recall for one simulated assignment.
//   k_aggr  - number of results being aggregated (the denominator)
//   k       - maximum number of results taken from a single partition
//   npart   - number of partitions, i.e. entries in `count`
//   count   - count[i] is the number of results that fell into partition i
//   recalls - recalls[j - 1] is the recall when j results are requested
// For each partition, min(count[i], k) results can be found, weighted by the
// recall at that depth. Partitions with no results contribute nothing and
// are skipped so that recalls is never indexed at -1.
inline float aggregate_recall(const uint32_t k_aggr, const uint32_t k, const uint32_t npart, uint32_t *count,
                              const std::vector<float> &recalls)
{
    float found = 0;
    for (uint32_t i = 0; i < npart; ++i)
    {
        size_t max_found = std::min(count[i], k);
        if (max_found == 0)
        {
            continue;
        }
        found += recalls[max_found - 1] * max_found;
    }
    return found / (float)k_aggr;
}
void simulate(const uint32_t k_aggr, const uint32_t k, const uint32_t npart, const uint32_t nsim,
const std::vector<float> &recalls)
{
std::random_device r;
std::default_random_engine randeng(r());
std::uniform_int_distribution<int> uniform_dist(0, npart - 1);
uint32_t *count = new uint32_t[npart];
double aggr_recall = 0;
for (uint32_t i = 0; i < nsim; ++i)
{
for (uint32_t p = 0; p < npart; ++p)
{
count[p] = 0;
}
for (uint32_t t = 0; t < k_aggr; ++t)
{
count[uniform_dist(randeng)]++;
}
aggr_recall += aggregate_recall(k_aggr, k, npart, count, recalls);
}
std::cout << "Aggregate recall is " << aggr_recall / (double)nsim << std::endl;
delete[] count;
}
// Simulates the aggregate recall obtained when k_aggregate results are
// spread over npart partitions that each return up to k_out results.
// Usage: <prog> k_aggregate k_out npart nsim recall@1 recall@2 ... recall@k
// Exits with -1 on invalid arguments.
int main(int argc, char **argv)
{
    if (argc < 6)
    {
        std::cout << argv[0] << " k_aggregate k_out npart nsim recall@1 recall@2 ... recall@k" << std::endl;
        exit(-1);
    }

    const uint32_t k_aggr = atoi(argv[1]);
    const uint32_t k = atoi(argv[2]);
    const uint32_t npart = atoi(argv[3]);
    const uint32_t nsim = atoi(argv[4]);

    std::vector<float> recalls;
    for (int ctr = 5; ctr < argc; ctr++)
    {
        recalls.push_back((float)atof(argv[ctr]));
    }

    if (recalls.size() != k)
    {
        std::cerr << "Please input k numbers for recall@1, recall@2 .. recall@k" << std::endl;
        // The simulation indexes recalls up to k - 1, so a mismatch must not proceed.
        exit(-1);
    }
    if (k_aggr > npart * k)
    {
        std::cerr << "k_aggr must be <= k * npart" << std::endl;
        exit(-1);
    }
    if (nsim <= npart * k_aggr)
    {
        std::cerr << "Choose nsim > npart*k_aggr" << std::endl;
        exit(-1);
    }

    simulate(k_aggr, k, npart, nsim, recalls);

    return 0;
}

View File

@@ -0,0 +1,147 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
#include <iostream>
#include <fstream>
#include <string>
#include <sstream>
#include <cstdint>
#include <vector>
#include <unordered_map>
#include <omp.h>
#include <string.h>
#include <atomic>
#include <cstring>
#include <iomanip>
#include <set>
#include <boost/program_options.hpp>
#include "utils.h"
#ifndef _WINDOWS
#include <sys/mman.h>
#include <unistd.h>
#include <sys/stat.h>
#include <time.h>
#else
#include <Windows.h>
#endif
namespace po = boost::program_options;
// Prints statistics about a labels file with one point per line and
// comma-separated labels on each line: the fraction of points with at least
// `density` labels, label-count percentiles, the most common labels, and
// averages. `univeral_label` is accepted but not used by the analysis.
// Prints an error and returns early when the file cannot be opened or
// contains no labels, since the statistics below need at least one label.
void stats_analysis(const std::string labels_file, std::string univeral_label, uint32_t density = 10)
{
    (void)univeral_label;

    std::string token, line;
    std::ifstream labels_stream(labels_file);
    if (!labels_stream.is_open())
    {
        std::cerr << "Error: could not open labels file " << labels_file << std::endl;
        return;
    }

    std::unordered_map<std::string, uint32_t> label_counts;
    std::string label_with_max_points;
    uint32_t max_points = 0;
    long long sum = 0;
    long long point_cnt = 0;
    float avg_labels_per_pt, mean_label_size;

    std::vector<uint32_t> labels_per_point;
    uint32_t dense_pts = 0;

    while (getline(labels_stream, line))
    {
        point_cnt++;
        std::stringstream iss(line);
        uint32_t lbl_cnt = 0;
        while (getline(iss, token, ','))
        {
            lbl_cnt++;
            // Strip line-ending characters left over from the input line.
            token.erase(std::remove(token.begin(), token.end(), '\n'), token.end());
            token.erase(std::remove(token.begin(), token.end(), '\r'), token.end());
            // operator[] value-initializes a missing count to 0.
            label_counts[token]++;
        }
        if (lbl_cnt >= density)
        {
            dense_pts++;
        }
        labels_per_point.emplace_back(lbl_cnt);
    }

    if (label_counts.empty())
    {
        std::cerr << "Error: no labels found in " << labels_file << std::endl;
        return;
    }

    std::cout << "fraction of dense points with >= " << density
              << " labels = " << (float)dense_pts / (float)labels_per_point.size() << std::endl;
    std::sort(labels_per_point.begin(), labels_per_point.end());

    std::vector<std::pair<std::string, uint32_t>> label_count_vec;

    for (auto it = label_counts.begin(); it != label_counts.end(); it++)
    {
        auto &lbl = *it;
        label_count_vec.emplace_back(std::make_pair(lbl.first, lbl.second));
        if (lbl.second > max_points)
        {
            max_points = lbl.second;
            label_with_max_points = lbl.first;
        }
        sum += lbl.second;
    }

    // Ascending by count, so the most common labels are at the back.
    sort(label_count_vec.begin(), label_count_vec.end(),
         [](const std::pair<std::string, uint32_t> &lhs, const std::pair<std::string, uint32_t> &rhs) {
             return lhs.second < rhs.second;
         });

    for (float p = 0; p < 1; p += 0.05)
    {
        std::cout << "Percentile " << (100 * p) << "\t" << label_count_vec[(size_t)(p * label_count_vec.size())].first
                  << " with count=" << label_count_vec[(size_t)(p * label_count_vec.size())].second << std::endl;
    }

    std::cout << "Most common label "
              << "\t" << label_count_vec[label_count_vec.size() - 1].first
              << " with count=" << label_count_vec[label_count_vec.size() - 1].second << std::endl;
    if (label_count_vec.size() > 1)
        std::cout << "Second common label "
                  << "\t" << label_count_vec[label_count_vec.size() - 2].first
                  << " with count=" << label_count_vec[label_count_vec.size() - 2].second << std::endl;
    if (label_count_vec.size() > 2)
        std::cout << "Third common label "
                  << "\t" << label_count_vec[label_count_vec.size() - 3].first
                  << " with count=" << label_count_vec[label_count_vec.size() - 3].second << std::endl;
    avg_labels_per_pt = sum / (float)point_cnt;
    mean_label_size = sum / (float)label_counts.size();
    std::cout << "Total number of points = " << point_cnt << ", number of labels = " << label_counts.size()
              << std::endl;
    std::cout << "Average number of labels per point = " << avg_labels_per_pt << std::endl;
    std::cout << "Mean label size excluding 0 = " << mean_label_size << std::endl;
    std::cout << "Most popular label is " << label_with_max_points << " with " << max_points << " pts" << std::endl;
}
int main(int argc, char **argv)
{
std::string labels_file, universal_label;
uint32_t density;
po::options_description desc{"Arguments"};
try
{
desc.add_options()("help,h", "Print information on arguments");
desc.add_options()("labels_file", po::value<std::string>(&labels_file)->required(),
"path to labels data file.");
desc.add_options()("universal_label", po::value<std::string>(&universal_label)->required(),
"Universal label used in labels file.");
desc.add_options()("density", po::value<uint32_t>(&density)->default_value(1),
"Number of labels each point in labels file, defaults to 1");
po::variables_map vm;
po::store(po::parse_command_line(argc, argv, desc), vm);
if (vm.count("help"))
{
std::cout << desc;
return 0;
}
po::notify(vm);
}
catch (const std::exception &e)
{
std::cerr << e.what() << '\n';
return -1;
}
stats_analysis(labels_file, universal_label, density);
}

View File

@@ -0,0 +1,121 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
#include <algorithm>
#include <cerrno>
#include <cstdint>
#include <cstdlib>
#include <exception>
#include <fstream>
#include <iostream>
#include <limits>
#include <stdexcept>
#include <string>
#include <vector>
#include "utils.h"
// Reads `npts * ndims` whitespace-separated values from `reader` as floats and
// writes them to `writer` as one contiguous run of raw float bytes.
//
// reader: text input positioned at the first value of the block.
// writer: binary output; the block is appended at its current position.
// npts:   number of points in this block.
// ndims:  number of values per point.
//
// Throws std::runtime_error if any value cannot be extracted (input truncated,
// malformed, or the stream is already in a failed state). Nothing is written
// to `writer` in that case.
void block_convert_float(std::ifstream &reader, std::ofstream &writer, size_t npts, size_t ndims)
{
    // std::vector releases the buffer on every exit path, including the throw
    // below, and holds exactly the npts * ndims values that get written.
    std::vector<float> values(npts * ndims);

    for (size_t idx = 0; idx < values.size(); ++idx)
    {
        // A failed extraction would otherwise leave stale or indeterminate
        // data in the output, so fail loudly instead.
        if (!(reader >> values[idx]))
        {
            throw std::runtime_error("block_convert_float: could not read value " + std::to_string(idx) + " of " +
                                     std::to_string(values.size()) + " in the current block");
        }
    }

    writer.write(reinterpret_cast<const char *>(values.data()),
                 static_cast<std::streamsize>(values.size() * sizeof(float)));
}
// Reads `npts * ndims` whitespace-separated integers from `reader`, narrows
// each one to int8_t, and writes them to `writer` as raw bytes.
//
// reader: text input positioned at the first value of the block.
// writer: binary output; the block is appended at its current position.
// npts:   number of points in this block.
// ndims:  number of values per point.
//
// Values are parsed as `int` first so that tokens such as "-12" are read as
// numbers rather than as single characters. The narrowing cast is unchecked:
// values outside the int8_t range are not rejected.
//
// Throws std::runtime_error if any value cannot be extracted (input truncated,
// malformed, or the stream is already in a failed state). Nothing is written
// to `writer` in that case.
void block_convert_int8(std::ifstream &reader, std::ofstream &writer, size_t npts, size_t ndims)
{
    // std::vector releases the buffer on every exit path and holds exactly the
    // npts * ndims values that get written.
    std::vector<int8_t> values(npts * ndims);

    for (size_t idx = 0; idx < values.size(); ++idx)
    {
        int parsed = 0;
        // A failed extraction would otherwise leave stale or indeterminate
        // data in the output, so fail loudly instead.
        if (!(reader >> parsed))
        {
            throw std::runtime_error("block_convert_int8: could not read value " + std::to_string(idx) + " of " +
                                     std::to_string(values.size()) + " in the current block");
        }
        values[idx] = (int8_t)parsed;
    }

    writer.write(reinterpret_cast<const char *>(values.data()),
                 static_cast<std::streamsize>(values.size() * sizeof(int8_t)));
}
// Reads `npts * ndims` whitespace-separated integers from `reader`, narrows
// each one to uint8_t, and writes them to `writer` as raw bytes.
//
// reader: text input positioned at the first value of the block.
// writer: binary output; the block is appended at its current position.
// npts:   number of points in this block.
// ndims:  number of values per point.
//
// Values are parsed as `int` first so that tokens such as "200" are read as
// numbers rather than as single characters. The narrowing cast is unchecked:
// values outside the uint8_t range are not rejected.
//
// Throws std::runtime_error if any value cannot be extracted (input truncated,
// malformed, or the stream is already in a failed state). Nothing is written
// to `writer` in that case.
void block_convert_uint8(std::ifstream &reader, std::ofstream &writer, size_t npts, size_t ndims)
{
    // std::vector releases the buffer on every exit path and holds exactly the
    // npts * ndims values that get written.
    std::vector<uint8_t> values(npts * ndims);

    for (size_t idx = 0; idx < values.size(); ++idx)
    {
        int parsed = 0;
        // A failed extraction would otherwise leave stale or indeterminate
        // data in the output, so fail loudly instead.
        if (!(reader >> parsed))
        {
            throw std::runtime_error("block_convert_uint8: could not read value " + std::to_string(idx) + " of " +
                                     std::to_string(values.size()) + " in the current block");
        }
        values[idx] = (uint8_t)parsed;
    }

    writer.write(reinterpret_cast<const char *>(values.data()),
                 static_cast<std::streamsize>(values.size() * sizeof(uint8_t)));
}
// Converts a text file of whitespace/tab-separated vector values into a binary
// file laid out as: uint32 point count, uint32 dimension, then the values
// point by point in the requested element type.
//
// Usage: <prog> <float/int8/uint8> input_filename.tsv output_filename.bin dim num_pts
//
// Returns 0 on success and -1 on a usage error, an unsupported type, invalid
// counts, a file that cannot be opened, or a read/write failure.
int main(int argc, char **argv)
{
    if (argc != 6)
    {
        std::cout << argv[0] << " <float/int8/uint8> input_filename.tsv output_filename.bin dim num_pts"
                  << std::endl;
        return -1;
    }

    // Resolve the converter once instead of re-comparing strings per block.
    const std::string data_type(argv[1]);
    void (*convert_block)(std::ifstream &, std::ofstream &, size_t, size_t) = nullptr;
    if (data_type == "float")
    {
        convert_block = block_convert_float;
    }
    else if (data_type == "int8")
    {
        convert_block = block_convert_int8;
    }
    else if (data_type == "uint8")
    {
        convert_block = block_convert_uint8;
    }
    else
    {
        // Stop here: continuing would emit a header-only output file.
        std::cout << "Unsupported type. float, int8 and uint8 types are supported." << std::endl;
        return -1;
    }

    // Both counts are stored as uint32 in the header, so reject anything that
    // is not a plain non-negative integer fitting in 32 bits.
    const auto parse_count = [](const char *text, size_t &value) {
        errno = 0;
        char *end = nullptr;
        const long long parsed = std::strtoll(text, &end, 10);
        if (end == text || *end != '\0' || errno == ERANGE || parsed < 0 ||
            static_cast<unsigned long long>(parsed) > std::numeric_limits<uint32_t>::max())
        {
            return false;
        }
        value = static_cast<size_t>(parsed);
        return true;
    };

    size_t ndims = 0;
    size_t npts = 0;
    if (!parse_count(argv[4], ndims) || !parse_count(argv[5], npts))
    {
        std::cerr << "dim and num_pts must be non-negative integers that fit in 32 bits." << std::endl;
        return -1;
    }

    std::ifstream reader(argv[2], std::ios::binary);
    if (!reader.is_open())
    {
        std::cerr << "Could not open input file " << argv[2] << std::endl;
        return -1;
    }

    // Points are converted in fixed-size blocks to bound the buffer size.
    const size_t blk_size = 131072;
    const size_t nblks = (npts + blk_size - 1) / blk_size;
    std::cout << "# blks: " << nblks << std::endl;

    std::ofstream writer(argv[3], std::ios::binary);
    if (!writer.is_open())
    {
        std::cerr << "Could not open output file " << argv[3] << std::endl;
        return -1;
    }

    const auto npts_u32 = static_cast<uint32_t>(npts);
    const auto ndims_u32 = static_cast<uint32_t>(ndims);
    writer.write(reinterpret_cast<const char *>(&npts_u32), sizeof(uint32_t));
    writer.write(reinterpret_cast<const char *>(&ndims_u32), sizeof(uint32_t));

    try
    {
        for (size_t i = 0; i < nblks; i++)
        {
            const size_t cblk_size = std::min(npts - i * blk_size, blk_size);
            convert_block(reader, writer, cblk_size, ndims);

            // failbit is set when a value could not be extracted, i.e. the
            // input held fewer (or malformed) values than dim * num_pts.
            if (reader.fail())
            {
                std::cerr << "Failed to read block #" << i << ": input is truncated or malformed." << std::endl;
                return -1;
            }
            std::cout << "Block #" << i << " written" << std::endl;
        }
    }
    catch (const std::exception &e)
    {
        std::cerr << "Conversion failed: " << e.what() << std::endl;
        return -1;
    }

    reader.close();
    writer.close();
    if (writer.fail())
    {
        std::cerr << "Failed to write output file " << argv[3] << std::endl;
        return -1;
    }
    return 0;
}

View File

@@ -0,0 +1,23 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
#include <iostream>
#include "utils.h"
// Loads a .bin file of uint32 values, converts them to uint8 with
// diskann::convert_types, and saves the converted data to the output path.
// NOTE(review): convert_types presumably performs an element-wise cast, so
// values above 255 would not survive the conversion — confirm against utils.h.
//
// Usage: <prog> input_uint32_bin output_uint8_bin
int main(int argc, char **argv)
{
    if (argc != 3)
    {
        // The output element type is uint8_t (see the save_bin<uint8_t> call
        // below), so the usage text names it as such.
        std::cout << argv[0] << " input_uint32_bin output_uint8_bin" << std::endl;
        exit(-1);
    }

    // Initialized so the variables hold defined values even if load_bin exits
    // early without assigning them.
    uint32_t *input = nullptr;
    size_t npts = 0;
    size_t nd = 0;
    diskann::load_bin<uint32_t>(argv[1], input, npts, nd);

    uint8_t *output = new uint8_t[npts * nd];
    diskann::convert_types<uint32_t, uint8_t>(input, output, npts, nd);
    diskann::save_bin<uint8_t>(argv[2], output, npts, nd);

    delete[] output;
    delete[] input;
    return 0;
}

View File

@@ -0,0 +1,23 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
#include <iostream>
#include "utils.h"
// Loads a .bin file of uint8 values, converts them to float with
// diskann::convert_types, and saves the converted data to the output path.
//
// Usage: <prog> input_uint8_bin output_float_bin
int main(int argc, char **argv)
{
    const bool bad_usage = (argc != 3);
    if (bad_usage)
    {
        std::cout << argv[0] << " input_uint8_bin output_float_bin" << std::endl;
        exit(-1);
    }

    const char *in_path = argv[1];
    const char *out_path = argv[2];

    // load_bin fills in the buffer pointer and both extents.
    size_t num_points, dim;
    uint8_t *src;
    diskann::load_bin<uint8_t>(in_path, src, num_points, dim);

    const size_t total_values = num_points * dim;
    float *dst = new float[total_values];
    diskann::convert_types<uint8_t, float>(src, dst, num_points, dim);
    diskann::save_bin<float>(out_path, dst, num_points, dim);

    delete[] dst;
    delete[] src;
    return 0;
}

View File

@@ -0,0 +1,163 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
#include <omp.h>
#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstdio>
#include <ctime>
#include <iostream>
#include <iterator>
#include <map>
#include <sstream>
#include <string>
#include <fcntl.h>
#include <sys/stat.h>
#include <time.h>
#include <typeinfo>
#include "partition.h"
#include "utils.h"
// Loads the vectors stored in `base_file`, computes the L2 norm of every
// point, and prints the norm found at each 5th percentile (0..95) followed by
// the largest norm as "percentile 100".
//
// T:         element type of the stored vectors.
// base_file: path handed to diskann::load_bin.
//
// Returns 0 after printing the percentiles, or -1 when the file holds no
// points (there is nothing to index in that case).
template <typename T> int analyze_norm(std::string base_file)
{
    std::cout << "Analyzing data norms" << std::endl;
    T *data = nullptr;
    size_t npts = 0;
    size_t ndims = 0;
    diskann::load_bin<T>(base_file, data, npts, ndims);

    // With zero points both the percentile lookups and norms[npts - 1] would
    // index past the end of an empty vector.
    if (npts == 0)
    {
        std::cout << "No points found in " << base_file << ", nothing to analyze." << std::endl;
        delete[] data;
        return -1;
    }

    std::vector<float> norms(npts, 0);
#pragma omp parallel for schedule(dynamic)
    for (int64_t i = 0; i < (int64_t)npts; i++)
    {
        // Each iteration touches only its own point and its own norms slot.
        const T *point = data + i * ndims;
        float sq_norm = 0;
        for (size_t d = 0; d < ndims; d++)
            sq_norm += point[d] * point[d];
        norms[i] = std::sqrt(sq_norm);
    }

    std::sort(norms.begin(), norms.end());
    for (int p = 0; p < 100; p += 5)
        std::cout << "percentile " << p << ": " << norms[(uint64_t)(std::floor((p / 100.0) * npts))] << std::endl;
    std::cout << "percentile 100"
              << ": " << norms[npts - 1] << std::endl;

    delete[] data;
    return 0;
}
// Loads the vectors stored in `base_file`, divides every coordinate of each
// point by that point's L2 norm, and saves the result to `out_file` with the
// same number of points and dimensions.
//
// T:         element type of the stored vectors.
// base_file: path handed to diskann::load_bin.
// out_file:  path handed to diskann::save_bin.
//
// Points whose norm is exactly zero are written back unchanged, because
// dividing by zero would produce NaN for float and an undefined
// float-to-integer conversion for the integer element types.
// NOTE(review): for integer T the quotient is cast back to T, which presumably
// truncates most coordinates to 0 — confirm that non-float types are intended.
//
// Always returns 0.
template <typename T> int normalize_base(std::string base_file, std::string out_file)
{
    std::cout << "Normalizing base" << std::endl;
    T *data;
    size_t npts, ndims;
    diskann::load_bin<T>(base_file, data, npts, ndims);

#pragma omp parallel for schedule(dynamic)
    for (int64_t i = 0; i < (int64_t)npts; i++)
    {
        // Each iteration reads and rewrites only its own point.
        T *point = data + i * ndims;

        float pt_norm = 0;
        for (size_t d = 0; d < ndims; d++)
            pt_norm += point[d] * point[d];
        pt_norm = std::sqrt(pt_norm);

        // An all-zero point has no direction to normalize; leave it as is.
        if (pt_norm == 0)
            continue;

        for (size_t d = 0; d < ndims; d++)
            point[d] = static_cast<T>(point[d] / pt_norm);
    }

    diskann::save_bin<T>(out_file, data, npts, ndims);
    delete[] data;
    return 0;
}
// Loads the vectors stored in `base_file`, appends one extra coordinate to
// every point, and saves the (ndims + 1)-dimensional result to `out_file`.
//
// T:         element type of the stored vectors.
// base_file: path handed to diskann::load_bin.
// out_file:  path handed to diskann::save_bin.
// prep_base: selects the transform.
//   true  - every coordinate is divided by the largest norm in the file and
//           the extra coordinate is sqrt(1 - |x|^2 / max_norm^2), clamped at 0.
//   false - every point is divided by its own norm and the extra coordinate
//           is 0.
// NOTE(review): a zero max norm (prep_base) or a zero-norm point (!prep_base)
// still divides by zero here — confirm whether such inputs can occur.
//
// Always returns 0.
template <typename T> int augment_base(std::string base_file, std::string out_file, bool prep_base = true)
{
    std::cout << "Analyzing data norms" << std::endl;
    T *data;
    size_t npts, ndims;
    diskann::load_bin<T>(base_file, data, npts, ndims);

    // norms[i] holds the SQUARED norm of point i.
    std::vector<float> norms(npts, 0);
#pragma omp parallel for schedule(dynamic)
    for (int64_t i = 0; i < (int64_t)npts; i++)
    {
        for (size_t d = 0; d < ndims; d++)
            norms[i] += data[i * ndims + d] * data[i * ndims + d];
    }

    // The maximum is taken in a separate serial pass: updating one shared
    // variable from inside the parallel loop is an unsynchronized
    // read-modify-write and can lose the true maximum.
    float max_norm = 0;
    for (const float sq_norm : norms)
    {
        if (sq_norm > max_norm)
            max_norm = sq_norm;
    }
    max_norm = std::sqrt(max_norm);
    std::cout << "Max norm: " << max_norm << std::endl;

    const size_t newdims = ndims + 1;
    T *new_data = new T[npts * newdims];
    for (size_t i = 0; i < npts; i++)
    {
        if (prep_base)
        {
            for (size_t j = 0; j < ndims; j++)
            {
                new_data[i * newdims + j] = static_cast<T>(data[i * ndims + j] / max_norm);
            }
            // Extra coordinate; clamped so rounding error cannot feed a
            // negative value to sqrt.
            float diff = 1 - (norms[i] / (max_norm * max_norm));
            diff = diff <= 0 ? 0 : std::sqrt(diff);
            new_data[i * newdims + ndims] = static_cast<T>(diff);
            if (diff <= 0)
            {
                std::cout << i << " has large max norm, investigate if needed. diff = " << diff << std::endl;
            }
        }
        else
        {
            for (size_t j = 0; j < ndims; j++)
            {
                new_data[i * newdims + j] = static_cast<T>(data[i * ndims + j] / std::sqrt(norms[i]));
            }
            new_data[i * newdims + ndims] = 0;
        }
    }

    diskann::save_bin<T>(out_file, new_data, npts, newdims);
    delete[] new_data;
    delete[] data;
    return 0;
}
// Dispatches to the operation selected by argv[3] for element type T.
//
// argv[2]: base .bin file.
// argv[3]: option - 1 analyze_norm, 2 augment_base (prep_base = true),
//          3 augment_base (prep_base = false), 4 normalize_base.
// argv[4]: output file, required for options 2, 3 and 4.
//
// Returns the selected operation's result, or -1 when the option is unknown
// or the output file is missing.
template <typename T> int aux_main(char **argv)
{
    std::string base_file(argv[2]);
    uint32_t option = atoi(argv[3]);

    if (option < 1 || option > 4)
    {
        std::cout << "Unknown option " << argv[3] << ". Use 1, 2, 3 or 4." << std::endl;
        return -1;
    }

    if (option == 1)
        return analyze_norm<T>(base_file);

    // main only guarantees four arguments, so argv[4] may be the terminating
    // null pointer; constructing a std::string from it would be undefined.
    if (argv[4] == nullptr)
    {
        std::cout << "Option " << option << " requires an out_file argument." << std::endl;
        return -1;
    }
    const std::string out_file(argv[4]);

    if (option == 2)
        return augment_base<T>(base_file, out_file, true);
    if (option == 3)
        return augment_base<T>(base_file, out_file, false);
    return normalize_base<T>(base_file, out_file);
}
int main(int argc, char **argv)
{
if (argc < 4)
{
std::cout << argv[0]
<< " data_type [float/int8/uint8] base_bin_file "
"[option: 1-norm analysis, 2-prep_base_for_mip, "
"3-prep_query_for_mip, 4-normalize-vecs] [out_file for "
"options 2/3/4]"
<< std::endl;
exit(-1);
}
if (std::string(argv[1]) == std::string("float"))
{
aux_main<float>(argv);
}
else if (std::string(argv[1]) == std::string("int8"))
{
aux_main<int8_t>(argv);
}
else if (std::string(argv[1]) == std::string("uint8"))
{
aux_main<uint8_t>(argv);
}
else
std::cout << "Unsupported type. Use float/int8/uint8." << std::endl;
return 0;
}