Initial commit

This commit is contained in:
yichuan520030910320
2025-06-30 09:05:05 +00:00
commit 46f6cc100b
1231 changed files with 278432 additions and 0 deletions

View File

@@ -0,0 +1,27 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
#pragma once
#include <cstdint>
#include <string>
#include "common.h"
#include "distance.h"
namespace diskannpy
{
// Declares the disk-index build entry point for vectors with element type DT.
//
// Only the declaration lives in this header; the definition is elsewhere. The
// parameter notes below are inferred from names and types and should be
// confirmed against the implementation:
//   metric                - diskann::Metric value selecting the distance function
//   data_file_path        - path of the input vector data file
//   index_prefix_path     - path prefix for the index files that get written
//   complexity            - presumably the build-time candidate list size
//   graph_degree          - presumably the maximum out-degree of the graph
//   final_index_ram_limit - RAM limit for the finished index (units not shown here)
//   indexing_ram_budget   - RAM budget while building (units not shown here)
//   num_threads           - number of threads to build with
//   pq_disk_bytes         - presumably the PQ byte count for on-disk vectors
//   codebook_prefix       - path prefix of a codebook (TODO confirm whether it may be empty)
template <class DT>
void build_disk_index(
    diskann::Metric metric,
    const std::string &data_file_path,
    const std::string &index_prefix_path,
    uint32_t complexity,
    uint32_t graph_degree,
    double final_index_ram_limit,
    double indexing_ram_budget,
    uint32_t num_threads,
    uint32_t pq_disk_bytes,
    const std::string &codebook_prefix);
// Declares the in-memory index build entry point for vectors with element type DT.
//
// TagT defaults to DynamicIdType and LabelT defaults to filterT. Only the
// declaration lives in this header; the parameter notes below are inferred
// from names and types and should be confirmed against the implementation:
//   metric              - diskann::Metric value selecting the distance function
//   vector_bin_path     - path of the input vector file
//   index_output_path   - path the built index is written to
//   graph_degree        - presumably the maximum out-degree of the graph
//   complexity          - presumably the build-time candidate list size
//   alpha               - presumably the pruning parameter of the graph build
//   num_threads         - number of threads to build with
//   use_pq_build        - whether to build using PQ-compressed vectors
//   num_pq_bytes        - PQ byte count used when use_pq_build is set
//   use_opq             - whether to use OPQ
//   use_tags            - defaults to false
//   filter_labels_file  - defaults to "" (presumably meaning "no labels file")
//   universal_label     - defaults to "" (presumably meaning "no universal label")
//   filter_complexity   - defaults to 0
template <class DT, class TagT = DynamicIdType, class LabelT = filterT>
void build_memory_index(
    diskann::Metric metric,
    const std::string &vector_bin_path,
    const std::string &index_output_path,
    uint32_t graph_degree,
    uint32_t complexity,
    float alpha,
    uint32_t num_threads,
    bool use_pq_build,
    size_t num_pq_bytes,
    bool use_opq,
    bool use_tags = false,
    const std::string &filter_labels_file = "",
    const std::string &universal_label = "",
    uint32_t filter_complexity = 0);
} // namespace diskannpy

View File

@@ -0,0 +1,24 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
#pragma once
#include <stdint.h>
#include <utility>
#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>
namespace py = pybind11;
namespace diskannpy
{
// Shared type aliases for the diskannpy binding headers.

// Label type used for filtered operations.
using filterT = uint32_t;
// Identifier type for points of the static (read-only) index wrappers.
using StaticIdType = uint32_t;
// Identifier type for points of the dynamic (mutable) index wrapper.
using DynamicIdType = uint32_t;

// Result of a search: a pair whose first element is a numpy array of IdType
// and whose second element is a numpy array of float. Judging by the name,
// these are neighbor identifiers and their distances.
template <class IdType> using NeighborsAndDistances = std::pair<py::array_t<IdType>, py::array_t<float>>;
} // namespace diskannpy

View File

@@ -0,0 +1,53 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
#pragma once
#include <cstdint>
#include <string>
#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>
#include "common.h"
#include "index.h"
#include "parameters.h"
namespace py = pybind11;
namespace diskannpy
{
// Python-facing wrapper that owns a diskann::Index<DT, DynamicIdType, filterT>
// by value and exposes insert / delete / search / save / load operations whose
// vector arguments are numpy arrays of element type DT.
//
// Only declarations live here; the member definitions are elsewhere. Parameter
// and return-value notes are inferred from names and types -- confirm against
// the implementation before relying on them.
template <typename DT>
class DynamicMemoryIndex
{
  public:
    // Constructs the wrapper. Presumably the graph-build arguments end up in
    // _write_parameters and the initial_search_* arguments size the search
    // state of the underlying index (TODO confirm against the definition).
    DynamicMemoryIndex(diskann::Metric m, size_t dimensions, size_t max_vectors, uint32_t complexity,
                       uint32_t graph_degree, bool saturate_graph, uint32_t max_occlusion_size, float alpha,
                       uint32_t num_threads, uint32_t filter_complexity, uint32_t num_frozen_points,
                       uint32_t initial_search_complexity, uint32_t initial_search_threads,
                       bool concurrent_consolidation);

    // Loads index state from index_path.
    void load(const std::string &index_path);

    // Inserts one vector under the given id. Returns an int, presumably a
    // status code (TODO confirm which value means success).
    int insert(const py::array_t<DT, py::array::c_style | py::array::forcecast> &vector, DynamicIdType id);

    // Inserts num_inserts vectors with their ids. Returns an int array,
    // presumably one status per insert. num_threads defaults to 0; what 0
    // selects is decided by the definition (TODO confirm).
    py::array_t<int> batch_insert(py::array_t<DT, py::array::c_style | py::array::forcecast> &vectors,
                                  py::array_t<DynamicIdType, py::array::c_style | py::array::forcecast> &ids,
                                  int32_t num_inserts, int num_threads = 0);

    // Marks the point with the given id as deleted. Returns an int,
    // presumably a status code.
    int mark_deleted(DynamicIdType id);

    // Saves the index to save_path. compact_before_save defaults to false.
    void save(const std::string &save_path, bool compact_before_save = false);

    // Searches for knn neighbors of a single query with the given search
    // complexity; returns (ids, distances) arrays.
    NeighborsAndDistances<DynamicIdType> search(py::array_t<DT, py::array::c_style | py::array::forcecast> &query,
                                                uint64_t knn, uint64_t complexity);

    // Searches num_queries queries, knn neighbors each, using num_threads
    // threads; returns (ids, distances) arrays.
    NeighborsAndDistances<DynamicIdType> batch_search(
        py::array_t<DT, py::array::c_style | py::array::forcecast> &queries, uint64_t num_queries, uint64_t knn,
        uint64_t complexity, uint32_t num_threads);

    // Presumably makes earlier mark_deleted() calls permanent in the graph.
    void consolidate_delete();

    // Number of points, as reported by the definition.
    size_t num_points();

  private:
    // NOTE(review): members are initialized in declaration order. Do not
    // reorder them without checking the constructor, which may build _index
    // from the two members declared before it.
    const uint32_t _initial_search_complexity;
    const diskann::IndexWriteParameters _write_parameters;
    diskann::Index<DT, DynamicIdType, filterT> _index;
};
} // namespace diskannpy

View File

@@ -0,0 +1,65 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
#pragma once
#include <cstdint>
#include <memory>
#include <string>
#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>
#ifdef _WINDOWS
#include "windows_aligned_file_reader.h"
#elif __APPLE__
#include "apple_aligned_file_reader.h"
#else
#include "linux_aligned_file_reader.h"
#endif
#include "common.h"
#include "pq_flash_index.h"
namespace py = pybind11;
namespace diskannpy
{
// Names the aligned file reader class for the build platform. The conditions
// mirror the ones that choose the platform-specific reader header in this
// file's #include block, so the two must be kept in sync.
#ifdef _WINDOWS
using PlatformSpecificAlignedFileReader = WindowsAlignedFileReader;
#elif __APPLE__
using PlatformSpecificAlignedFileReader = AppleAlignedFileReader;
#else
using PlatformSpecificAlignedFileReader = LinuxAlignedFileReader;
#endif
// Python-facing wrapper around a diskann::PQFlashIndex<DT>, held by value,
// together with two shared AlignedFileReader handles (_reader, _graph_reader).
//
// Only declarations live here; the member definitions are elsewhere. Parameter
// notes are inferred from names and types -- confirm against the implementation.
template <class DT>
class StaticDiskIndex
{
  public:
    // Opens the index identified by index_path_prefix. pq_prefix and
    // partition_prefix are further path prefixes; how num_nodes_to_cache and
    // cache_mechanism interact is decided by the definition (TODO confirm).
    StaticDiskIndex(
        diskann::Metric metric,
        const std::string &index_path_prefix,
        uint32_t num_threads,
        size_t num_nodes_to_cache,
        uint32_t cache_mechanism,
        const std::string &pq_prefix,
        const std::string &partition_prefix);

    // Caching helpers; presumably each populates the node cache with up to
    // num_nodes_to_cache nodes using the strategy its name describes.
    void cache_bfs_levels(size_t num_nodes_to_cache);
    void cache_sample_paths(
        size_t num_nodes_to_cache,
        const std::string &warmup_query_file,
        uint32_t num_threads);

    // Single-query search returning (ids, distances) arrays. Every flag after
    // beam_width defaults to "off" (false / 0).
    // NOTE(review): `recompute_beighbor_embeddings` looks like a misspelling of
    // "neighbor"; the spelling is kept so this declaration keeps matching the
    // definition and bindings, which are not visible from this header.
    NeighborsAndDistances<StaticIdType> search(
        py::array_t<DT, py::array::c_style | py::array::forcecast> &query,
        uint64_t knn,
        uint64_t complexity,
        uint64_t beam_width,
        bool USE_DEFERRED_FETCH = false,
        bool skip_search_reorder = false,
        bool recompute_beighbor_embeddings = false,
        bool dedup_node_dis = false,
        float prune_ratio = 0,
        bool batch_recompute = false,
        bool global_pruning = false);

    // Multi-query variant of search(): takes num_queries queries and a thread
    // count, with the same optional flags and defaults as search().
    NeighborsAndDistances<StaticIdType> batch_search(
        py::array_t<DT, py::array::c_style | py::array::forcecast> &queries,
        uint64_t num_queries,
        uint64_t knn,
        uint64_t complexity,
        uint64_t beam_width,
        uint32_t num_threads,
        bool USE_DEFERRED_FETCH = false,
        bool skip_search_reorder = false,
        bool recompute_beighbor_embeddings = false,
        bool dedup_node_dis = false,
        float prune_ratio = 0,
        bool batch_recompute = false,
        bool global_pruning = false);

  private:
    // NOTE(review): members are initialized in declaration order; the readers
    // come before _index, so keep this order unless the constructor is checked.
    std::shared_ptr<AlignedFileReader> _reader;
    std::shared_ptr<AlignedFileReader> _graph_reader;
    diskann::PQFlashIndex<DT> _index;
};
} // namespace diskannpy

View File

@@ -0,0 +1,40 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
#pragma once
#include <cstdint>
#include <string>
#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>
#include "common.h"
#include "index.h"
namespace py = pybind11;
namespace diskannpy
{
// Python-facing wrapper that owns a diskann::Index<DT, StaticIdType, filterT>
// by value and exposes search operations over numpy query arrays of element
// type DT.
//
// Only declarations live here; the member definitions are elsewhere. Parameter
// notes are inferred from names and types -- confirm against the implementation.
template <class DT>
class StaticMemoryIndex
{
  public:
    // Constructs the wrapper for the index stored under index_prefix.
    // num_points and dimensions presumably describe the stored data set.
    StaticMemoryIndex(
        diskann::Metric m,
        const std::string &index_prefix,
        size_t num_points,
        size_t dimensions,
        uint32_t num_threads,
        uint32_t initial_search_complexity);

    // Single-query search for knn neighbors; returns (ids, distances) arrays.
    NeighborsAndDistances<StaticIdType> search(
        py::array_t<DT, py::array::c_style | py::array::forcecast> &query,
        uint64_t knn,
        uint64_t complexity);

    // Same as search(), with an additional label value of type filterT that
    // presumably restricts the results to points carrying that label.
    NeighborsAndDistances<StaticIdType> search_with_filter(
        py::array_t<DT, py::array::c_style | py::array::forcecast> &query,
        uint64_t knn,
        uint64_t complexity,
        filterT filter);

    // Multi-query search over num_queries queries using num_threads threads.
    NeighborsAndDistances<StaticIdType> batch_search(
        py::array_t<DT, py::array::c_style | py::array::forcecast> &queries,
        uint64_t num_queries,
        uint64_t knn,
        uint64_t complexity,
        uint32_t num_threads);

  private:
    diskann::Index<DT, StaticIdType, filterT> _index;
};
} // namespace diskannpy