Initial commit
92
packages/leann-backend-hnsw/third_party/faiss/tests/CMakeLists.txt
vendored
Normal file
@@ -0,0 +1,92 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

set(FAISS_TEST_SRC
  test_binary_flat.cpp
  test_dealloc_invlists.cpp
  test_ivfpq_codec.cpp
  test_ivfpq_indexing.cpp
  test_lowlevel_ivf.cpp
  test_ivf_index.cpp
  test_merge.cpp
  test_omp_threads.cpp
  test_ondisk_ivf.cpp
  test_pairs_decoding.cpp
  test_params_override.cpp
  test_pq_encoding.cpp
  test_sliding_ivf.cpp
  test_threaded_index.cpp
  test_transfer_invlists.cpp
  test_mem_leak.cpp
  test_cppcontrib_sa_decode.cpp
  test_cppcontrib_uintreader.cpp
  test_simdlib.cpp
  test_approx_topk.cpp
  test_RCQ_cropping.cpp
  test_distances_simd.cpp
  test_heap.cpp
  test_code_distance.cpp
  test_hnsw.cpp
  test_partitioning.cpp
  test_fastscan_perf.cpp
  test_disable_pq_sdc_tables.cpp
  test_common_ivf_empty_index.cpp
  test_callback.cpp
  test_utils.cpp
  test_hamming.cpp
  test_mmap.cpp
  test_zerocopy.cpp
)

add_executable(faiss_test ${FAISS_TEST_SRC})

include(../cmake/link_to_faiss_lib.cmake)

link_to_faiss_lib(faiss_test)

if (FAISS_ENABLE_PYTHON)
  target_link_libraries(faiss_test PUBLIC faiss_example_external_module)
endif()

include(FetchContent)
FetchContent_Declare(
  googletest
  GIT_REPOSITORY https://github.com/google/googletest.git
  GIT_TAG 58d77fa8070e8cec2dc1ed015d66b454c8d78850 # release-1.12.1
  OVERRIDE_FIND_PACKAGE)
set(BUILD_GMOCK CACHE BOOL OFF)
set(INSTALL_GTEST CACHE BOOL OFF)
FetchContent_MakeAvailable(googletest)

if(NOT EXISTS ${CMAKE_FIND_PACKAGE_REDIRECTS_DIR}/gtest-config.cmake
   AND NOT EXISTS ${CMAKE_FIND_PACKAGE_REDIRECTS_DIR}/GTestConfig.cmake)
  file(
    WRITE ${CMAKE_FIND_PACKAGE_REDIRECTS_DIR}/gtest-config.cmake
    [=[
include(CMakeFindDependencyMacro)
find_dependency(googletest)
if(NOT TARGET GTest::GTest)
  add_library(GTest::GTest INTERFACE IMPORTED)
  target_link_libraries(GTest::GTest INTERFACE GTest::gtest)
endif()
if(NOT TARGET GTest::Main)
  add_library(GTest::Main INTERFACE IMPORTED)
  target_link_libraries(GTest::Main INTERFACE GTest::gtest_main)
endif()
]=])
endif()

find_package(OpenMP REQUIRED)
find_package(GTest CONFIG REQUIRED)

target_link_libraries(faiss_test PRIVATE
  OpenMP::OpenMP_CXX
  GTest::gtest_main
  $<$<BOOL:${FAISS_ENABLE_ROCM}>:hip::host>
)

# Defines `gtest_discover_tests()`.
include(GoogleTest)
gtest_discover_tests(faiss_test)
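Note: every file listed in FAISS_TEST_SRC is compiled into the single faiss_test binary, and gtest_discover_tests() then registers each TEST() with CTest by name. As a minimal sketch of what such a test file looks like (a hypothetical smoke test, not part of the upstream list):

/* hypothetical_smoke_test.cpp -- illustrative sketch only */
#include <gtest/gtest.h>

#include <faiss/IndexFlat.h>

TEST(Smoke, flat_index_starts_empty) {
    faiss::IndexFlatL2 index(8); // an 8-dimensional flat L2 index
    EXPECT_EQ(index.d, 8);       // the dimension is recorded
    EXPECT_EQ(index.ntotal, 0);  // no vectors added yet
}

After a build, `ctest -R Smoke` would run just this test, since discovery registers each TEST() individually.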
127
packages/leann-backend-hnsw/third_party/faiss/tests/common_faiss_tests.py
vendored
Normal file
@@ -0,0 +1,127 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

# a few common functions for the tests

from __future__ import absolute_import, division, print_function, unicode_literals

import numpy as np
import faiss

# reduce number of threads to avoid excessive nb of threads in opt
# mode (reduces runtime from 100s to 4s!)
faiss.omp_set_num_threads(4)


def random_unitary(n, d, seed):
    x = faiss.randn(n * d, seed).reshape(n, d)
    faiss.normalize_L2(x)
    return x


class Randu10k:

    def __init__(self):
        self.nb = 10000
        self.nq = 1000
        self.nt = 10000
        self.d = 128

        self.xb = random_unitary(self.nb, self.d, 1)
        self.xt = random_unitary(self.nt, self.d, 2)
        self.xq = random_unitary(self.nq, self.d, 3)

        dotprods = np.dot(self.xq, self.xb.T)
        self.gt = dotprods.argmax(1)
        self.k = 100

    def launch(self, name, index):
        if not index.is_trained:
            index.train(self.xt)
        index.add(self.xb)
        return index.search(self.xq, self.k)

    def evalres(self, DI):
        D, I = DI
        e = {}
        for rank in 1, 10, 100:
            e[rank] = ((I[:, :rank] == self.gt.reshape(-1, 1)).sum() /
                       float(self.nq))
        return e


class Randu10kUnbalanced(Randu10k):

    def __init__(self):
        Randu10k.__init__(self)

        weights = 0.95 ** np.arange(self.d)
        rs = np.random.RandomState(123)
        weights = weights[rs.permutation(self.d)]
        self.xb *= weights
        self.xb /= np.linalg.norm(self.xb, axis=1)[:, np.newaxis]
        self.xq *= weights
        self.xq /= np.linalg.norm(self.xq, axis=1)[:, np.newaxis]
        self.xt *= weights
        self.xt /= np.linalg.norm(self.xt, axis=1)[:, np.newaxis]

        dotprods = np.dot(self.xq, self.xb.T)
        self.gt = dotprods.argmax(1)
        self.k = 100


def get_dataset(d, nb, nt, nq):
    rs = np.random.RandomState(123)
    xb = rs.rand(nb, d).astype('float32')
    xt = rs.rand(nt, d).astype('float32')
    xq = rs.rand(nq, d).astype('float32')

    return (xt, xb, xq)


def get_dataset_2(d, nt, nb, nq):
    """A dataset that is not completely random but still challenging to
    index
    """
    d1 = 10  # intrinsic dimension (more or less)
    n = nb + nt + nq
    rs = np.random.RandomState(1338)
    x = rs.normal(size=(n, d1))
    x = np.dot(x, rs.rand(d1, d))
    # now we have a d1-dim ellipsoid in d-dimensional space
    # higher factor (>4) -> higher frequency -> less linear
    x = x * (rs.rand(d) * 4 + 0.1)
    x = np.sin(x)
    x = x.astype('float32')
    return x[:nt], x[nt:nt + nb], x[nt + nb:]


def make_binary_dataset(d, nt, nb, nq):
    assert d % 8 == 0
    rs = np.random.RandomState(123)
    x = rs.randint(256, size=(nb + nq + nt, int(d / 8))).astype('uint8')
    return x[:nt], x[nt:-nq], x[-nq:]


def compare_binary_result_lists(D1, I1, D2, I2):
    """comparing result lists is difficult because there are many
    ties. Here we sort by (distance, index) pairs and ignore the largest
    distance of each result. Compatible result lists should pass this."""
    assert D1.shape == I1.shape == D2.shape == I2.shape
    n, k = D1.shape
    ndiff = (D1 != D2).sum()
    assert ndiff == 0, '%d differences in distance matrix %s' % (
        ndiff, D1.shape)

    def normalize_DI(D, I):
        norm = I.max() + 1.0
        Dr = D.astype('float64') + I / norm
        # ignore -1s and elements on last column
        Dr[I == -1] = 1e20
        Dr[D == D[:, -1:]] = 1e20
        Dr.sort(axis=1)
        return Dr

    ndiff = (normalize_DI(D1, I1) != normalize_DI(D2, I2)).sum()
    assert ndiff == 0, '%d differences in normalized D matrix' % ndiff
66
packages/leann-backend-hnsw/third_party/faiss/tests/external_module_test.py
vendored
Normal file
@@ -0,0 +1,66 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import unittest

import faiss

import faiss.faiss_example_external_module as external_module

import numpy as np


class TestCustomIDSelector(unittest.TestCase):
    """test if we can construct a custom IDSelector"""

    def test_IDSelector(self):
        ids = external_module.IDSelectorModulo(3)
        self.assertFalse(ids.is_member(1))
        self.assertTrue(ids.is_member(3))


class TestArrayConversions(unittest.TestCase):

    def test_idx_array(self):
        tab = np.arange(10).astype("int64")
        new_sum = external_module.sum_of_idx(len(tab), faiss.swig_ptr(tab))
        self.assertEqual(new_sum, tab.sum())

    def do_array_test(self, ty):
        tab = np.arange(10).astype(ty)
        func = getattr(external_module, "sum_of_" + ty)
        print("perceived type", faiss.swig_ptr(tab))
        new_sum = func(len(tab), faiss.swig_ptr(tab))
        self.assertEqual(new_sum, tab.sum())

    def test_sum_uint8(self):
        self.do_array_test("uint8")

    def test_sum_uint16(self):
        self.do_array_test("uint16")

    def test_sum_uint32(self):
        self.do_array_test("uint32")

    def test_sum_uint64(self):
        self.do_array_test("uint64")

    def test_sum_int8(self):
        self.do_array_test("int8")

    def test_sum_int16(self):
        self.do_array_test("int16")

    def test_sum_int32(self):
        self.do_array_test("int32")

    def test_sum_int64(self):
        self.do_array_test("int64")

    def test_sum_float32(self):
        self.do_array_test("float32")

    def test_sum_float64(self):
        self.do_array_test("float64")
85
packages/leann-backend-hnsw/third_party/faiss/tests/test_NSG_compressed_graph.cpp
vendored
Normal file
@@ -0,0 +1,85 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <faiss/IndexNSG.h>
#include <faiss/utils/hamming.h>
#include <faiss/utils/random.h>
#include <gtest/gtest.h>

using namespace faiss;

using FinalNSGGraph = nsg::Graph<int32_t>;

struct CompressedNSGGraph : FinalNSGGraph {
    int bits;
    size_t stride;
    std::vector<uint8_t> compressed_data;

    CompressedNSGGraph(const FinalNSGGraph& graph, int bits)
            : FinalNSGGraph(graph.data, graph.N, graph.K), bits(bits) {
        FAISS_THROW_IF_NOT((1 << bits) >= K + 1);
        stride = (K * bits + 7) / 8;
        compressed_data.resize(N * stride);
        for (size_t i = 0; i < N; i++) {
            BitstringWriter writer(compressed_data.data() + i * stride, stride);
            for (size_t j = 0; j < K; j++) {
                int32_t v = graph.data[i * K + j];
                if (v == -1) {
                    writer.write(K + 1, bits);
                    break;
                } else {
                    writer.write(v, bits);
                }
            }
        }
        data = nullptr;
    }

    size_t get_neighbors(int i, int32_t* neighbors) const override {
        BitstringReader reader(compressed_data.data() + i * stride, stride);
        for (int j = 0; j < K; j++) {
            int32_t v = reader.read(bits);
            if (v == K + 1) {
                return j;
            }
            neighbors[j] = v;
        }
        return K;
    }
};

TEST(NSGCompressed, test_compressed) {
    size_t nq = 10, nt = 0, nb = 5000, d = 32, k = 10;

    using idx_t = faiss::idx_t;

    std::vector<float> buf((nq + nb + nt) * d);
    faiss::rand_smooth_vectors(nq + nb + nt, d, buf.data(), 1234);
    const float* xt = buf.data();
    const float* xb = xt + nt * d;
    const float* xq = xb + nb * d;

    faiss::IndexNSGFlat index(d, 32);

    index.add(nb, xb);

    std::vector<faiss::idx_t> Iref(nq * k);
    std::vector<float> Dref(nq * k);
    index.search(nq, xq, k, Dref.data(), Iref.data());

    // replace the shared ptr
    index.nsg.final_graph.reset(
            new CompressedNSGGraph(*index.nsg.final_graph, 13));

    std::vector<idx_t> I(nq * k);
    std::vector<float> D(nq * k);
    index.search(nq, xq, k, D.data(), I.data());

    // make sure we find back the original results
    EXPECT_EQ(Iref, I);
    EXPECT_EQ(Dref, D);
}
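Note: the compression above is plain fixed-width bit packing. With the parameters this test uses (K = 32 neighbors per node from IndexNSGFlat(d, 32), 13 bits per id), the per-node arithmetic works out as in this self-contained sketch (same numbers, assumed from the test):

#include <cstddef>
#include <cstdint>

// layout arithmetic of CompressedNSGGraph for K = 32, bits = 13
constexpr size_t K = 32, bits = 13;
constexpr size_t stride = (K * bits + 7) / 8; // ceil(416 / 8) = 52 bytes
constexpr size_t raw = K * sizeof(int32_t);   // 128 bytes uncompressed
static_assert(stride == 52 && raw == 128, "52 packed vs 128 raw bytes per node");

The end-of-list sentinel written for short neighbor lists must also fit in `bits`, which is what the FAISS_THROW_IF_NOT((1 << bits) >= K + 1) guard checks.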
129
packages/leann-backend-hnsw/third_party/faiss/tests/test_RCQ_cropping.cpp
vendored
Normal file
@@ -0,0 +1,129 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <faiss/IndexAdditiveQuantizer.h>
#include <faiss/IndexScalarQuantizer.h>
#include <faiss/utils/random.h>
#include <gtest/gtest.h>

/* This test creates a 3-level RCQ and performs a search on it.
 * Then it crops the RCQ to just the 2 first levels and verifies that
 * the 3-level vectors are in a subtree that was visited in the 2-level RCQ. */
TEST(RCQCropping, test_cropping) {
    size_t nq = 10, nt = 2000, nb = 1000, d = 32;

    using idx_t = faiss::idx_t;

    std::vector<float> buf((nq + nb + nt) * d);
    faiss::rand_smooth_vectors(nq + nb + nt, d, buf.data(), 1234);
    const float* xt = buf.data();
    const float* xb = xt + nt * d;
    const float* xq = xb + nb * d;

    std::vector<size_t> nbits = {5, 4, 4};
    faiss::ResidualCoarseQuantizer rcq(d, nbits);

    rcq.train(nt, xt);

    // the test below works only for beam size == nprobe
    rcq.set_beam_factor(1.0);

    // perform search
    int nprobe = 15;
    std::vector<faiss::idx_t> Iref(nq * nprobe);
    std::vector<float> Dref(nq * nprobe);
    rcq.search(nq, xq, nprobe, Dref.data(), Iref.data());

    // crop to the first 2 quantization levels
    int last_nbits = nbits.back();
    nbits.pop_back();
    faiss::ResidualCoarseQuantizer rcq_cropped(d, nbits);
    rcq_cropped.initialize_from(rcq);

    EXPECT_EQ(rcq_cropped.ntotal, rcq.ntotal >> last_nbits);

    // perform search
    std::vector<faiss::idx_t> Inew(nq * nprobe);
    std::vector<float> Dnew(nq * nprobe);
    rcq_cropped.search(nq, xq, nprobe, Dnew.data(), Inew.data());

    // these bits are in common between the two RCQs
    idx_t mask = ((idx_t)1 << rcq_cropped.rq.tot_bits) - 1;
    for (int q = 0; q < nq; q++) {
        for (int i = 0; i < nprobe; i++) {
            idx_t fine = Iref[q * nprobe + i];
            EXPECT_GE(fine, 0);
            bool found = false;

            // fine should be generated from a path that passes through coarse
            for (int j = 0; j < nprobe; j++) {
                idx_t coarse = Inew[q * nprobe + j];
                if ((fine & mask) == coarse) {
                    found = true;
                    break;
                }
            }
            EXPECT_TRUE(found);
        }
    }
}

TEST(RCQCropping, search_params) {
    size_t nq = 10, nt = 2000, nb = 1000, d = 32;

    using idx_t = faiss::idx_t;

    std::vector<float> buf((nq + nb + nt) * d);
    faiss::rand_smooth_vectors(nq + nb + nt, d, buf.data(), 1234);
    const float* xt = buf.data();
    const float* xb = xt + nt * d;
    const float* xq = xb + nb * d;

    std::vector<size_t> nbits = {3, 6, 3};
    faiss::ResidualCoarseQuantizer quantizer(d, nbits);
    size_t ntotal = (size_t)1 << quantizer.rq.tot_bits;
    faiss::IndexIVFScalarQuantizer index(
            &quantizer, d, ntotal, faiss::ScalarQuantizer::QT_8bit);
    index.quantizer_trains_alone = true;

    index.train(nt, xt);
    index.add(nb, xb);

    index.nprobe = 10;

    int k = 4;
    float beam_factor_1 = 8.0;
    quantizer.set_beam_factor(beam_factor_1);
    std::vector<idx_t> I1(nq * k);
    std::vector<float> D1(nq * k);
    index.search(nq, xq, k, D1.data(), I1.data());

    // change from 8 to 1
    quantizer.set_beam_factor(1.0f);
    std::vector<idx_t> I2(nq * k);
    std::vector<float> D2(nq * k);
    index.search(nq, xq, k, D2.data(), I2.data());

    // make sure it changes the result
    EXPECT_NE(I1, I2);
    EXPECT_NE(D1, D2);

    // override the class level beam factor
    faiss::SearchParametersResidualCoarseQuantizer params1;
    params1.beam_factor = beam_factor_1;
    faiss::SearchParametersIVF params;
    params.nprobe = index.nprobe;
    params.quantizer_params = &params1;

    std::vector<idx_t> I3(nq * k);
    std::vector<float> D3(nq * k);
    index.search(nq, xq, k, D3.data(), I3.data(), &params);

    // make sure we find back the original results
    EXPECT_EQ(I1, I3);
    EXPECT_EQ(D1, D3);
}
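Note: the cropping check is pure bit arithmetic. With nbits = {5, 4, 4} as in the first test, dropping the last level strips its 4 bits from every centroid id; a sketch of the accounting (same numbers as the test):

#include <cstdint>

// id accounting for nbits = {5, 4, 4}
constexpr int tot_bits = 5 + 4 + 4;                          // 13 bits total
constexpr int64_t ntotal_full = int64_t(1) << tot_bits;      // 8192 centroids
constexpr int64_t ntotal_cropped = ntotal_full >> 4;         // 512 after cropping
// a cropped id is the low 9 bits of a full id, hence the mask in the test
constexpr int64_t mask = (int64_t(1) << (tot_bits - 4)) - 1; // 0x1ff
static_assert(ntotal_cropped == 512 && mask == 0x1ff, "bit accounting");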
224
packages/leann-backend-hnsw/third_party/faiss/tests/test_approx_topk.cpp
vendored
Normal file
@@ -0,0 +1,224 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <gtest/gtest.h>

#include <chrono>
#include <cstdint>
#include <random>
#include <sstream>
#include <string>
#include <unordered_set>
#include <vector>

#include <faiss/utils/approx_topk/approx_topk.h>

#include <faiss/impl/FaissException.h>
#include <faiss/utils/Heap.h>

//
using namespace faiss;

//
template <uint32_t NBUCKETS, uint32_t N>
void test_approx_topk(
        const uint32_t beamSize,
        const uint32_t nPerBeam,
        const uint32_t k,
        const uint32_t nDatasetsToTest,
        const bool verbose) {
    if (verbose) {
        printf("-----------\n");
    }

    // generate random data
    std::default_random_engine rng(123);
    std::uniform_real_distribution<float> u(0, 1);

    // matches
    size_t nMatches = 0;
    // the element was completely missed in approx version.
    size_t nMissed = 0;
    // the element is available
    size_t nAvailable = 0;
    // the distance is the same, but the index is different.
    size_t nSoftMismatches = 0;
    // the distances are different
    size_t nHardMismatches = 0;
    // error of distances
    double sqrError = 0.0;

    //
    double timeBaseline = 0.0;
    double timeApprox = 0.0;

    for (size_t iDataset = 0; iDataset < nDatasetsToTest; iDataset++) {
        const size_t n = (size_t)(nPerBeam) * beamSize;
        std::vector<float> distances(n, 0);
        for (size_t i = 0; i < n; i++) {
            distances[i] = u(rng);
        }

        //
        using C = CMax<float, int>;

        // do a regular beam search
        std::vector<float> baselineDistances(k, C::neutral());
        std::vector<int> baselineIndices(k, -1);

        auto startBaseline = std::chrono::high_resolution_clock::now();
        heap_addn<C>(
                k,
                baselineDistances.data(),
                baselineIndices.data(),
                distances.data(),
                nullptr,
                nPerBeam * beamSize);
        auto endBaseline = std::chrono::high_resolution_clock::now();
        std::chrono::duration<double> diffBaseline =
                endBaseline - startBaseline;
        timeBaseline += diffBaseline.count();

        heap_reorder<C>(k, baselineDistances.data(), baselineIndices.data());

        // do an approximate beam search
        std::vector<float> approxDistances(k, C::neutral());
        std::vector<int> approxIndices(k, -1);

        auto startApprox = std::chrono::high_resolution_clock::now();
        try {
            HeapWithBuckets<C, NBUCKETS, N>::bs_addn(
                    beamSize,
                    nPerBeam,
                    distances.data(),
                    k,
                    approxDistances.data(),
                    approxIndices.data());
        } catch (const faiss::FaissException&) {
            //
            if (verbose) {
                printf("Skipping the case.\n");
            }
            return;
        }

        auto endApprox = std::chrono::high_resolution_clock::now();
        std::chrono::duration<double> diffApprox = endApprox - startApprox;
        timeApprox += diffApprox.count();

        heap_reorder<C>(k, approxDistances.data(), approxIndices.data());

        bool bGotMismatches = false;

        // the error
        for (uint32_t i = 0; i < k; i++) {
            if (baselineDistances[i] != approxDistances[i]) {
                nHardMismatches += 1;

                double diff = baselineDistances[i] - approxDistances[i];
                sqrError += diff * diff;

                bGotMismatches = true;

                if (verbose) {
                    printf("i=%d, bs.d=%f, bs.i=%d, app.d=%f, app.i=%d\n",
                           i,
                           baselineDistances[i],
                           baselineIndices[i],
                           approxDistances[i],
                           approxIndices[i]);
                }
            } else {
                if (baselineIndices[i] != approxIndices[i]) {
                    nSoftMismatches += 1;
                } else {
                    nMatches += 1;
                }
            }
        }

        if (bGotMismatches) {
            if (verbose) {
                printf("\n");
            }
        }

        //
        std::unordered_set<int> bsIndicesHS(
                baselineIndices.cbegin(), baselineIndices.cend());
        for (uint32_t i = 0; i < k; i++) {
            auto itr = bsIndicesHS.find(approxIndices[i]);
            if (itr != bsIndicesHS.cend()) {
                nAvailable += 1;
            } else {
                nMissed += 1;
            }
        }
    }

    if (verbose) {
        printf("%d, %d, %d, %d, %d, %d: %ld, %ld, %ld, %f, %ld, %ld, %f, %f\n",
               NBUCKETS,
               N,
               beamSize,
               nPerBeam,
               k,
               nDatasetsToTest,
               nMatches,
               nSoftMismatches,
               nHardMismatches,
               sqrError,
               nAvailable,
               nMissed,
               timeBaseline,
               timeApprox);
    }

    // just confirm that the error is not crazy
    if (NBUCKETS * N * beamSize >= k) {
        EXPECT_TRUE(nAvailable > nMissed);
    } else {
        // it is possible that the results are crazy here. Skip it.
    }
}

//
TEST(testApproxTopk, COMMON) {
    constexpr bool verbose = false;

    //
    const uint32_t nDifferentDatasets = 8;

    uint32_t kValues[] = {1, 2, 3, 5, 8, 13, 21, 34};

    for (size_t codebookBitSize = 8; codebookBitSize <= 10; codebookBitSize++) {
        const uint32_t codebookSize = 1 << codebookBitSize;
        for (const auto k : kValues) {
            test_approx_topk<1 * 8, 3>(
                    1, codebookSize, k, nDifferentDatasets, verbose);
            test_approx_topk<1 * 8, 3>(
                    k, codebookSize, k, nDifferentDatasets, verbose);

            test_approx_topk<1 * 8, 2>(
                    1, codebookSize, k, nDifferentDatasets, verbose);
            test_approx_topk<1 * 8, 2>(
                    k, codebookSize, k, nDifferentDatasets, verbose);

            test_approx_topk<2 * 8, 2>(
                    1, codebookSize, k, nDifferentDatasets, verbose);
            test_approx_topk<2 * 8, 2>(
                    k, codebookSize, k, nDifferentDatasets, verbose);

            test_approx_topk<4 * 8, 2>(
                    1, codebookSize, k, nDifferentDatasets, verbose);
            test_approx_topk<4 * 8, 2>(
                    k, codebookSize, k, nDifferentDatasets, verbose);
        }
    }
}

//
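Note: the baseline path above (heap_addn followed by heap_reorder) just computes an exact top-k over the concatenated beam distances. A minimal STL-only sketch of that contract, useful for reasoning about what HeapWithBuckets approximates (plain C++, not faiss API; assumes k <= dis.size()):

#include <algorithm>
#include <numeric>
#include <vector>

// exact k smallest distances, indices returned in ascending-distance order --
// the same result the heap_addn + heap_reorder baseline produces
std::vector<int> exact_topk(const std::vector<float>& dis, size_t k) {
    std::vector<int> idx(dis.size());
    std::iota(idx.begin(), idx.end(), 0); // 0, 1, 2, ...
    std::partial_sort(
            idx.begin(), idx.begin() + k, idx.end(), [&](int a, int b) {
                return dis[a] < dis[b]; // CMax<float, int> keeps the k smallest
            });
    idx.resize(k);
    return idx;
}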
62
packages/leann-backend-hnsw/third_party/faiss/tests/test_binary_flat.cpp
vendored
Normal file
@@ -0,0 +1,62 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <cstdio>
#include <cstdlib>

#include <gtest/gtest.h>

#include <faiss/IndexBinaryFlat.h>
#include <faiss/utils/hamming.h>

TEST(BinaryFlat, accuracy) {
    // dimension of the vectors to index
    int d = 64;

    // size of the database we plan to index
    size_t nb = 1000;

    // make the index object and train it
    faiss::IndexBinaryFlat index(d);

    std::vector<uint8_t> database(nb * (d / 8));
    for (size_t i = 0; i < nb * (d / 8); i++) {
        database[i] = rand() % 0x100;
    }

    { // populating the database
        index.add(nb, database.data());
    }

    size_t nq = 200;

    { // searching the database

        std::vector<uint8_t> queries(nq * (d / 8));
        for (size_t i = 0; i < nq * (d / 8); i++) {
            queries[i] = rand() % 0x100;
        }

        int k = 5;
        std::vector<faiss::idx_t> nns(k * nq);
        std::vector<int> dis(k * nq);

        index.search(nq, queries.data(), k, dis.data(), nns.data());

        for (size_t i = 0; i < nq; ++i) {
            faiss::HammingComputer8 hc(queries.data() + i * (d / 8), d / 8);
            hamdis_t dist_min = hc.hamming(database.data());
            for (size_t j = 1; j < nb; ++j) {
                hamdis_t dist = hc.hamming(database.data() + j * (d / 8));
                if (dist < dist_min) {
                    dist_min = dist;
                }
            }
            EXPECT_EQ(dist_min, dis[k * i]);
        }
    }
}
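Note: with d = 64 each binary vector occupies d / 8 = 8 bytes, and the Hamming distance computed by HammingComputer8 is just the popcount of the XOR of the two 64-bit words. A self-contained sketch of that computation (plain C++, standing in for the faiss helper):

#include <cstdint>
#include <cstring>

// Hamming distance between two 64-bit binary vectors (d = 64, 8 bytes each)
int hamming64(const uint8_t* a, const uint8_t* b) {
    uint64_t wa, wb;
    std::memcpy(&wa, a, 8); // memcpy avoids unaligned-load issues
    std::memcpy(&wb, b, 8);
    return __builtin_popcountll(wa ^ wb); // GCC/Clang builtin
}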
37
packages/leann-backend-hnsw/third_party/faiss/tests/test_callback.cpp
vendored
Normal file
@@ -0,0 +1,37 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <gtest/gtest.h>

#include <faiss/Clustering.h>
#include <faiss/IndexFlat.h>
#include <faiss/impl/AuxIndexStructures.h>
#include <faiss/impl/FaissException.h>
#include <faiss/utils/random.h>

TEST(TestCallback, timeout) {
    int n = 1000;
    int k = 100;
    int d = 128;
    int niter = 1000000000;
    int seed = 42;

    std::vector<float> vecs(n * d);
    faiss::float_rand(vecs.data(), vecs.size(), seed);

    auto index(new faiss::IndexFlat(d));

    faiss::ClusteringParameters cp;
    cp.niter = niter;
    cp.verbose = false;

    faiss::Clustering kmeans(d, k, cp);

    faiss::TimeoutCallback::reset(0.010);
    EXPECT_THROW(kmeans.train(n, vecs.data(), *index), faiss::FaissException);
    delete index;
}
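Note: the pattern this test exercises is a global, cooperatively-polled timeout: TimeoutCallback::reset(seconds) arms the budget, and a long-running operation that polls the callback (as Clustering::train does) throws FaissException once it is exceeded. A minimal sketch of guarding a call this way (run_with_timeout is a hypothetical helper, not faiss API):

#include <faiss/impl/AuxIndexStructures.h>
#include <faiss/impl/FaissException.h>

// run `op` under a wall-clock budget; returns false if it was cut short
template <typename Op>
bool run_with_timeout(double seconds, Op&& op) {
    faiss::TimeoutCallback::reset(seconds); // arm the global timeout
    try {
        op(); // e.g. kmeans.train(n, vecs.data(), *index) as above
        return true;
    } catch (const faiss::FaissException&) {
        return false; // budget exceeded
    }
}

A real caller would inspect the exception before concluding it was a timeout, since any FaissException is caught here.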
240
packages/leann-backend-hnsw/third_party/faiss/tests/test_code_distance.cpp
vendored
Normal file
@@ -0,0 +1,240 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <gtest/gtest.h>

#include <omp.h>

#include <algorithm>
#include <chrono>
#include <cmath>
#include <iostream>
#include <memory>
#include <random>
#include <thread>
#include <tuple>
#include <vector>

#include <faiss/impl/FaissAssert.h>
#include <faiss/impl/ProductQuantizer.h>
#include <faiss/impl/code_distance/code_distance.h>

size_t nMismatches(
        const std::vector<float>& ref,
        const std::vector<float>& candidate) {
    size_t count = 0;
    for (size_t i = 0; i < ref.size(); i++) {
        double abs = std::abs(ref[i] - candidate[i]);
        if (abs >= 1e-5) {
            count += 1;
        }
    }

    return count;
}

void test(
        // dimensionality of the data
        const size_t dim,
        // number of subquantizers
        const size_t subq,
        // bits per subquantizer
        const size_t nbits,
        // number of codes to process
        const size_t n) {
    FAISS_THROW_IF_NOT(nbits == 8);

    // remove if benchmarking is needed
    omp_set_num_threads(1);

    // rng
    std::minstd_rand rng(123);
    std::uniform_int_distribution<uint8_t> u(0, 255);
    std::uniform_real_distribution<float> uf(0, 1);

    // initialize lookup
    std::vector<float> lookup(256 * subq, 0);
    for (size_t i = 0; i < lookup.size(); i++) {
        lookup[i] = uf(rng);
    }

    // initialize codes
    std::vector<uint8_t> codes(n * subq);
#pragma omp parallel
    {
        std::minstd_rand rng0(123);
        std::uniform_int_distribution<uint8_t> u1(0, 255);

#pragma omp for schedule(guided)
        for (size_t i = 0; i < codes.size(); i++) {
            codes[i] = u1(rng0);
        }
    }

    // warmup. compute reference results
    std::vector<float> resultsRef(n, 0);
    for (size_t k = 0; k < 10; k++) {
#pragma omp parallel for schedule(guided)
        for (size_t i = 0; i < n; i++) {
            resultsRef[i] =
                    faiss::distance_single_code_generic<faiss::PQDecoder8>(
                            subq, 8, lookup.data(), codes.data() + subq * i);
        }
    }

    // generic, 1 code per step
    std::vector<float> resultsNewGeneric1x(n, 0);
    double generic1xMsec = 0;
    {
        const auto startingTimepoint = std::chrono::steady_clock::now();
        for (size_t k = 0; k < 1000; k++) {
#pragma omp parallel for schedule(guided)
            for (size_t i = 0; i < n; i++) {
                resultsNewGeneric1x[i] =
                        faiss::distance_single_code_generic<faiss::PQDecoder8>(
                                subq,
                                8,
                                lookup.data(),
                                codes.data() + subq * i);
            }
        }
        const auto endingTimepoint = std::chrono::steady_clock::now();

        std::chrono::duration<double> duration =
                endingTimepoint - startingTimepoint;
        generic1xMsec = (duration.count() * 1000.0);
    }

    // generic, 4 codes per step
    std::vector<float> resultsNewGeneric4x(n, 0);
    double generic4xMsec = 0;
    {
        const auto startingTimepoint = std::chrono::steady_clock::now();
        for (size_t k = 0; k < 1000; k++) {
#pragma omp parallel for schedule(guided)
            for (size_t i = 0; i < n; i += 4) {
                faiss::distance_four_codes_generic<faiss::PQDecoder8>(
                        subq,
                        8,
                        lookup.data(),
                        codes.data() + subq * (i + 0),
                        codes.data() + subq * (i + 1),
                        codes.data() + subq * (i + 2),
                        codes.data() + subq * (i + 3),
                        resultsNewGeneric4x[i + 0],
                        resultsNewGeneric4x[i + 1],
                        resultsNewGeneric4x[i + 2],
                        resultsNewGeneric4x[i + 3]);
            }
        }

        const auto endingTimepoint = std::chrono::steady_clock::now();

        std::chrono::duration<double> duration =
                endingTimepoint - startingTimepoint;
        generic4xMsec = (duration.count() * 1000.0);
    }

    // custom (specialized), 1 code per step
    std::vector<float> resultsNewCustom1x(n, 0);
    double custom1xMsec = 0;
    {
        const auto startingTimepoint = std::chrono::steady_clock::now();
        for (size_t k = 0; k < 1000; k++) {
#pragma omp parallel for schedule(guided)
            for (size_t i = 0; i < n; i++) {
                resultsNewCustom1x[i] =
                        faiss::distance_single_code<faiss::PQDecoder8>(
                                subq,
                                8,
                                lookup.data(),
                                codes.data() + subq * i);
            }
        }
        const auto endingTimepoint = std::chrono::steady_clock::now();

        std::chrono::duration<double> duration =
                endingTimepoint - startingTimepoint;
        custom1xMsec = (duration.count() * 1000.0);
    }

    // custom (specialized), 4 codes per step
    std::vector<float> resultsNewCustom4x(n, 0);
    double custom4xMsec = 0;
    {
        const auto startingTimepoint = std::chrono::steady_clock::now();
        for (size_t k = 0; k < 1000; k++) {
#pragma omp parallel for schedule(guided)
            for (size_t i = 0; i < n; i += 4) {
                faiss::distance_four_codes<faiss::PQDecoder8>(
                        subq,
                        8,
                        lookup.data(),
                        codes.data() + subq * (i + 0),
                        codes.data() + subq * (i + 1),
                        codes.data() + subq * (i + 2),
                        codes.data() + subq * (i + 3),
                        resultsNewCustom4x[i + 0],
                        resultsNewCustom4x[i + 1],
                        resultsNewCustom4x[i + 2],
                        resultsNewCustom4x[i + 3]);
            }
        }

        const auto endingTimepoint = std::chrono::steady_clock::now();

        std::chrono::duration<double> duration =
                endingTimepoint - startingTimepoint;
        custom4xMsec = (duration.count() * 1000.0);
    }

    const size_t nMismatchesG1 = nMismatches(resultsRef, resultsNewGeneric1x);
    const size_t nMismatchesG4 = nMismatches(resultsRef, resultsNewGeneric4x);
    const size_t nMismatchesCustom1 =
            nMismatches(resultsRef, resultsNewCustom1x);
    const size_t nMismatchesCustom4 =
            nMismatches(resultsRef, resultsNewCustom4x);

    std::cout << "Dim = " << dim << ", subq = " << subq << ", nbits = " << nbits
              << ", n = " << n << std::endl;
    std::cout << "Generic 1x code: " << generic1xMsec << " msec, "
              << nMismatchesG1 << " mismatches" << std::endl;
    std::cout << "Generic 4x code: " << generic4xMsec << " msec, "
              << nMismatchesG4 << " mismatches" << std::endl;
    std::cout << "custom 1x code: " << custom1xMsec << " msec, "
              << nMismatchesCustom1 << " mismatches" << std::endl;
    std::cout << "custom 4x code: " << custom4xMsec << " msec, "
              << nMismatchesCustom4 << " mismatches" << std::endl;
    std::cout << std::endl;

    ASSERT_EQ(nMismatchesG1, 0);
    ASSERT_EQ(nMismatchesG4, 0);
    ASSERT_EQ(nMismatchesCustom1, 0);
    ASSERT_EQ(nMismatchesCustom4, 0);
}

// this test can be used as a benchmark.
// 1. Increase the value of NELEMENTS
// 2. Remove omp_set_num_threads()

constexpr size_t NELEMENTS = 10000;

TEST(TestCodeDistance, SUBQ4_NBITS8) {
    test(256, 4, 8, NELEMENTS);
}

TEST(TestCodeDistance, SUBQ8_NBITS8) {
    test(256, 8, 8, NELEMENTS);
}

TEST(TestCodeDistance, SUBQ16_NBITS8) {
    test(256, 16, 8, NELEMENTS);
}

TEST(TestCodeDistance, SUBQ32_NBITS8) {
    test(256, 32, 8, NELEMENTS);
}
148
packages/leann-backend-hnsw/third_party/faiss/tests/test_common_ivf_empty_index.cpp
vendored
Normal file
@@ -0,0 +1,148 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <gtest/gtest.h>

#include <cassert>
#include <cstddef>
#include <memory>
#include <vector>

#include <faiss/IndexIVF.h>
#include <faiss/clone_index.h>
#include <faiss/impl/FaissAssert.h>
#include <faiss/index_factory.h>
#include <faiss/invlists/InvertedLists.h>
#include <faiss/utils/random.h>

/* This demonstrates how to query several independent IVF indexes with a trained
 * index in common. This avoids duplicating the coarse quantizer and metadata
 * in memory.
 **/

namespace {

int d = 64;

} // namespace

std::vector<float> get_random_vectors(size_t n, int seed) {
    std::vector<float> x(n * d);
    faiss::rand_smooth_vectors(n, d, x.data(), seed);
    seed++;
    return x;
}

/** InvertedLists implementation that dispatches the search to an InvertedLists
 * object that is passed in at query time */

struct DispatchingInvertedLists : faiss::ReadOnlyInvertedLists {
    DispatchingInvertedLists(size_t nlist, size_t code_size)
            : faiss::ReadOnlyInvertedLists(nlist, code_size) {
        use_iterator = true;
    }

    faiss::InvertedListsIterator* get_iterator(
            size_t list_no,
            void* inverted_list_context = nullptr) const override {
        assert(inverted_list_context);
        auto il =
                static_cast<const faiss::InvertedLists*>(inverted_list_context);
        return il->get_iterator(list_no);
    }

    using idx_t = faiss::idx_t;

    size_t list_size(size_t list_no) const override {
        FAISS_THROW_MSG("use iterator interface");
    }
    const uint8_t* get_codes(size_t list_no) const override {
        FAISS_THROW_MSG("use iterator interface");
    }
    const idx_t* get_ids(size_t list_no) const override {
        FAISS_THROW_MSG("use iterator interface");
    }
};

TEST(COMMON, test_common_trained_index) {
    int N = 3;    // number of independent indexes
    int nt = 500; // training vectors
    int nb = 200; // nb database vectors per index
    int nq = 10;  // nb queries performed on each index
    int k = 4;    // results requested per query

    // construct and build an "empty index": a trained index that does not
    // itself hold any data
    std::unique_ptr<faiss::IndexIVF> empty_index(dynamic_cast<faiss::IndexIVF*>(
            faiss::index_factory(d, "IVF32,PQ8np")));
    auto xt = get_random_vectors(nt, 123);
    empty_index->train(nt, xt.data());
    empty_index->nprobe = 4;

    // reference run: build one index for each set of db / queries and record
    // results
    std::vector<std::vector<faiss::idx_t>> ref_I(N);

    for (int i = 0; i < N; i++) {
        // clone the empty index
        std::unique_ptr<faiss::Index> index(
                faiss::clone_index(empty_index.get()));
        auto xb = get_random_vectors(nb, 1234 + i);
        auto xq = get_random_vectors(nq, 12345 + i);
        // add vectors and perform a search
        index->add(nb, xb.data());
        std::vector<float> D(k * nq);
        std::vector<faiss::idx_t> I(k * nq);
        index->search(nq, xq.data(), k, D.data(), I.data());
        // record result as reference
        ref_I[i] = I;
    }

    // build a set of inverted lists for each independent index
    std::vector<faiss::ArrayInvertedLists> sub_invlists;

    for (int i = 0; i < N; i++) {
        // swap in other inverted lists
        sub_invlists.emplace_back(empty_index->nlist, empty_index->code_size);
        faiss::InvertedLists* invlists = &sub_invlists.back();

        // replace_invlists swaps in a new InvertedLists for an existing index
        empty_index->replace_invlists(invlists, false);
        empty_index->reset(); // reset id counter to 0
        // populate inverted lists
        auto xb = get_random_vectors(nb, 1234 + i);
        empty_index->add(nb, xb.data());
    }

    // perform search dispatching to the sub-invlists. At search time, we don't
    // use replace_invlists because that would wreak havoc in a multithreaded
    // context
    DispatchingInvertedLists di(empty_index->nlist, empty_index->code_size);
    empty_index->replace_invlists(&di, false);

    std::vector<std::vector<faiss::idx_t>> new_I(N);

    // run searches in the independent indexes but with a common empty_index
#pragma omp parallel for
    for (int i = 0; i < N; i++) {
        auto xq = get_random_vectors(nq, 12345 + i);
        std::vector<float> D(k * nq);
        std::vector<faiss::idx_t> I(k * nq);

        // here we set to what sub-index the queries should be directed
        faiss::SearchParametersIVF params;
        params.nprobe = empty_index->nprobe;
        params.inverted_list_context = &sub_invlists[i];

        empty_index->search(nq, xq.data(), k, D.data(), I.data(), &params);
        new_I[i] = I;
    }

    // compare with reference result
    for (int i = 0; i < N; i++) {
        ASSERT_EQ(ref_I[i], new_I[i]);
    }
}
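Note: the whole mechanism reduces to one pattern: the trained index stays read-only, and each search names its dataset through the inverted_list_context pointer, which DispatchingInvertedLists::get_iterator casts back to an InvertedLists. A minimal sketch condensed from the test above (search_one_dataset is a hypothetical helper, not faiss API):

#include <faiss/IndexIVF.h>
#include <faiss/invlists/InvertedLists.h>

// query one logical dataset through a shared trained IVF index; safe to call
// concurrently because the shared index itself is never mutated
void search_one_dataset(
        const faiss::IndexIVF& shared_index,
        faiss::InvertedLists* dataset_invlists, // forwarded via the context
        size_t nq,
        const float* xq,
        size_t k,
        float* distances,
        faiss::idx_t* labels) {
    faiss::SearchParametersIVF params;
    params.nprobe = shared_index.nprobe;
    params.inverted_list_context = dataset_invlists;
    shared_index.search(nq, xq, k, distances, labels, &params);
}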
1306
packages/leann-backend-hnsw/third_party/faiss/tests/test_cppcontrib_sa_decode.cpp
vendored
Normal file
File diff suppressed because it is too large
114
packages/leann-backend-hnsw/third_party/faiss/tests/test_cppcontrib_uintreader.cpp
vendored
Normal file
@@ -0,0 +1,114 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

// This test was designed to be run using valgrind or ASAN to test the
// correctness of memory accesses.

#include <gtest/gtest.h>

#include <cstddef>
#include <cstdint>
#include <memory>
#include <random>

#include <faiss/utils/hamming.h>

#include <faiss/cppcontrib/detail/UintReader.h>

template <intptr_t N_ELEMENTS, intptr_t CODE_BITS, intptr_t CPOS>
struct TestLoop {
    static void test(
            const uint8_t* const container,
            faiss::BitstringReader& br) {
        // validate
        const intptr_t uintreader_data = faiss::cppcontrib::detail::
                UintReaderRaw<N_ELEMENTS, CODE_BITS, CPOS>::get(container);
        const intptr_t bitstringreader_data = br.read(CODE_BITS);

        ASSERT_EQ(uintreader_data, bitstringreader_data)
                << "Mismatch between BitstringReader (" << bitstringreader_data
                << ") and UintReader (" << uintreader_data
                << ") for N_ELEMENTS=" << N_ELEMENTS
                << ", CODE_BITS=" << CODE_BITS << ", CPOS=" << CPOS;

        //
        TestLoop<N_ELEMENTS, CODE_BITS, CPOS + 1>::test(container, br);
    }
};

template <intptr_t N_ELEMENTS, intptr_t CODE_BITS>
struct TestLoop<N_ELEMENTS, CODE_BITS, N_ELEMENTS> {
    static void test(
            const uint8_t* const container,
            faiss::BitstringReader& br) {}
};

template <intptr_t N_ELEMENTS, intptr_t CODE_BITS>
void TestUintReader() {
    constexpr intptr_t CODE_BYTES = (CODE_BITS * N_ELEMENTS + 7) / 8;

    std::default_random_engine rng;
    std::uniform_int_distribution<uint64_t> u(0, 1 << CODE_BITS);

    // do several attempts
    for (size_t attempt = 0; attempt < 10; attempt++) {
        // allocate a raw buffer (not a std::vector) so that valgrind / ASAN
        // can flag out-of-bounds accesses precisely
        std::unique_ptr<uint8_t[]> container(new uint8_t[CODE_BYTES]);
        // zero it out
        for (size_t i = 0; i < CODE_BYTES; i++) {
            container.get()[i] = 0;
        }

        // populate it
        faiss::BitstringWriter bw(container.get(), CODE_BYTES);
        for (size_t i = 0; i < N_ELEMENTS; i++) {
            bw.write(u(rng), CODE_BITS);
        }

        // read it back and verify against bitreader
        faiss::BitstringReader br(container.get(), CODE_BYTES);

        TestLoop<N_ELEMENTS, CODE_BITS, 0>::test(container.get(), br);
    }
}

template <intptr_t CODE_BITS>
void TestUintReaderBits() {
    TestUintReader<1, CODE_BITS>();
    TestUintReader<2, CODE_BITS>();
    TestUintReader<3, CODE_BITS>();
    TestUintReader<4, CODE_BITS>();
    TestUintReader<5, CODE_BITS>();
    TestUintReader<6, CODE_BITS>();
    TestUintReader<7, CODE_BITS>();
    TestUintReader<8, CODE_BITS>();
    TestUintReader<9, CODE_BITS>();
    TestUintReader<10, CODE_BITS>();
    TestUintReader<11, CODE_BITS>();
    TestUintReader<12, CODE_BITS>();
    TestUintReader<13, CODE_BITS>();
    TestUintReader<14, CODE_BITS>();
    TestUintReader<15, CODE_BITS>();
    TestUintReader<16, CODE_BITS>();
    TestUintReader<17, CODE_BITS>();
}

TEST(testCppcontribUintreader, Test8bit) {
    TestUintReaderBits<8>();
}

TEST(testCppcontribUintreader, Test10bit) {
    TestUintReaderBits<10>();
}

TEST(testCppcontribUintreader, Test12bit) {
    TestUintReaderBits<12>();
}

TEST(testCppcontribUintreader, Test16bit) {
    TestUintReaderBits<16>();
}
170
packages/leann-backend-hnsw/third_party/faiss/tests/test_dealloc_invlists.cpp
vendored
Normal file
@@ -0,0 +1,170 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <cstdio>
#include <cstdlib>
#include <cstring>

#include <cassert>
#include <memory>
#include <random>
#include <vector>

#include <gtest/gtest.h>

#include <faiss/AutoTune.h>
#include <faiss/IVFlib.h>
#include <faiss/IndexIVF.h>
#include <faiss/index_factory.h>

using namespace faiss;

namespace {

// dimension of the vectors to index
int d = 32;

// nb of training vectors
size_t nt = 5000;

// size of the database points per window step
size_t nb = 1000;

// nb of queries
size_t nq = 200;

std::mt19937 rng;

std::vector<float> make_data(size_t n) {
    std::vector<float> database(n * d);
    std::uniform_real_distribution<> distrib;

    for (size_t i = 0; i < n * d; i++) {
        database[i] = distrib(rng);
    }
    return database;
}

std::unique_ptr<Index> make_trained_index(const char* index_type) {
    auto index = std::unique_ptr<Index>(index_factory(d, index_type));
    auto xt = make_data(nt * d);
    index->train(nt, xt.data());
    ParameterSpace().set_index_parameter(index.get(), "nprobe", 4);
    return index;
}

std::vector<idx_t> search_index(Index* index, const float* xq) {
    int k = 10;
    std::vector<idx_t> I(k * nq);
    std::vector<float> D(k * nq);
    index->search(nq, xq, k, D.data(), I.data());
    return I;
}

/*************************************************************
 * Test functions for a given index type
 *************************************************************/

struct EncapsulateInvertedLists : InvertedLists {
    const InvertedLists* il;

    EncapsulateInvertedLists(const InvertedLists* il)
            : InvertedLists(il->nlist, il->code_size), il(il) {}

    static void* memdup(const void* m, size_t size) {
        if (size == 0)
            return nullptr;
        return memcpy(malloc(size), m, size);
    }

    size_t list_size(size_t list_no) const override {
        return il->list_size(list_no);
    }

    const uint8_t* get_codes(size_t list_no) const override {
        return (uint8_t*)memdup(
                il->get_codes(list_no), list_size(list_no) * code_size);
    }

    const idx_t* get_ids(size_t list_no) const override {
        return (idx_t*)memdup(
                il->get_ids(list_no), list_size(list_no) * sizeof(idx_t));
    }

    void release_codes(size_t, const uint8_t* codes) const override {
        free((void*)codes);
    }

    void release_ids(size_t, const idx_t* ids) const override {
        free((void*)ids);
    }

    const uint8_t* get_single_code(size_t list_no, size_t offset)
            const override {
        return (uint8_t*)memdup(
                il->get_single_code(list_no, offset), code_size);
    }

    size_t add_entries(size_t, size_t, const idx_t*, const uint8_t*) override {
        assert(!"not implemented");
        return 0;
    }

    void update_entries(size_t, size_t, size_t, const idx_t*, const uint8_t*)
            override {
        assert(!"not implemented");
    }

    void resize(size_t, size_t) override {
        assert(!"not implemented");
    }

    ~EncapsulateInvertedLists() override {}
};

int test_dealloc_invlists(const char* index_key) {
    std::unique_ptr<Index> index = make_trained_index(index_key);
    IndexIVF* index_ivf = ivflib::extract_index_ivf(index.get());

    auto xb = make_data(nb * d);
    index->add(nb, xb.data());

    auto xq = make_data(nq * d);

    auto ref_res = search_index(index.get(), xq.data());

    EncapsulateInvertedLists eil(index_ivf->invlists);

    index_ivf->own_invlists = false;
    index_ivf->replace_invlists(&eil, false);

    // TEST: this could crash or leak mem
    auto new_res = search_index(index.get(), xq.data());

    // delete explicitly
    delete eil.il;

    // just to make sure
    EXPECT_EQ(ref_res, new_res);
    return 0;
}

} // anonymous namespace

/*************************************************************
 * Test entry points
 *************************************************************/

TEST(TestIvlistDealloc, IVFFlat) {
    test_dealloc_invlists("IVF32,Flat");
}

TEST(TestIvlistDealloc, IVFSQ) {
    test_dealloc_invlists("IVF32,SQ8");
}

TEST(TestIvlistDealloc, IVFPQ) {
    test_dealloc_invlists("IVF32,PQ4np");
}
69
packages/leann-backend-hnsw/third_party/faiss/tests/test_disable_pq_sdc_tables.cpp
vendored
Normal file
@@ -0,0 +1,69 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <gtest/gtest.h>

#include <random>

#include "faiss/Index.h"
#include "faiss/IndexHNSW.h"
#include "faiss/index_factory.h"
#include "faiss/index_io.h"
#include "test_util.h"

pthread_mutex_t temp_file_mutex = PTHREAD_MUTEX_INITIALIZER;

TEST(IO, TestReadHNSWPQ_whenSDCDisabledFlagPassed_thenDisableSDCTable) {
    // Create a temp file name with a randomized component for stress runs
    std::random_device rd;
    std::mt19937 mt(rd());
    std::uniform_real_distribution<float> dist(0, 9999999);
    std::string temp_file_name =
            "/tmp/faiss_TestReadHNSWPQ" + std::to_string(int(dist(mt)));
    Tempfilename index_filename(&temp_file_mutex, temp_file_name);

    // Create a HNSW index with PQ encoding
    int d = 32, n = 256;
    std::default_random_engine rng(123);
    std::uniform_real_distribution<float> u(0, 100);
    std::vector<float> vectors(n * d);
    for (size_t i = 0; i < n * d; i++) {
        vectors[i] = u(rng);
    }

    // Build the index and write it to the temp file
    {
        std::unique_ptr<faiss::Index> index_writer(
                faiss::index_factory(d, "HNSW8,PQ4np", faiss::METRIC_L2));
        index_writer->train(n, vectors.data());
        index_writer->add(n, vectors.data());

        faiss::write_index(index_writer.get(), index_filename.c_str());
    }

    // Load index from disk. Confirm that the sdc table is equal to 0 when
    // disable sdc is set
    {
        std::unique_ptr<faiss::IndexHNSWPQ> index_reader_read_write(
                dynamic_cast<faiss::IndexHNSWPQ*>(
                        faiss::read_index(index_filename.c_str())));
        std::unique_ptr<faiss::IndexHNSWPQ> index_reader_sdc_disabled(
                dynamic_cast<faiss::IndexHNSWPQ*>(faiss::read_index(
                        index_filename.c_str(),
                        faiss::IO_FLAG_PQ_SKIP_SDC_TABLE)));

        ASSERT_NE(
                dynamic_cast<faiss::IndexPQ*>(index_reader_read_write->storage)
                        ->pq.sdc_table.size(),
                0);
        ASSERT_EQ(
                dynamic_cast<faiss::IndexPQ*>(
                        index_reader_sdc_disabled->storage)
                        ->pq.sdc_table.size(),
                0);
    }
}
334
packages/leann-backend-hnsw/third_party/faiss/tests/test_distances_simd.cpp
vendored
Normal file
@@ -0,0 +1,334 @@
|
||||
/*
|
||||
* Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
*
|
||||
* This source code is licensed under the MIT license found in the
|
||||
* LICENSE file in the root directory of this source tree.
|
||||
*/
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <random>
|
||||
#include <vector>
|
||||
|
||||
#include <faiss/utils/distances.h>
|
||||
|
||||
// reference implementations
|
||||
void fvec_inner_products_ny_ref(
|
||||
float* ip,
|
||||
const float* x,
|
||||
const float* y,
|
||||
size_t d,
|
||||
size_t ny) {
|
||||
for (size_t i = 0; i < ny; i++) {
|
||||
ip[i] = faiss::fvec_inner_product(x, y, d);
|
||||
y += d;
|
||||
}
|
||||
}
|
||||
|
||||
void fvec_L2sqr_ny_ref(
|
||||
float* dis,
|
||||
const float* x,
|
||||
const float* y,
|
||||
size_t d,
|
||||
size_t ny) {
|
||||
for (size_t i = 0; i < ny; i++) {
|
||||
dis[i] = faiss::fvec_L2sqr(x, y, d);
|
||||
y += d;
|
||||
}
|
||||
}

// test templated versions of fvec_L2sqr_ny
TEST(TestFvecL2sqrNy, D2) {
    // we're using int values in order to get 100% accurate
    // results with floats.
    std::default_random_engine rng(123);
    std::uniform_int_distribution<int32_t> u(0, 32);

    for (const auto dim : {2, 4, 8, 12}) {
        std::vector<float> x(dim, 0);
        for (size_t i = 0; i < x.size(); i++) {
            x[i] = u(rng);
        }

        for (const auto nrows : {1, 2, 5, 10, 15, 20, 25}) {
            std::vector<float> y(nrows * dim);
            for (size_t i = 0; i < y.size(); i++) {
                y[i] = u(rng);
            }

            std::vector<float> distances(nrows, 0);
            faiss::fvec_L2sqr_ny(
                    distances.data(), x.data(), y.data(), dim, nrows);

            std::vector<float> distances_ref(nrows, 0);
            fvec_L2sqr_ny_ref(
                    distances_ref.data(), x.data(), y.data(), dim, nrows);

            ASSERT_EQ(distances, distances_ref)
                    << "Mismatching results for dim = " << dim
                    << ", nrows = " << nrows;
        }
    }
}

// fvec_inner_products_ny
TEST(TestFvecInnerProductsNy, D2) {
    // we're using int values in order to get 100% accurate
    // results with floats.
    std::default_random_engine rng(123);
    std::uniform_int_distribution<int32_t> u(0, 32);

    for (const auto dim : {2, 4, 8, 12}) {
        std::vector<float> x(dim, 0);
        for (size_t i = 0; i < x.size(); i++) {
            x[i] = u(rng);
        }

        for (const auto nrows : {1, 2, 5, 10, 15, 20, 25}) {
            std::vector<float> y(nrows * dim);
            for (size_t i = 0; i < y.size(); i++) {
                y[i] = u(rng);
            }

            std::vector<float> distances(nrows, 0);
            faiss::fvec_inner_products_ny(
                    distances.data(), x.data(), y.data(), dim, nrows);

            std::vector<float> distances_ref(nrows, 0);
            fvec_inner_products_ny_ref(
                    distances_ref.data(), x.data(), y.data(), dim, nrows);

            ASSERT_EQ(distances, distances_ref)
                    << "Mismatching results for dim = " << dim
                    << ", nrows = " << nrows;
        }
    }
}

TEST(TestFvecL2sqr, distances_L2_squared_y_transposed) {
    // ints instead of floats for 100% accuracy
    std::default_random_engine rng(123);
    std::uniform_int_distribution<int32_t> uniform(0, 32);

    // d = 1..8 cover the modulo-8 remainder codepaths; 16 repeats the
    // vectorized loop in the function
    int ny = 11; // this value will hit all the codepaths
    for (const auto d : {1, 2, 3, 4, 5, 6, 7, 8, 16}) {
        // initialize inputs
        std::vector<float> x(d);
        float x_sqlen = 0;
        for (size_t i = 0; i < x.size(); i++) {
            x[i] = uniform(rng);
            x_sqlen += x[i] * x[i];
        }
        // y is transposed: component j of row i lives at y[j * ny + i]
        std::vector<float> y(d * ny);
        for (size_t j = 0; j < y.size(); j++) {
            y[j] = uniform(rng);
        }
        std::vector<float> y_sqlens(ny, 0);
        for (size_t i = 0; i < ny; i++) {
            for (size_t j = 0; j < d; j++) {
                y_sqlens[i] += y[i + j * ny] * y[i + j * ny];
            }
        }

        // compute reference distances
        std::vector<float> true_distances(ny, 0);
        for (size_t i = 0; i < ny; i++) {
            float dp = 0;
            for (size_t j = 0; j < d; j++) {
                dp += x[j] * y[i + j * ny];
            }
            true_distances[i] = x_sqlen + y_sqlens[i] - 2 * dp;
        }

        std::vector<float> distances(ny);
        faiss::fvec_L2sqr_ny_transposed(
                distances.data(),
                x.data(),
                y.data(),
                y_sqlens.data(),
                d,
                ny, // no need for special offset to test all lines of code
                ny);

        ASSERT_EQ(distances, true_distances)
                << "Mismatching fvec_L2sqr_ny_transposed results for d = " << d;
    }
}

TEST(TestFvecL2sqr, nearest_L2_squared_y_transposed) {
    // ints instead of floats for 100% accuracy
    std::default_random_engine rng(123);
    std::uniform_int_distribution<int32_t> uniform(0, 32);

    // d = 1..8 cover the modulo-8 remainder codepaths; 16 repeats the
    // vectorized loop in the function
    int ny = 11; // this value will hit all the codepaths
    for (const auto d : {1, 2, 3, 4, 5, 6, 7, 8, 16}) {
        // initialize inputs
        std::vector<float> x(d);
        float x_sqlen = 0;
        for (size_t i = 0; i < x.size(); i++) {
            x[i] = uniform(rng);
            x_sqlen += x[i] * x[i];
        }
        // y is transposed: component j of row i lives at y[j * ny + i]
        std::vector<float> y(d * ny);
        for (size_t j = 0; j < y.size(); j++) {
            y[j] = uniform(rng);
        }
        std::vector<float> y_sqlens(ny, 0);
        for (size_t i = 0; i < ny; i++) {
            for (size_t j = 0; j < d; j++) {
                y_sqlens[i] += y[i + j * ny] * y[i + j * ny];
            }
        }

        // get distances
        std::vector<float> distances(ny, 0);
        for (size_t i = 0; i < ny; i++) {
            float dp = 0;
            for (size_t j = 0; j < d; j++) {
                dp += x[j] * y[i + j * ny];
            }
            distances[i] = x_sqlen + y_sqlens[i] - 2 * dp;
        }
        // find the nearest
        size_t true_nearest_idx = 0;
        float min_dis = HUGE_VALF;
        for (size_t i = 0; i < ny; i++) {
            if (distances[i] < min_dis) {
                min_dis = distances[i];
                true_nearest_idx = i;
            }
        }

        std::vector<float> buffer(ny);
        size_t nearest_idx = faiss::fvec_L2sqr_ny_nearest_y_transposed(
                buffer.data(),
                x.data(),
                y.data(),
                y_sqlens.data(),
                d,
                ny, // no need for special offset to test all lines of code
                ny);

        ASSERT_EQ(nearest_idx, true_nearest_idx)
                << "Mismatching fvec_L2sqr_ny_nearest_y_transposed results for d = "
                << d;
    }
}

TEST(TestFvecL1, manhattan_distance) {
    // ints instead of floats for 100% accuracy
    std::default_random_engine rng(123);
    std::uniform_int_distribution<int32_t> uniform(0, 32);

    // nrows = 8..15 cover every modulo-8 remainder; 16 repeats the
    // vectorized while loop in the function
    for (const auto nrows : {8, 9, 10, 11, 12, 13, 14, 15, 16}) {
        std::vector<float> x(nrows);
        std::vector<float> y(nrows);
        float true_distance = 0;
        for (size_t i = 0; i < x.size(); i++) {
            x[i] = uniform(rng);
            y[i] = uniform(rng);
            true_distance += std::abs(x[i] - y[i]);
        }

        auto distance = faiss::fvec_L1(x.data(), y.data(), x.size());

        ASSERT_EQ(distance, true_distance)
                << "Mismatching fvec_L1 results for nrows = " << nrows;
    }
}

TEST(TestFvecLinf, chebyshev_distance) {
    // ints instead of floats for 100% accuracy
    std::default_random_engine rng(123);
    std::uniform_int_distribution<int32_t> uniform(0, 32);

    // nrows = 8..15 cover every modulo-8 remainder; 16 repeats the
    // vectorized while loop in the function
    for (const auto nrows : {8, 9, 10, 11, 12, 13, 14, 15, 16}) {
        std::vector<float> x(nrows);
        std::vector<float> y(nrows);
        float true_distance = 0;
        for (size_t i = 0; i < x.size(); i++) {
            x[i] = uniform(rng);
            y[i] = uniform(rng);
            true_distance = std::max(true_distance, std::abs(x[i] - y[i]));
        }

        auto distance = faiss::fvec_Linf(x.data(), y.data(), x.size());

        ASSERT_EQ(distance, true_distance)
                << "Mismatching fvec_Linf results for nrows = " << nrows;
    }
}

TEST(TestFvecMadd, multiple_add) {
    // ints instead of floats for 100% accuracy
    std::default_random_engine rng(123);
    std::uniform_int_distribution<int32_t> uniform(0, 32);

    // nrows = 8..15 cover every modulo-8 remainder; 16 repeats the
    // vectorized while loop in the function
    for (const auto nrows : {8, 9, 10, 11, 12, 13, 14, 15, 16}) {
        std::vector<float> a(nrows);
        std::vector<float> b(nrows);
        const float bf = uniform(rng);
        std::vector<float> true_distances(nrows);
        for (size_t i = 0; i < a.size(); i++) {
            a[i] = uniform(rng);
            b[i] = uniform(rng);
            true_distances[i] = a[i] + bf * b[i];
        }

        std::vector<float> distances(nrows);
        faiss::fvec_madd(a.size(), a.data(), bf, b.data(), distances.data());

        ASSERT_EQ(distances, true_distances)
                << "Mismatching fvec_madd results for nrows = " << nrows;
    }
}

TEST(TestFvecAdd, add_array) {
    // ints instead of floats for 100% accuracy
    std::default_random_engine rng(123);
    std::uniform_int_distribution<int32_t> uniform(0, 32);

    for (const auto nrows : {1, 2, 5, 10, 15, 20, 25}) {
        std::vector<float> a(nrows);
        std::vector<float> b(nrows);
        std::vector<float> true_distances(nrows);
        for (size_t i = 0; i < a.size(); i++) {
            a[i] = uniform(rng);
            b[i] = uniform(rng);
            true_distances[i] = a[i] + b[i];
        }

        std::vector<float> distances(nrows);
        faiss::fvec_add(a.size(), a.data(), b.data(), distances.data());

        ASSERT_EQ(distances, true_distances)
                << "Mismatching array-array fvec_add results for nrows = "
                << nrows;
    }
}

TEST(TestFvecAdd, add_value) {
    // ints instead of floats for 100% accuracy
    std::default_random_engine rng(123);
    std::uniform_int_distribution<int32_t> uniform(0, 32);

    for (const auto nrows : {1, 2, 5, 10, 15, 20, 25}) {
        std::vector<float> a(nrows);
        const float b = uniform(rng); // value to add
        std::vector<float> true_distances(nrows);
        for (size_t i = 0; i < a.size(); i++) {
            a[i] = uniform(rng);
            true_distances[i] = a[i] + b;
        }

        std::vector<float> distances(nrows);
        faiss::fvec_add(a.size(), a.data(), b, distances.data());

        ASSERT_EQ(distances, true_distances)
                << "Mismatching array-value fvec_add results for nrows = "
                << nrows;
    }
}
46
packages/leann-backend-hnsw/third_party/faiss/tests/test_factory_tools.cpp
vendored
Normal file
46
packages/leann-backend-hnsw/third_party/faiss/tests/test_factory_tools.cpp
vendored
Normal file
@@ -0,0 +1,46 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <faiss/cppcontrib/factory_tools.h>
#include <faiss/index_factory.h>
#include <gtest/gtest.h>

#include <memory>
#include <utility>

namespace faiss {

TEST(TestFactoryTools, TestReverseIndexFactory) {
    for (const char* factory : {
                 "Flat",
                 "IMI2x5,PQ8x8",
                 "IVF32_HNSW32,SQ8",
                 "IVF8,Flat",
                 "IVF8,SQ4",
                 "IVF8,PQ4x8",
                 "LSHrt",
                 "PQ4x8",
                 "HNSW32",
                 "SQ8",
                 "SQfp16",
                 "NSG24,Flat",
                 "NSG16,SQ8",
         }) {
        std::unique_ptr<Index> index{index_factory(64, factory)};
        ASSERT_TRUE(index);
        EXPECT_EQ(factory, reverse_index_factory(index.get()));
    }
    using Case = std::pair<const char*, const char*>;
    for (auto [src, dst] : {
                 Case{"SQ8,RFlat", "SQ8,Refine(Flat)"},
                 Case{"NSG", "NSG32,Flat"},
                 Case{"NSG,PQ8", "NSG32,PQ8x8"},
         }) {
        std::unique_ptr<Index> index{index_factory(64, src)};
        ASSERT_TRUE(index);
        EXPECT_EQ(dst, reverse_index_factory(index.get()));
    }
}

} // namespace faiss
66
packages/leann-backend-hnsw/third_party/faiss/tests/test_fastscan_perf.cpp
vendored
Normal file
66
packages/leann-backend-hnsw/third_party/faiss/tests/test_fastscan_perf.cpp
vendored
Normal file
@@ -0,0 +1,66 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <gtest/gtest.h>

#include <chrono>
#include <cstddef>
#include <cstdint>
#include <memory>
#include <random>
#include <vector>

#include <omp.h>

#include <faiss/IndexFlat.h>
#include <faiss/IndexIVFPQFastScan.h>
#include <faiss/impl/AuxIndexStructures.h>

TEST(TestFastScan, knnVSrange) {
    // small vectors and database
    int d = 64;
    size_t nb = 4000;

    // ivf centroids
    size_t nlist = 4;

    // more than 2 threads to surface
    // problems related to multi-threading
    omp_set_num_threads(8);

    // random database, also used as queries
    std::vector<float> database(nb * d);
    std::mt19937 rng;
    std::uniform_real_distribution<> distrib;
    for (size_t i = 0; i < nb * d; i++) {
        database[i] = distrib(rng);
    }

    // build index
    faiss::IndexFlatL2 coarse_quantizer(d);
    faiss::IndexIVFPQFastScan index(
            &coarse_quantizer, d, nlist, d / 2, 4, faiss::METRIC_L2, 32);
    index.pq.cp.niter = 10; // speed up train
    index.nprobe = nlist;
    index.train(nb, database.data());
    index.add(nb, database.data());

    std::vector<float> distances(nb);
    std::vector<faiss::idx_t> labels(nb);
    auto t = std::chrono::high_resolution_clock::now();
    index.search(nb, database.data(), 1, distances.data(), labels.data());
    auto knn_time = std::chrono::high_resolution_clock::now() - t;

    faiss::RangeSearchResult rsr(nb);
    t = std::chrono::high_resolution_clock::now();
    index.range_search(nb, database.data(), 1.0, &rsr);
    auto range_time = std::chrono::high_resolution_clock::now() - t;

    // we expect the perf of knn and range search
    // to be similar, at least within a factor of 4
    ASSERT_LE(range_time, knn_time * 4);
    ASSERT_LE(knn_time, range_time * 4);
}
335
packages/leann-backend-hnsw/third_party/faiss/tests/test_hamming.cpp
vendored
Normal file
335
packages/leann-backend-hnsw/third_party/faiss/tests/test_hamming.cpp
vendored
Normal file
@@ -0,0 +1,335 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <gtest/gtest.h>

#include <cassert>
#include <memory>
#include <random>
#include <set>
#include <sstream>
#include <string>
#include <vector>

#include <faiss/impl/FaissAssert.h>
#include <faiss/utils/hamming.h>

using namespace ::testing;

template <typename T>
std::string print_data(
        std::shared_ptr<std::vector<T>> data,
        const size_t divider) {
    std::string ret = "";
    for (int i = 0; i < data->size(); ++i) {
        if (i % divider) {
            ret += " ";
        } else {
            ret += "|";
        }
        ret += std::to_string((*data)[i]);
    }
    ret += "|";
    return ret;
}

std::stringstream get_correct_hamming_example(
        const size_t na, // number of queries
        const size_t nb, // number of candidates
        const size_t k,
        const size_t code_size,
        std::shared_ptr<std::vector<uint8_t>> a,
        std::shared_ptr<std::vector<uint8_t>> b,
        std::shared_ptr<std::vector<long>> true_ids,
        // regular Hamming (bit-level distances)
        std::shared_ptr<std::vector<int>> true_bit_distances,
        // generalized Hamming (byte-level distances)
        std::shared_ptr<std::vector<int>> true_byte_distances) {
    assert(nb >= k);

    // Initialization
    std::default_random_engine rng(123);
    std::uniform_int_distribution<int32_t> uniform(0, nb - 1);

    const size_t nresults = na * k;

    a->clear();
    a->resize(na * code_size, 1); // query vectors are all 1
    b->clear();
    b->resize(nb * code_size, 2); // database vectors are all 2
    true_ids->clear();
    true_ids->reserve(nresults);
    true_bit_distances->clear();
    true_bit_distances->reserve(nresults);
    true_byte_distances->clear();
    true_byte_distances->reserve(nresults);

    // define correct ids (must be unique)
    std::set<long> correct_ids;
    do {
        correct_ids.insert(uniform(rng));
    } while (correct_ids.size() < k);

    // replace the database vector at each id with a vector more similar
    // to the query; ordered, so earlier ids must be more similar
    for (size_t nmatches = k; nmatches > 0; --nmatches) {
        // get an id and erase it
        const size_t id = *correct_ids.begin();
        correct_ids.erase(correct_ids.begin());

        // record the true id and distances
        true_ids->push_back(id);
        true_bit_distances->push_back(
                (code_size > nmatches ? code_size - nmatches : 0) *
                /* per-byte distance between 1 and 2 (0b01 and 0b10) */
                2);
        true_byte_distances->push_back(
                (code_size > nmatches ? code_size - nmatches : 0));
        for (size_t i = 0; i < nmatches; ++i) {
            b->begin()[id * code_size + i] = 1; // query byte value
        }
    }

    // true_ids, true_bit_distances, true_byte_distances only contain
    // results for the first query.
    // Query vectors are identical (all 1s), so copy the first set of k
    // results na - 1 times.
    for (size_t i = 1; i < na; ++i) {
        true_ids->insert(
                true_ids->end(), true_ids->begin(), true_ids->begin() + k);
        true_bit_distances->insert(
                true_bit_distances->end(),
                true_bit_distances->begin(),
                true_bit_distances->begin() + k);
        true_byte_distances->insert(
                true_byte_distances->end(),
                true_byte_distances->begin(),
                true_byte_distances->begin() + k);
    }

    // assemble a string for debugging
    std::stringstream ret;
    ret << "na: " << na << std::endl
        << "nb: " << nb << std::endl
        << "k: " << k << std::endl
        << "code_size: " << code_size << std::endl
        << "a: " << print_data(a, code_size) << std::endl
        << "b: " << print_data(b, code_size) << std::endl
        << "true_ids: " << print_data(true_ids, k) << std::endl
        << "true_bit_distances: " << print_data(true_bit_distances, k)
        << std::endl
        << "true_byte_distances: " << print_data(true_byte_distances, k)
        << std::endl;
    return ret;
}
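
// A worked instance of the construction above (illustrative, not part
// of the upstream test): with code_size = 8 and nmatches = 3, the
// planted database vector has 3 bytes equal to the query byte (1) and
// 5 bytes equal to 2. Bytes 1 = 0b01 and 2 = 0b10 differ in 2 bits, so
// the expected bit-level Hamming distance is (8 - 3) * 2 = 10, while
// the generalized (byte-level) Hamming distance is simply 8 - 3 = 5.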

TEST(TestHamming, test_crosshamming_count_thres) {
    // Initialize the randomizer
    std::default_random_engine rng(123);
    std::uniform_int_distribution<int32_t> uniform(0, 255);

    // Initialize inputs
    const size_t n = 10; // number of codes
    const faiss::hamdis_t hamming_threshold = 20;

    // one ncodes per specialized codepath; 65 falls through to the
    // default case, which throws
    for (auto ncodes : {8, 16, 32, 64, 65}) {
        // initialize inputs
        const int nbits = ncodes * 8;
        const size_t nwords = nbits / 64;
        // x 8 for later conversion to uint64_t, and x 2 as extra buffer
        std::vector<uint8_t> dbs(nwords * n * 8 * 2);
        for (int i = 0; i < dbs.size(); ++i) {
            dbs[i] = uniform(rng);
        }

        // compute the true count
        size_t true_count = 0;
        uint64_t* bs1 = (uint64_t*)dbs.data();
        for (int i = 0; i < n; ++i) {
            uint64_t* bs2 = bs1 + 2;
            for (int j = i + 1; j < n; ++j) {
                if (faiss::hamming(bs1 + i * nwords, bs2 + j * nwords, nwords) <
                    hamming_threshold) {
                    ++true_count;
                }
            }
        }

        // run test and check correctness
        size_t count;
        if (ncodes == 65) {
            ASSERT_THROW(
                    faiss::crosshamming_count_thres(
                            dbs.data(), n, hamming_threshold, ncodes, &count),
                    faiss::FaissException);
            continue;
        }
        faiss::crosshamming_count_thres(
                dbs.data(), n, hamming_threshold, ncodes, &count);

        ASSERT_EQ(count, true_count) << "ncodes = " << ncodes;
    }
}

TEST(TestHamming, test_hamming_thres) {
    // Initialize the randomizer
    std::default_random_engine rng(123);
    std::uniform_int_distribution<int32_t> uniform(0, 255);

    // Initialize inputs
    const size_t n1 = 10;
    const size_t n2 = 15;
    const faiss::hamdis_t hamming_threshold = 100;

    // one ncodes per specialized codepath; 65 falls through to the
    // default case, which throws
    for (auto ncodes : {8, 16, 32, 64, 65}) {
        // initialize inputs
        const int nbits = ncodes * 8;
        const size_t nwords = nbits / 64;
        std::vector<uint8_t> bs1(nwords * n1 * 8);
        std::vector<uint8_t> bs2(nwords * n2 * 8);
        for (int i = 0; i < bs1.size(); ++i) {
            bs1[i] = uniform(rng);
        }
        for (int i = 0; i < bs2.size(); ++i) {
            bs2[i] = uniform(rng);
        }

        // compute the true matches
        size_t true_count = 0;
        std::vector<int64_t> true_idx;
        std::vector<faiss::hamdis_t> true_dis;

        uint64_t* bs1_64 = (uint64_t*)bs1.data();
        uint64_t* bs2_64 = (uint64_t*)bs2.data();
        for (int i = 0; i < n1; ++i) {
            for (int j = 0; j < n2; ++j) {
                faiss::hamdis_t ham_dist = faiss::hamming(
                        bs1_64 + i * nwords, bs2_64 + j * nwords, nwords);
                if (ham_dist < hamming_threshold) {
                    ++true_count;
                    true_idx.push_back(i);
                    true_idx.push_back(j);
                    true_dis.push_back(ham_dist);
                }
            }
        }

        // run test and check correctness for both
        // match_hamming_thres and hamming_count_thres
        std::vector<int64_t> idx(true_idx.size());
        std::vector<faiss::hamdis_t> dis(true_dis.size());
        if (ncodes == 65) {
            ASSERT_THROW(
                    faiss::match_hamming_thres(
                            bs1.data(),
                            bs2.data(),
                            n1,
                            n2,
                            hamming_threshold,
                            ncodes,
                            idx.data(),
                            dis.data()),
                    faiss::FaissException);
            ASSERT_THROW(
                    faiss::hamming_count_thres(
                            bs1.data(),
                            bs2.data(),
                            n1,
                            n2,
                            hamming_threshold,
                            ncodes,
                            nullptr),
                    faiss::FaissException);
            continue;
        }
        size_t match_count = faiss::match_hamming_thres(
                bs1.data(),
                bs2.data(),
                n1,
                n2,
                hamming_threshold,
                ncodes,
                idx.data(),
                dis.data());
        size_t count_count;
        faiss::hamming_count_thres(
                bs1.data(),
                bs2.data(),
                n1,
                n2,
                hamming_threshold,
                ncodes,
                &count_count);

        ASSERT_EQ(match_count, true_count) << "ncodes = " << ncodes;
        ASSERT_EQ(count_count, true_count) << "ncodes = " << ncodes;
        ASSERT_EQ(idx, true_idx) << "ncodes = " << ncodes;
        ASSERT_EQ(dis, true_dis) << "ncodes = " << ncodes;
    }
}

TEST(TestHamming, test_hamming_knn) {
    // Initialize the randomizer
    std::default_random_engine rng(123);
    std::uniform_int_distribution<int32_t> uniform(0, 32);

    // Initialize inputs
    const size_t na = 4;  // number of queries
    const size_t nb = 12; // number of candidates
    const size_t k = 6;

    auto a = std::make_shared<std::vector<uint8_t>>();
    auto b = std::make_shared<std::vector<uint8_t>>();
    auto true_ids = std::make_shared<std::vector<long>>();
    auto true_bit_distances = std::make_shared<std::vector<int>>();
    auto true_byte_distances = std::make_shared<std::vector<int>>();

    // 8, 16, 32 have specialized cases - 24 will hit the default case
    // all should be multiples of 8
    for (auto code_size : {8, 16, 24, 32}) {
        // get an example
        std::stringstream assert_str = get_correct_hamming_example(
                na,
                nb,
                k,
                code_size,
                a,
                b,
                true_ids,
                true_bit_distances,
                true_byte_distances);

        // run test on generalized_hammings_knn_hc
        std::vector<long> ids_gen(na * k);
        std::vector<int> dist_gen(na * k);
        faiss::int_maxheap_array_t res = {
                na, k, ids_gen.data(), dist_gen.data()};
        faiss::generalized_hammings_knn_hc(
                &res, a->data(), b->data(), nb, code_size, true);
        ASSERT_EQ(ids_gen, *true_ids) << assert_str.str();
        ASSERT_EQ(dist_gen, *true_byte_distances) << assert_str.str();

        // run test on hammings_knn
        std::vector<long> ids_ham_knn(na * k, 0);
        std::vector<int> dist_ham_knn(na * k, 0);
        res = {na, k, ids_ham_knn.data(), dist_ham_knn.data()};
        faiss::hammings_knn(&res, a->data(), b->data(), nb, code_size, true);
        ASSERT_EQ(ids_ham_knn, *true_ids) << assert_str.str();
        ASSERT_EQ(dist_ham_knn, *true_bit_distances) << assert_str.str();
    }

    for (auto code_size : {8, 16, 24, 32}) {
        std::stringstream assert_str = get_correct_hamming_example(
                na,
                nb,
                /* k */ nb, // faiss::hammings computes all distances
                code_size,
                a,
                b,
                true_ids,
                true_bit_distances,
                true_byte_distances);
        std::vector<faiss::hamdis_t> dist_gen(na * nb);
        faiss::hammings(
                a->data(), b->data(), na, nb, code_size, dist_gen.data());
        EXPECT_EQ(dist_gen, *true_bit_distances) << assert_str.str();
    }
}
54
packages/leann-backend-hnsw/third_party/faiss/tests/test_heap.cpp
vendored
Normal file
54
packages/leann-backend-hnsw/third_party/faiss/tests/test_heap.cpp
vendored
Normal file
@@ -0,0 +1,54 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <faiss/utils/Heap.h>
#include <gtest/gtest.h>

#include <algorithm>
#include <cstdint>
#include <numeric>
#include <vector>

using namespace faiss;

TEST(Heap, addn_with_ids) {
    size_t n = 1000;
    size_t k = 1;
    std::vector<int64_t> heap_labels(n, -1);
    std::vector<float> heap_distances(n, 0);
    float_minheap_array_t heaps = {
            n, k, heap_labels.data(), heap_distances.data()};
    heaps.heapify();
    std::vector<int64_t> labels(n, 1);
    std::vector<float> distances(n, 0.0f);
    std::vector<int64_t> subset(n);
    std::iota(subset.begin(), subset.end(), 0);
    heaps.addn_with_ids(1, distances.data(), labels.data(), 1);
    heaps.reorder();
    EXPECT_TRUE(
            std::all_of(heap_labels.begin(), heap_labels.end(), [](int64_t i) {
                return i == 1;
            }));
}
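
// Note on the two tests in this file (illustrative summary, not from
// the upstream source): both fill n single-slot min-heaps with one
// candidate each. addn_with_ids routes candidate i to heap i
// implicitly, while addn_query_subset_with_ids below targets an
// explicit list of heap indices (`subset`). The much larger n in the
// second test (20000000 > 2^24, as its comment notes) presumably
// stresses indexing beyond the range where a float can represent every
// integer exactly.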

TEST(Heap, addn_query_subset_with_ids) {
    size_t n = 20000000; // more than 2^24
    size_t k = 1;
    std::vector<int64_t> heap_labels(n, -1);
    std::vector<float> heap_distances(n, 0);
    float_minheap_array_t heaps = {
            n, k, heap_labels.data(), heap_distances.data()};
    heaps.heapify();
    std::vector<int64_t> labels(n, 1);
    std::vector<float> distances(n, 0.0f);
    std::vector<int64_t> subset(n);
    std::iota(subset.begin(), subset.end(), 0);
    heaps.addn_query_subset_with_ids(
            n, subset.data(), 1, distances.data(), labels.data(), 1);
    heaps.reorder();
    EXPECT_TRUE(
            std::all_of(heap_labels.begin(), heap_labels.end(), [](int64_t i) {
                return i == 1;
            }));
}
657
packages/leann-backend-hnsw/third_party/faiss/tests/test_hnsw.cpp
vendored
Normal file
657
packages/leann-backend-hnsw/third_party/faiss/tests/test_hnsw.cpp
vendored
Normal file
@@ -0,0 +1,657 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <gtest/gtest.h>

#include <cassert>
#include <cstddef>
#include <functional>
#include <limits>
#include <memory>
#include <queue>
#include <random>
#include <tuple>
#include <unordered_set>
#include <vector>

#include <omp.h>

#include <faiss/IndexHNSW.h>
#include <faiss/impl/HNSW.h>
#include <faiss/impl/ResultHandler.h>
#include <faiss/utils/random.h>

int reference_pop_min(faiss::HNSW::MinimaxHeap& heap, float* vmin_out) {
    assert(heap.k > 0);
    // returns the min; this is an O(n) linear scan
    int i = heap.k - 1;
    while (i >= 0) {
        if (heap.ids[i] != -1)
            break;
        i--;
    }
    if (i == -1)
        return -1;
    int imin = i;
    float vmin = heap.dis[i];
    i--;
    while (i >= 0) {
        if (heap.ids[i] != -1 && heap.dis[i] < vmin) {
            vmin = heap.dis[i];
            imin = i;
        }
        i--;
    }
    if (vmin_out)
        *vmin_out = vmin;
    int ret = heap.ids[imin];
    heap.ids[imin] = -1;
    --heap.nvalid;

    return ret;
}
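
// The tests below push random entries into a MinimaxHeap and then
// compare faiss::HNSW::MinimaxHeap::pop_min against the O(n)
// linear-scan reference above, asserting after every pop that both
// heaps return the same (id, distance) pair and keep identical
// ids/dis/nvalid state. Ties and +inf distances are covered by the
// dedicated test cases further down.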

void test_popmin(int heap_size, int amount_to_put) {
    // create a heap
    faiss::HNSW::MinimaxHeap mm_heap(heap_size);

    using storage_idx_t = faiss::HNSW::storage_idx_t;

    std::default_random_engine rng(123 + heap_size * amount_to_put);
    std::uniform_int_distribution<storage_idx_t> u(0, 65536);
    std::uniform_real_distribution<float> uf(0, 1);

    // generate random unique indices
    std::unordered_set<storage_idx_t> indices;
    while (indices.size() < amount_to_put) {
        const storage_idx_t index = u(rng);
        indices.insert(index);
    }

    // push the entries into the heap
    for (const auto index : indices) {
        float distance = uf(rng);
        if (distance >= 0.7f) {
            // add infinity values from time to time
            distance = std::numeric_limits<float>::infinity();
        }
        mm_heap.push(index, distance);
    }

    // clone the heap
    faiss::HNSW::MinimaxHeap cloned_mm_heap = mm_heap;

    // take the entries out one by one
    while (mm_heap.size() > 0) {
        // compare heaps
        ASSERT_EQ(mm_heap.n, cloned_mm_heap.n);
        ASSERT_EQ(mm_heap.k, cloned_mm_heap.k);
        ASSERT_EQ(mm_heap.nvalid, cloned_mm_heap.nvalid);
        ASSERT_EQ(mm_heap.ids, cloned_mm_heap.ids);
        ASSERT_EQ(mm_heap.dis, cloned_mm_heap.dis);

        // use the reference pop_min for the cloned heap
        float cloned_vmin_dis = std::numeric_limits<float>::quiet_NaN();
        storage_idx_t cloned_vmin_idx =
                reference_pop_min(cloned_mm_heap, &cloned_vmin_dis);

        float vmin_dis = std::numeric_limits<float>::quiet_NaN();
        storage_idx_t vmin_idx = mm_heap.pop_min(&vmin_dis);

        // compare the returned values
        ASSERT_EQ(vmin_dis, cloned_vmin_dis);
        ASSERT_EQ(vmin_idx, cloned_vmin_idx);
    }

    // compare heaps again
    ASSERT_EQ(mm_heap.n, cloned_mm_heap.n);
    ASSERT_EQ(mm_heap.k, cloned_mm_heap.k);
    ASSERT_EQ(mm_heap.nvalid, cloned_mm_heap.nvalid);
    ASSERT_EQ(mm_heap.ids, cloned_mm_heap.ids);
    ASSERT_EQ(mm_heap.dis, cloned_mm_heap.dis);
}

void test_popmin_identical_distances(
        int heap_size,
        int amount_to_put,
        const float distance) {
    // create a heap
    faiss::HNSW::MinimaxHeap mm_heap(heap_size);

    using storage_idx_t = faiss::HNSW::storage_idx_t;

    std::default_random_engine rng(123 + heap_size * amount_to_put);
    std::uniform_int_distribution<storage_idx_t> u(0, 65536);

    // generate random unique indices
    std::unordered_set<storage_idx_t> indices;
    while (indices.size() < amount_to_put) {
        const storage_idx_t index = u(rng);
        indices.insert(index);
    }

    // push the entries into the heap
    for (const auto index : indices) {
        mm_heap.push(index, distance);
    }

    // clone the heap
    faiss::HNSW::MinimaxHeap cloned_mm_heap = mm_heap;

    // take the entries out one by one
    while (mm_heap.size() > 0) {
        // compare heaps
        ASSERT_EQ(mm_heap.n, cloned_mm_heap.n);
        ASSERT_EQ(mm_heap.k, cloned_mm_heap.k);
        ASSERT_EQ(mm_heap.nvalid, cloned_mm_heap.nvalid);
        ASSERT_EQ(mm_heap.ids, cloned_mm_heap.ids);
        ASSERT_EQ(mm_heap.dis, cloned_mm_heap.dis);

        // use the reference pop_min for the cloned heap
        float cloned_vmin_dis = std::numeric_limits<float>::quiet_NaN();
        storage_idx_t cloned_vmin_idx =
                reference_pop_min(cloned_mm_heap, &cloned_vmin_dis);

        float vmin_dis = std::numeric_limits<float>::quiet_NaN();
        storage_idx_t vmin_idx = mm_heap.pop_min(&vmin_dis);

        // compare the returned values
        ASSERT_EQ(vmin_dis, cloned_vmin_dis);
        ASSERT_EQ(vmin_idx, cloned_vmin_idx);
    }

    // compare heaps again
    ASSERT_EQ(mm_heap.n, cloned_mm_heap.n);
    ASSERT_EQ(mm_heap.k, cloned_mm_heap.k);
    ASSERT_EQ(mm_heap.nvalid, cloned_mm_heap.nvalid);
    ASSERT_EQ(mm_heap.ids, cloned_mm_heap.ids);
    ASSERT_EQ(mm_heap.dis, cloned_mm_heap.dis);
}

TEST(HNSW, Test_popmin) {
    std::vector<size_t> sizes = {1, 2, 3, 4, 5, 7, 9, 11, 16, 27, 32, 64, 128};
    for (const size_t size : sizes) {
        for (size_t amount = size; amount > 0; amount /= 2) {
            test_popmin(size, amount);
        }
    }
}

TEST(HNSW, Test_popmin_identical_distances) {
    std::vector<size_t> sizes = {1, 2, 3, 4, 5, 7, 9, 11, 16, 27, 32};
    for (const size_t size : sizes) {
        for (size_t amount = size; amount > 0; amount /= 2) {
            test_popmin_identical_distances(size, amount, 1.0f);
        }
    }
}

TEST(HNSW, Test_popmin_infinite_distances) {
    std::vector<size_t> sizes = {1, 2, 3, 4, 5, 7, 9, 11, 16, 27, 32};
    for (const size_t size : sizes) {
        for (size_t amount = size; amount > 0; amount /= 2) {
            test_popmin_identical_distances(
                    size, amount, std::numeric_limits<float>::infinity());
        }
    }
}

TEST(HNSW, Test_IndexHNSW_METRIC_Lp) {
    // Create an HNSW index with METRIC_Lp and metric_arg = 3
    faiss::IndexFlat storage_index(1, faiss::METRIC_Lp);
    storage_index.metric_arg = 3;
    faiss::IndexHNSW index(&storage_index, 32);

    // Add a single data point
    float data[1] = {0.0};
    index.add(1, data);

    // Prepare a query
    float query[1] = {2.0};
    float distance;
    faiss::idx_t label;

    index.search(1, query, 1, &distance, &label);

    EXPECT_NEAR(distance, 8.0, 1e-5); // distance should be 8.0 (2^3)
    EXPECT_EQ(label, 0);              // label should be 0
}

class HNSWTest : public testing::Test {
   protected:
    HNSWTest() {
        xb = std::make_unique<std::vector<float>>(d * nb);
        faiss::float_rand(xb->data(), d * nb, 12345);
        index = std::make_unique<faiss::IndexHNSWFlat>(d, M);
        index->add(nb, xb->data());
        xq = std::make_unique<std::vector<float>>(d * nq);
        faiss::float_rand(xq->data(), d * nq, 12345);
        dis = std::unique_ptr<faiss::DistanceComputer>(
                index->storage->get_distance_computer());
        dis->set_query(xq->data() + 0 * index->d);
    }

    const int d = 64;
    const int nb = 2000;
    const int M = 4;
    const int nq = 10;
    const int k = 10;
    std::unique_ptr<std::vector<float>> xb;
    std::unique_ptr<std::vector<float>> xq;
    std::unique_ptr<faiss::DistanceComputer> dis;
    std::unique_ptr<faiss::IndexHNSWFlat> index;
};

/** Do a BFS on the candidates list */
int reference_search_from_candidates(
        const faiss::HNSW& hnsw,
        faiss::DistanceComputer& qdis,
        faiss::ResultHandler<faiss::HNSW::C>& res,
        faiss::HNSW::MinimaxHeap& candidates,
        faiss::VisitedTable& vt,
        faiss::HNSWStats& stats,
        int level,
        int nres_in,
        const faiss::SearchParametersHNSW* params) {
    int nres = nres_in;
    int ndis = 0;

    // can be overridden by search params
    bool do_dis_check = params ? params->check_relative_distance
                               : hnsw.check_relative_distance;
    int efSearch = params ? params->efSearch : hnsw.efSearch;
    const faiss::IDSelector* sel = params ? params->sel : nullptr;

    faiss::HNSW::C::T threshold = res.threshold;
    for (int i = 0; i < candidates.size(); i++) {
        faiss::idx_t v1 = candidates.ids[i];
        float d = candidates.dis[i];
        FAISS_ASSERT(v1 >= 0);
        if (!sel || sel->is_member(v1)) {
            if (d < threshold) {
                if (res.add_result(d, v1)) {
                    threshold = res.threshold;
                }
            }
        }
        vt.set(v1);
    }

    int nstep = 0;

    while (candidates.size() > 0) {
        float d0 = 0;
        int v0 = candidates.pop_min(&d0);

        if (do_dis_check) {
            // tricky stopping condition: more than ef of the
            // already-processed distances are smaller than d0
            int n_dis_below = candidates.count_below(d0);
            if (n_dis_below >= efSearch) {
                break;
            }
        }

        size_t begin, end;
        hnsw.neighbor_range(v0, level, &begin, &end);

        // a reference version
        for (size_t j = begin; j < end; j++) {
            int v1 = hnsw.neighbors[j];
            if (v1 < 0)
                break;
            if (vt.get(v1)) {
                continue;
            }
            vt.set(v1);
            ndis++;
            float d = qdis(v1);
            if (!sel || sel->is_member(v1)) {
                if (d < threshold) {
                    if (res.add_result(d, v1)) {
                        threshold = res.threshold;
                        nres += 1;
                    }
                }
            }

            candidates.push(v1, d);
        }

        nstep++;
        if (!do_dis_check && nstep > efSearch) {
            break;
        }
    }

    if (level == 0) {
        stats.n1++;
        if (candidates.size() == 0) {
            stats.n2++;
        }
        stats.ndis += ndis;
        stats.nhops += nstep;
    }

    return nres;
}

faiss::HNSWStats reference_greedy_update_nearest(
        const faiss::HNSW& hnsw,
        faiss::DistanceComputer& qdis,
        int level,
        faiss::HNSW::storage_idx_t& nearest,
        float& d_nearest) {
    faiss::HNSWStats stats;

    for (;;) {
        faiss::HNSW::storage_idx_t prev_nearest = nearest;

        size_t begin, end;
        hnsw.neighbor_range(nearest, level, &begin, &end);

        size_t ndis = 0;

        for (size_t i = begin; i < end; i++) {
            faiss::HNSW::storage_idx_t v = hnsw.neighbors[i];
            if (v < 0)
                break;
            ndis += 1;
            float dis = qdis(v);
            if (dis < d_nearest) {
                nearest = v;
                d_nearest = dis;
            }
        }
        // update stats
        stats.ndis += ndis;
        stats.nhops += 1;

        if (nearest == prev_nearest) {
            return stats;
        }
    }
}
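
// reference_greedy_update_nearest is plain hill descent on one level
// of the graph: scan the neighbor list of the current `nearest`, move
// to any strictly closer neighbor, and stop at a local minimum.
// TEST_greedy_update_nearest further down checks that the production
// faiss::greedy_update_nearest visits the same number of nodes (nhops)
// and computes the same number of distances (ndis).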

std::priority_queue<faiss::HNSW::Node> reference_search_from_candidate_unbounded(
        const faiss::HNSW& hnsw,
        const faiss::HNSW::Node& node,
        faiss::DistanceComputer& qdis,
        int ef,
        faiss::VisitedTable* vt,
        faiss::HNSWStats& stats) {
    int ndis = 0;
    std::priority_queue<faiss::HNSW::Node> top_candidates;
    std::priority_queue<
            faiss::HNSW::Node,
            std::vector<faiss::HNSW::Node>,
            std::greater<faiss::HNSW::Node>>
            candidates;

    top_candidates.push(node);
    candidates.push(node);

    vt->set(node.second);

    while (!candidates.empty()) {
        float d0;
        faiss::HNSW::storage_idx_t v0;
        std::tie(d0, v0) = candidates.top();

        if (d0 > top_candidates.top().first) {
            break;
        }

        candidates.pop();

        size_t begin, end;
        hnsw.neighbor_range(v0, 0, &begin, &end);

        for (size_t j = begin; j < end; ++j) {
            int v1 = hnsw.neighbors[j];

            if (v1 < 0) {
                break;
            }
            if (vt->get(v1)) {
                continue;
            }

            vt->set(v1);

            float d1 = qdis(v1);
            ++ndis;

            if (top_candidates.top().first > d1 || top_candidates.size() < ef) {
                candidates.emplace(d1, v1);
                top_candidates.emplace(d1, v1);

                if (top_candidates.size() > ef) {
                    top_candidates.pop();
                }
            }
        }

        stats.nhops += 1;
    }

    ++stats.n1;
    if (candidates.size() == 0) {
        ++stats.n2;
    }
    stats.ndis += ndis;

    return top_candidates;
}

TEST_F(HNSWTest, TEST_search_from_candidate_unbounded) {
    omp_set_num_threads(1);
    auto nearest = index->hnsw.entry_point;
    float d_nearest = (*dis)(nearest);
    auto node = faiss::HNSW::Node(d_nearest, nearest);
    faiss::VisitedTable vt(index->ntotal);
    faiss::HNSWStats stats;

    // actual version
    auto top_candidates = faiss::search_from_candidate_unbounded(
            index->hnsw, node, *dis, k, &vt, stats);

    auto reference_nearest = index->hnsw.entry_point;
    float reference_d_nearest = (*dis)(nearest);
    auto reference_node =
            faiss::HNSW::Node(reference_d_nearest, reference_nearest);
    faiss::VisitedTable reference_vt(index->ntotal);
    faiss::HNSWStats reference_stats;

    // reference version
    auto reference_top_candidates = reference_search_from_candidate_unbounded(
            index->hnsw,
            reference_node,
            *dis,
            k,
            &reference_vt,
            reference_stats);
    EXPECT_EQ(stats.ndis, reference_stats.ndis);
    EXPECT_EQ(stats.nhops, reference_stats.nhops);
    EXPECT_EQ(stats.n1, reference_stats.n1);
    EXPECT_EQ(stats.n2, reference_stats.n2);
    EXPECT_EQ(top_candidates.size(), reference_top_candidates.size());
}

TEST_F(HNSWTest, TEST_greedy_update_nearest) {
    omp_set_num_threads(1);

    auto nearest = index->hnsw.entry_point;
    float d_nearest = (*dis)(nearest);
    auto reference_nearest = index->hnsw.entry_point;
    float reference_d_nearest = (*dis)(reference_nearest);

    // actual version
    auto stats = faiss::greedy_update_nearest(
            index->hnsw, *dis, 0, nearest, d_nearest);

    // reference version
    auto reference_stats = reference_greedy_update_nearest(
            index->hnsw, *dis, 0, reference_nearest, reference_d_nearest);
    EXPECT_EQ(stats.ndis, reference_stats.ndis);
    EXPECT_EQ(stats.nhops, reference_stats.nhops);
    EXPECT_EQ(stats.n1, reference_stats.n1);
    EXPECT_EQ(stats.n2, reference_stats.n2);
    EXPECT_NEAR(d_nearest, reference_d_nearest, 0.01);
    EXPECT_EQ(nearest, reference_nearest);
}

TEST_F(HNSWTest, TEST_search_from_candidates) {
    omp_set_num_threads(1);

    std::vector<faiss::idx_t> I(k * nq);
    std::vector<float> D(k * nq);
    std::vector<faiss::idx_t> reference_I(k * nq);
    std::vector<float> reference_D(k * nq);
    using RH = faiss::HeapBlockResultHandler<faiss::HNSW::C>;

    faiss::VisitedTable vt(index->ntotal);
    faiss::VisitedTable reference_vt(index->ntotal);
    int num_candidates = 10;
    faiss::HNSW::MinimaxHeap candidates(num_candidates);
    faiss::HNSW::MinimaxHeap reference_candidates(num_candidates);

    for (int i = 0; i < num_candidates; i++) {
        vt.set(i);
        reference_vt.set(i);
        candidates.push(i, (*dis)(i));
        reference_candidates.push(i, (*dis)(i));
    }

    faiss::HNSWStats stats;
    RH bres(nq, D.data(), I.data(), k);
    faiss::HeapBlockResultHandler<faiss::HNSW::C>::SingleResultHandler res(
            bres);

    res.begin(0);
    faiss::search_from_candidates(
            index->hnsw, *dis, res, candidates, vt, stats, 0, 0, nullptr);
    res.end();

    faiss::HNSWStats reference_stats;
    RH reference_bres(nq, reference_D.data(), reference_I.data(), k);
    faiss::HeapBlockResultHandler<faiss::HNSW::C>::SingleResultHandler
            reference_res(reference_bres);
    reference_res.begin(0);
    reference_search_from_candidates(
            index->hnsw,
            *dis,
            reference_res,
            reference_candidates,
            reference_vt,
            reference_stats,
            0,
            0,
            nullptr);
    reference_res.end();
    for (int i = 0; i < nq; i++) {
        for (int j = 0; j < k; j++) {
            EXPECT_NEAR(I[i * k + j], reference_I[i * k + j], 0.1);
            EXPECT_NEAR(D[i * k + j], reference_D[i * k + j], 0.1);
        }
    }
    EXPECT_EQ(reference_stats.ndis, stats.ndis);
    EXPECT_EQ(reference_stats.nhops, stats.nhops);
    EXPECT_EQ(reference_stats.n1, stats.n1);
    EXPECT_EQ(reference_stats.n2, stats.n2);
}

TEST_F(HNSWTest, TEST_search_neighbors_to_add) {
    omp_set_num_threads(1);

    faiss::VisitedTable vt(index->ntotal);
    faiss::VisitedTable reference_vt(index->ntotal);

    std::priority_queue<faiss::HNSW::NodeDistCloser> link_targets;
    std::priority_queue<faiss::HNSW::NodeDistCloser> reference_link_targets;

    faiss::search_neighbors_to_add(
            index->hnsw,
            *dis,
            link_targets,
            index->hnsw.entry_point,
            (*dis)(index->hnsw.entry_point),
            index->hnsw.max_level,
            vt,
            false);

    faiss::search_neighbors_to_add(
            index->hnsw,
            *dis,
            reference_link_targets,
            index->hnsw.entry_point,
            (*dis)(index->hnsw.entry_point),
            index->hnsw.max_level,
            reference_vt,
            true);

    EXPECT_EQ(link_targets.size(), reference_link_targets.size());
    while (!link_targets.empty()) {
        auto val = link_targets.top();
        auto reference_val = reference_link_targets.top();
        EXPECT_EQ(val.d, reference_val.d);
        EXPECT_EQ(val.id, reference_val.id);
        link_targets.pop();
        reference_link_targets.pop();
    }
}

TEST_F(HNSWTest, TEST_nb_neighbors_bound) {
    omp_set_num_threads(1);
    EXPECT_EQ(index->hnsw.nb_neighbors(0), 8);
    EXPECT_EQ(index->hnsw.nb_neighbors(1), 4);
    EXPECT_EQ(index->hnsw.nb_neighbors(2), 4);
    EXPECT_EQ(index->hnsw.nb_neighbors(3), 4);
    // picking a large number to trigger an exception based on bounds checking
    EXPECT_THROW(index->hnsw.nb_neighbors(100), faiss::FaissException);
}

TEST_F(HNSWTest, TEST_search_level_0) {
    omp_set_num_threads(1);
    std::vector<faiss::idx_t> I(k * nq);
    std::vector<float> D(k * nq);

    using RH = faiss::HeapBlockResultHandler<faiss::HNSW::C>;
    RH bres1(nq, D.data(), I.data(), k);
    faiss::HeapBlockResultHandler<faiss::HNSW::C>::SingleResultHandler res1(
            bres1);
    RH bres2(nq, D.data(), I.data(), k);
    faiss::HeapBlockResultHandler<faiss::HNSW::C>::SingleResultHandler res2(
            bres2);

    faiss::HNSWStats stats1, stats2;
    faiss::VisitedTable vt1(index->ntotal);
    faiss::VisitedTable vt2(index->ntotal);
    auto nprobe = 5;
    const faiss::HNSW::storage_idx_t values[] = {1, 2, 3, 4, 5};
    const faiss::HNSW::storage_idx_t* nearest_i = values;
    const float distances[] = {0.1, 0.2, 0.3, 0.4, 0.5};
    const float* nearest_d = distances;

    // search_type == 1
    res1.begin(0);
    index->hnsw.search_level_0(
            *dis, res1, nprobe, nearest_i, nearest_d, 1, stats1, vt1, nullptr);
    res1.end();

    // search_type == 2
    res2.begin(0);
    index->hnsw.search_level_0(
            *dis, res2, nprobe, nearest_i, nearest_d, 2, stats2, vt2, nullptr);
    res2.end();

    // search_type 1 calls search_from_candidates in a loop nprobe times.
    // search_type 2 pushes the candidates and calls search_from_candidates
    // just once, so its stats are much lower.
    EXPECT_GT(stats1.ndis, stats2.ndis);
    EXPECT_GT(stats1.nhops, stats2.nhops);
    EXPECT_GT(stats1.n1, stats2.n1);
    EXPECT_GT(stats1.n2, stats2.n2);
}
254
packages/leann-backend-hnsw/third_party/faiss/tests/test_ivf_index.cpp
vendored
Normal file
254
packages/leann-backend-hnsw/third_party/faiss/tests/test_ivf_index.cpp
vendored
Normal file
@@ -0,0 +1,254 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <omp.h>

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <map>
#include <random>
#include <set>
#include <unordered_map>
#include <utility>
#include <vector>

#include <gtest/gtest.h>

#include <faiss/IndexFlat.h>
#include <faiss/IndexIVFFlat.h>
#include <faiss/impl/FaissAssert.h>

namespace {

// stores all ivf codes; used to verify that the context
// object is passed to the iterator
class TestContext {
   public:
    TestContext() {}

    void save_code(size_t list_no, const uint8_t* code, size_t code_size) {
        list_nos.emplace(id, list_no);
        codes.emplace(id, std::vector<uint8_t>(code_size));
        for (size_t i = 0; i < code_size; i++) {
            codes[id][i] = code[i];
        }
        id++;
    }

    // id to codes map
    std::unordered_map<faiss::idx_t, std::vector<uint8_t>> codes;
    // id to list_no map
    std::unordered_map<faiss::idx_t, size_t> list_nos;
    faiss::idx_t id = 0;
    std::set<size_t> lists_probed;
};

// the iterator that iterates over the codes stored in the context object
class TestInvertedListIterator : public faiss::InvertedListsIterator {
   public:
    TestInvertedListIterator(size_t list_no, TestContext* context)
            : list_no{list_no}, context{context} {
        it = context->codes.cbegin();
        seek_next();
    }
    ~TestInvertedListIterator() override {}

    // move the cursor to the first valid entry
    void seek_next() {
        while (it != context->codes.cend() &&
               context->list_nos[it->first] != list_no) {
            it++;
        }
    }

    virtual bool is_available() const override {
        return it != context->codes.cend();
    }

    virtual void next() override {
        it++;
        seek_next();
    }

    virtual std::pair<faiss::idx_t, const uint8_t*> get_id_and_codes()
            override {
        if (it == context->codes.cend()) {
            FAISS_THROW_MSG("invalid state");
        }
        return std::make_pair(it->first, it->second.data());
    }

   private:
    size_t list_no;
    TestContext* context;
    decltype(context->codes.cbegin()) it;
};

class TestInvertedLists : public faiss::InvertedLists {
   public:
    TestInvertedLists(size_t nlist, size_t code_size)
            : faiss::InvertedLists(nlist, code_size) {
        use_iterator = true;
    }

    ~TestInvertedLists() override {}
    size_t list_size(size_t /*list_no*/) const override {
        FAISS_THROW_MSG("unexpected call");
    }

    faiss::InvertedListsIterator* get_iterator(size_t list_no, void* context)
            const override {
        auto testContext = (TestContext*)context;
        testContext->lists_probed.insert(list_no);
        return new TestInvertedListIterator(list_no, testContext);
    }

    const uint8_t* get_codes(size_t /* list_no */) const override {
        FAISS_THROW_MSG("unexpected call");
    }

    const faiss::idx_t* get_ids(size_t /* list_no */) const override {
        FAISS_THROW_MSG("unexpected call");
    }

    // store the codes in the context object
    size_t add_entry(
            size_t list_no,
            faiss::idx_t /*theid*/,
            const uint8_t* code,
            void* context) override {
        auto testContext = (TestContext*)context;
        testContext->save_code(list_no, code, code_size);
        return 0;
    }

    size_t add_entries(
            size_t /*list_no*/,
            size_t /*n_entry*/,
            const faiss::idx_t* /*ids*/,
            const uint8_t* /*code*/) override {
        FAISS_THROW_MSG("unexpected call");
    }

    void update_entries(
            size_t /*list_no*/,
            size_t /*offset*/,
            size_t /*n_entry*/,
            const faiss::idx_t* /*ids*/,
            const uint8_t* /*code*/) override {
        FAISS_THROW_MSG("unexpected call");
    }

    void resize(size_t /*list_no*/, size_t /*new_size*/) override {
        FAISS_THROW_MSG("unexpected call");
    }
};
} // namespace
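
// How the pieces above fit together (summary added for clarity): the
// opaque `void* context` pointer that IndexIVFFlat::add_core() and the
// search parameters carry is forwarded to add_entry() and
// get_iterator(). add_entry() records each code in the TestContext
// instead of a real inverted list; get_iterator() replays the stored
// codes for one list and logs every probed list in `lists_probed`.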

TEST(IVF, list_context) {
    // this test verifies that the context object is passed
    // to the InvertedListsIterator and InvertedLists::add_entry.
    // the test InvertedLists and InvertedListsIterator read from and
    // write to the test context object.
    // the test verifies the context object is modified as expected.

    constexpr int d = 32;      // dimension
    constexpr int nb = 100000; // database size
    constexpr int nlist = 100;

    std::mt19937 rng;
    std::uniform_real_distribution<> distrib;

    // disable parallelism, or we would need to make the Context object
    // thread-safe
    omp_set_num_threads(1);

    faiss::IndexFlatL2 quantizer(d); // the coarse quantizer
    faiss::IndexIVFFlat index(&quantizer, d, nlist);
    TestInvertedLists inverted_lists(nlist, index.code_size);
    index.replace_invlists(&inverted_lists);
    {
        // training
        constexpr size_t nt = 1500; // nb of training vectors
        std::vector<float> trainvecs(nt * d);
        for (size_t i = 0; i < nt * d; i++) {
            trainvecs[i] = distrib(rng);
        }
        index.verbose = true;
        index.train(nt, trainvecs.data());
    }
    TestContext context;
    std::vector<float> query_vector;
    constexpr faiss::idx_t query_vector_id = 100;
    {
        // populating the database
        std::vector<float> database(nb * d);
        for (size_t i = 0; i < nb * d; i++) {
            database[i] = distrib(rng);
            // populate the query vector
            if (i >= query_vector_id * d && i < query_vector_id * d + d) {
                query_vector.push_back(database[i]);
            }
        }
        std::vector<faiss::idx_t> coarse_idx(nb);
        index.quantizer->assign(nb, database.data(), coarse_idx.data());
        // pass dummy ids; the actual ids are assigned in the TestContext
        // object
        std::vector<faiss::idx_t> xids(nb, 42);
        index.add_core(
                nb, database.data(), xids.data(), coarse_idx.data(), &context);

        // check that the context object got updated
        EXPECT_EQ(nb, context.id) << "should have added all ids";
        EXPECT_EQ(nb, context.codes.size())
                << "should have correct number of codes";
        EXPECT_EQ(nb, context.list_nos.size())
                << "should have correct number of list numbers";
    }
    {
        constexpr size_t num_vecs = 5; // number of vectors
        std::vector<float> vecs(num_vecs * d);
        for (size_t i = 0; i < num_vecs * d; i++) {
            vecs[i] = distrib(rng);
        }
        const size_t codeSize = index.sa_code_size();
        std::vector<uint8_t> encodedData(num_vecs * codeSize);
        index.sa_encode(num_vecs, vecs.data(), encodedData.data());
        std::vector<float> decodedVecs(num_vecs * d);
        index.sa_decode(num_vecs, encodedData.data(), decodedVecs.data());
        EXPECT_EQ(vecs, decodedVecs)
                << "decoded vectors should be the same as the original vectors that were encoded";
    }
    {
        constexpr faiss::idx_t k = 100;
        constexpr size_t nprobe = 10;
        std::vector<float> distances(k);
        std::vector<faiss::idx_t> labels(k);
        faiss::SearchParametersIVF params;
        params.inverted_list_context = &context;
        params.nprobe = nprobe;
        index.search(
                1,
                query_vector.data(),
                k,
                distances.data(),
                labels.data(),
                &params);
        EXPECT_EQ(nprobe, context.lists_probed.size())
                << "should probe nprobe lists";

        // check that the result contains the query vector; the
        // probability of this failing is low
        auto query_vector_listno = context.list_nos[query_vector_id];
        auto& lists_probed = context.lists_probed;
        EXPECT_TRUE(
                std::find(
                        lists_probed.cbegin(),
                        lists_probed.cend(),
                        query_vector_listno) != lists_probed.cend())
                << "should probe the list of the query vector";
        EXPECT_TRUE(
                std::find(labels.cbegin(), labels.cend(), query_vector_id) !=
                labels.cend())
                << "should return the query vector";
    }
}
85
packages/leann-backend-hnsw/third_party/faiss/tests/test_ivfpq_codec.cpp
vendored
Normal file
85
packages/leann-backend-hnsw/third_party/faiss/tests/test_ivfpq_codec.cpp
vendored
Normal file
@@ -0,0 +1,85 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <random>
#include <vector>

#include <omp.h>

#include <gtest/gtest.h>

#include <faiss/IndexFlat.h>
#include <faiss/IndexIVFPQ.h>
#include <faiss/utils/distances.h>

namespace {

// dimension of the vectors to index
int d = 64;

// size of the database we plan to index
size_t nb = 8000;

double eval_codec_error(long ncentroids, long m, const std::vector<float>& v) {
    faiss::IndexFlatL2 coarse_quantizer(d);
    faiss::IndexIVFPQ index(&coarse_quantizer, d, ncentroids, m, 8);
    index.pq.cp.niter = 10; // speed up train
    index.train(nb, v.data());

    // encode and decode to compute the reconstruction error

    std::vector<faiss::idx_t> keys(nb);
    std::vector<uint8_t> codes(nb * m);
    index.encode_multiple(nb, keys.data(), v.data(), codes.data(), true);

    std::vector<float> v2(nb * d);
    index.decode_multiple(nb, keys.data(), codes.data(), v2.data());

    return faiss::fvec_L2sqr(v.data(), v2.data(), nb * d);
}

} // namespace
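
// Size intuition for the expectations below (illustrative, not from
// the upstream file): eval_codec_error() returns the summed squared
// reconstruction error over all nb vectors. With m = 8 sub-quantizers
// of 8 bits each, a 64-d float vector (256 bytes) compresses to an
// 8-byte PQ code; raising m to 16 doubles the code size and quantizes
// finer sub-vectors, so both the more-centroids and the more-codes
// configurations must come out strictly more accurate than the
// baseline.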
|
||||
|
||||
bool runs_on_sandcastle() {
|
||||
// see discussion here https://fburl.com/qc5kpdo2
|
||||
const char* sandcastle = getenv("SANDCASTLE");
|
||||
if (sandcastle && !strcmp(sandcastle, "1")) {
|
||||
return true;
|
||||
}
|
||||
const char* tw_job_user = getenv("TW_JOB_USER");
|
||||
if (tw_job_user && !strcmp(tw_job_user, "sandcastle")) {
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
TEST(IVFPQ, codec) {
|
||||
std::vector<float> database(nb * d);
|
||||
std::mt19937 rng;
|
||||
std::uniform_real_distribution<> distrib;
|
||||
for (size_t i = 0; i < nb * d; i++) {
|
||||
database[i] = distrib(rng);
|
||||
}
|
||||
|
||||
// limit number of threads when running on heavily parallelized test
|
||||
// environment
|
||||
if (runs_on_sandcastle()) {
|
||||
omp_set_num_threads(2);
|
||||
}
|
||||
|
||||
double err0 = eval_codec_error(16, 8, database);
|
||||
|
||||
// should be more accurate as there are more coarse centroids
|
||||
double err1 = eval_codec_error(128, 8, database);
|
||||
EXPECT_GT(err0, err1);
|
||||
|
||||
// should be more accurate as there are more PQ codes
|
||||
double err2 = eval_codec_error(16, 16, database);
|
||||
EXPECT_GT(err0, err2);
|
||||
}
93
packages/leann-backend-hnsw/third_party/faiss/tests/test_ivfpq_indexing.cpp
vendored
Normal file
@@ -0,0 +1,93 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <cstdio>
#include <cstdlib>
#include <random>

#include <gtest/gtest.h>

#include <faiss/IndexFlat.h>
#include <faiss/IndexIVFPQ.h>

TEST(IVFPQ, accuracy) {
    // dimension of the vectors to index
    int d = 64;

    // size of the database we plan to index
    size_t nb = 1000;

    // make a set of nt training vectors in the unit cube
    // (could be the database)
    size_t nt = 1500;

    // make the index object and train it
    faiss::IndexFlatL2 coarse_quantizer(d);

    // a reasonable number of centroids to index nb vectors
    int ncentroids = 25;

    faiss::IndexIVFPQ index(&coarse_quantizer, d, ncentroids, 16, 8);

    // index that gives the ground-truth
    faiss::IndexFlatL2 index_gt(d);

    std::mt19937 rng;
    std::uniform_real_distribution<> distrib;

    { // training
        std::vector<float> trainvecs(nt * d);
        for (size_t i = 0; i < nt * d; i++) {
            trainvecs[i] = distrib(rng);
        }
        index.verbose = true;
        index.train(nt, trainvecs.data());
    }

    { // populating the database
        std::vector<float> database(nb * d);
        for (size_t i = 0; i < nb * d; i++) {
            database[i] = distrib(rng);
        }

        index.add(nb, database.data());
        index_gt.add(nb, database.data());
    }

    int nq = 200;
    int n_ok;

    { // searching the database
        std::vector<float> queries(nq * d);
        for (size_t i = 0; i < nq * d; i++) {
            queries[i] = distrib(rng);
        }

        std::vector<faiss::idx_t> gt_nns(nq);
        std::vector<float> gt_dis(nq);

        index_gt.search(nq, queries.data(), 1, gt_dis.data(), gt_nns.data());

        index.nprobe = 5;
        int k = 5;
        std::vector<faiss::idx_t> nns(k * nq);
        std::vector<float> dis(k * nq);

        index.search(nq, queries.data(), k, dis.data(), nns.data());

        n_ok = 0;
        for (int q = 0; q < nq; q++) {
            for (int i = 0; i < k; i++)
                if (nns[q * k + i] == gt_nns[q])
                    n_ok++;
        }
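        // n_ok counts the queries whose exact nearest neighbor (from
        // index_gt) appears in the IVFPQ top-k; at most one hit per query
        // is possible, so this is recall@k, and 0.4 is a loose lower
        // bound for nprobe = 5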
        EXPECT_GT(n_ok, nq * 0.4);
    }
}
549
packages/leann-backend-hnsw/third_party/faiss/tests/test_lowlevel_ivf.cpp
vendored
Normal file
@@ -0,0 +1,549 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <cinttypes>
#include <cstdio>
#include <cstdlib>

#include <memory>
#include <random>
#include <thread>
#include <vector>

#include <gtest/gtest.h>

#include <faiss/AutoTune.h>
#include <faiss/IVFlib.h>
#include <faiss/IndexBinaryIVF.h>
#include <faiss/IndexIVF.h>
#include <faiss/IndexPreTransform.h>
#include <faiss/index_factory.h>

using namespace faiss;

namespace {

// dimension of the vectors to index
int d = 32;

// nb of training vectors
size_t nt = 5000;

// size of the database points per window step
size_t nb = 1000;

// nb of queries
size_t nq = 200;

int k = 10;

std::mt19937 rng;

std::vector<float> make_data(size_t n) {
    std::vector<float> database(n * d);
    std::uniform_real_distribution<> distrib;
    for (size_t i = 0; i < n * d; i++) {
        database[i] = distrib(rng);
    }
    return database;
}

std::unique_ptr<Index> make_trained_index(
        const char* index_type,
        MetricType metric_type) {
    auto index =
            std::unique_ptr<Index>(index_factory(d, index_type, metric_type));
    auto xt = make_data(nt);
    index->train(nt, xt.data());
    ParameterSpace().set_index_parameter(index.get(), "nprobe", 4);
    return index;
}

std::vector<idx_t> search_index(Index* index, const float* xq) {
    std::vector<idx_t> I(k * nq);
    std::vector<float> D(k * nq);
    index->search(nq, xq, k, D.data(), I.data());
    return I;
}

/*************************************************************
 * Test functions for a given index type
 *************************************************************/

void test_lowlevel_access(const char* index_key, MetricType metric) {
    std::unique_ptr<Index> index = make_trained_index(index_key, metric);

    auto xb = make_data(nb);
    index->add(nb, xb.data());

    /** handle the case where we have a preprocessor */

    const IndexPreTransform* index_pt =
            dynamic_cast<const IndexPreTransform*>(index.get());

    int dt = index->d;
    const float* xbt = xb.data();
    std::unique_ptr<float[]> del_xbt;

    if (index_pt) {
        dt = index_pt->index->d;
        xbt = index_pt->apply_chain(nb, xb.data());
        if (xbt != xb.data()) {
            del_xbt.reset((float*)xbt);
        }
    }

    IndexIVF* index_ivf = ivflib::extract_index_ivf(index.get());

    /** Test independent encoding
     *
     * Makes it possible to do additions on a custom inverted list
     * implementation. From a set of vectors, computes the inverted
     * list ids + the codes corresponding to each vector.
     */

    std::vector<idx_t> list_nos(nb);
    std::vector<uint8_t> codes(index_ivf->code_size * nb);
    index_ivf->quantizer->assign(nb, xbt, list_nos.data());
    index_ivf->encode_vectors(nb, xbt, list_nos.data(), codes.data());

    // compare with normal IVF addition

    const InvertedLists* il = index_ivf->invlists;

    for (int list_no = 0; list_no < index_ivf->nlist; list_no++) {
        InvertedLists::ScopedCodes ivf_codes(il, list_no);
        InvertedLists::ScopedIds ivf_ids(il, list_no);
        size_t list_size = il->list_size(list_no);
        for (int i = 0; i < list_size; i++) {
            const uint8_t* ref_code = ivf_codes.get() + i * il->code_size;
            const uint8_t* new_code = codes.data() + ivf_ids[i] * il->code_size;
            EXPECT_EQ(memcmp(ref_code, new_code, il->code_size), 0);
        }
    }

    /** Test independent search
     *
     * Manually scans through inverted lists, computing distances and
     * ordering results organized in a heap.
     */

    // sample some example queries and get reference search results.
    auto xq = make_data(nq);
    auto ref_I = search_index(index.get(), xq.data());

    // handle preprocessing
    const float* xqt = xq.data();
    std::unique_ptr<float[]> del_xqt;

    if (index_pt) {
        xqt = index_pt->apply_chain(nq, xq.data());
        if (xqt != xq.data()) {
            del_xqt.reset((float*)xqt);
        }
    }

    // quantize the queries to get the inverted list ids to visit.
    int nprobe = index_ivf->nprobe;

    std::vector<idx_t> q_lists(nq * nprobe);
    std::vector<float> q_dis(nq * nprobe);

    index_ivf->quantizer->search(nq, xqt, nprobe, q_dis.data(), q_lists.data());

    // object that does the scanning and distance computations.
    std::unique_ptr<InvertedListScanner> scanner(
            index_ivf->get_InvertedListScanner());

    for (int i = 0; i < nq; i++) {
        std::vector<idx_t> I(k, -1);
        float default_dis = metric == METRIC_L2 ? HUGE_VAL : -HUGE_VAL;
        std::vector<float> D(k, default_dis);

        scanner->set_query(xqt + i * dt);

        for (int j = 0; j < nprobe; j++) {
            int list_no = q_lists[i * nprobe + j];
            if (list_no < 0)
                continue;
            scanner->set_list(list_no, q_dis[i * nprobe + j]);

            // here we get the inverted lists from the InvertedLists
            // object but they could come from anywhere

            scanner->scan_codes(
                    il->list_size(list_no),
                    InvertedLists::ScopedCodes(il, list_no).get(),
                    InvertedLists::ScopedIds(il, list_no).get(),
                    D.data(),
                    I.data(),
                    k);

            if (j == 0) {
                // all results so far come from list_no, so let's check if
                // the distance function works
                for (int jj = 0; jj < k; jj++) {
                    int vno = I[jj];
                    if (vno < 0)
                        break; // heap is not full yet

                    // we have the codes from the addition test
                    float computed_D = scanner->distance_to_code(
                            codes.data() + vno * il->code_size);

                    EXPECT_FLOAT_EQ(computed_D, D[jj]);
                }
            }
        }

        // re-order heap
        if (metric == METRIC_L2) {
            maxheap_reorder(k, D.data(), I.data());
        } else {
            minheap_reorder(k, D.data(), I.data());
        }

        // check that we have the same results as the reference search
        for (int j = 0; j < k; j++) {
            EXPECT_EQ(I[j], ref_I[i * k + j]);
        }
    }
}
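
// Note: the low-level scanning protocol exercised above is set_query(q)
// once per query, then, for each probed list, set_list(list_no, coarse_dis)
// followed by scan_codes(...), which updates the caller-owned (D, I) heap
// of size k; the heap must be reordered before the results can be compared
// to a regular search.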

} // anonymous namespace

/*************************************************************
 * Test entry points
 *************************************************************/

TEST(TestLowLevelIVF, IVFFlatL2) {
    test_lowlevel_access("IVF32,Flat", METRIC_L2);
}

TEST(TestLowLevelIVF, PCAIVFFlatL2) {
    test_lowlevel_access("PCAR16,IVF32,Flat", METRIC_L2);
}

TEST(TestLowLevelIVF, IVFFlatIP) {
    test_lowlevel_access("IVF32,Flat", METRIC_INNER_PRODUCT);
}

TEST(TestLowLevelIVF, IVFSQL2) {
    test_lowlevel_access("IVF32,SQ8", METRIC_L2);
}

TEST(TestLowLevelIVF, IVFSQIP) {
    test_lowlevel_access("IVF32,SQ8", METRIC_INNER_PRODUCT);
}

TEST(TestLowLevelIVF, IVFPQL2) {
    test_lowlevel_access("IVF32,PQ4np", METRIC_L2);
}

TEST(TestLowLevelIVF, IVFPQIP) {
    test_lowlevel_access("IVF32,PQ4np", METRIC_INNER_PRODUCT);
}

/*************************************************************
 * Same for binary (a bit simpler)
 *************************************************************/

namespace {

int nbit = 256;

// d is reused here as a byte count: d == nbit / 8, so the loop below
// fills exactly n * nbit / 8 bytes

std::vector<uint8_t> make_data_binary(size_t n) {
    std::vector<uint8_t> database(n * nbit / 8);
    std::uniform_int_distribution<> distrib;
    for (size_t i = 0; i < n * d; i++) {
        database[i] = distrib(rng);
    }
    return database;
}

std::unique_ptr<IndexBinary> make_trained_index_binary(const char* index_type) {
    auto index = std::unique_ptr<IndexBinary>(
            index_binary_factory(nbit, index_type));
    auto xt = make_data_binary(nt);
    index->train(nt, xt.data());
    return index;
}

void test_lowlevel_access_binary(const char* index_key) {
    std::unique_ptr<IndexBinary> index = make_trained_index_binary(index_key);

    IndexBinaryIVF* index_ivf = dynamic_cast<IndexBinaryIVF*>(index.get());
    assert(index_ivf);

    index_ivf->nprobe = 4;

    auto xb = make_data_binary(nb);
    index->add(nb, xb.data());

    std::vector<idx_t> list_nos(nb);
    index_ivf->quantizer->assign(nb, xb.data(), list_nos.data());

    /* For binary there is no test for encoding because binary vectors
     * are copied verbatim to the inverted lists */

    const InvertedLists* il = index_ivf->invlists;

    /** Test independent search
     *
     * Manually scans through inverted lists, computing distances and
     * ordering results organized in a heap.
     */

    // sample some example queries and get reference search results.
    auto xq = make_data_binary(nq);

    std::vector<idx_t> I_ref(k * nq);
    std::vector<int32_t> D_ref(k * nq);
    index->search(nq, xq.data(), k, D_ref.data(), I_ref.data());

    // quantize the queries to get the inverted list ids to visit.
    int nprobe = index_ivf->nprobe;

    std::vector<idx_t> q_lists(nq * nprobe);
    std::vector<int32_t> q_dis(nq * nprobe);

    // quantize queries
    index_ivf->quantizer->search(
            nq, xq.data(), nprobe, q_dis.data(), q_lists.data());

    // object that does the scanning and distance computations.
    std::unique_ptr<BinaryInvertedListScanner> scanner(
            index_ivf->get_InvertedListScanner());

    for (int i = 0; i < nq; i++) {
        std::vector<idx_t> I(k, -1);
        uint32_t default_dis = 1 << 30;
        std::vector<int32_t> D(k, default_dis);

        scanner->set_query(xq.data() + i * index_ivf->code_size);

        for (int j = 0; j < nprobe; j++) {
            int list_no = q_lists[i * nprobe + j];
            if (list_no < 0)
                continue;
            scanner->set_list(list_no, q_dis[i * nprobe + j]);

            // here we get the inverted lists from the InvertedLists
            // object but they could come from anywhere

            scanner->scan_codes(
                    il->list_size(list_no),
                    InvertedLists::ScopedCodes(il, list_no).get(),
                    InvertedLists::ScopedIds(il, list_no).get(),
                    D.data(),
                    I.data(),
                    k);

            if (j == 0) {
                // all results so far come from list_no, so let's check if
                // the distance function works; here the codes are just the
                // raw database vectors
                for (int jj = 0; jj < k; jj++) {
                    int vno = I[jj];
                    if (vno < 0)
                        break; // heap is not full yet

                    float computed_D = scanner->distance_to_code(
                            xb.data() + vno * il->code_size);

                    EXPECT_EQ(computed_D, D[jj]);
                }
            }
        }

        // re-order heap
        heap_reorder<CMax<int32_t, idx_t>>(k, D.data(), I.data());

        // check that we have the same results as the reference search
        for (int j = 0; j < k; j++) {
            // here the order is not guaranteed to be the same,
            // so we scan through the ref results
            // EXPECT_EQ (I[j], I_ref[i * k + j]);
            EXPECT_LE(D[j], D_ref[i * k + k - 1]);
            if (D[j] < D_ref[i * k + k - 1]) {
                int j2 = 0;
                while (j2 < k) {
                    if (I[j] == I_ref[i * k + j2])
                        break;
                    j2++;
                }
                EXPECT_LT(j2, k); // it was found
                if (j2 < k) {
                    EXPECT_EQ(D[j], D_ref[i * k + j2]);
                }
            }
        }
    }
}

} // anonymous namespace

TEST(TestLowLevelIVF, IVFBinary) {
    test_lowlevel_access_binary("BIVF32");
}

namespace {

void test_threaded_search(const char* index_key, MetricType metric) {
    std::unique_ptr<Index> index = make_trained_index(index_key, metric);

    auto xb = make_data(nb);
    index->add(nb, xb.data());

    /** handle the case where we have a preprocessor */

    const IndexPreTransform* index_pt =
            dynamic_cast<const IndexPreTransform*>(index.get());

    int dt = index->d;
    const float* xbt = xb.data();
    std::unique_ptr<float[]> del_xbt;

    if (index_pt) {
        dt = index_pt->index->d;
        xbt = index_pt->apply_chain(nb, xb.data());
        if (xbt != xb.data()) {
            del_xbt.reset((float*)xbt);
        }
    }

    IndexIVF* index_ivf = ivflib::extract_index_ivf(index.get());

    /** Test independent search
     *
     * Manually scans through inverted lists, computing distances and
     * ordering results organized in a heap.
     */

    // sample some example queries and get reference search results.
    auto xq = make_data(nq);
    auto ref_I = search_index(index.get(), xq.data());

    // handle preprocessing
    const float* xqt = xq.data();
    std::unique_ptr<float[]> del_xqt;

    if (index_pt) {
        xqt = index_pt->apply_chain(nq, xq.data());
        if (xqt != xq.data()) {
            del_xqt.reset((float*)xqt);
        }
    }

    // quantize the queries to get the inverted list ids to visit.
    int nprobe = index_ivf->nprobe;

    std::vector<idx_t> q_lists(nq * nprobe);
    std::vector<float> q_dis(nq * nprobe);

    index_ivf->quantizer->search(nq, xqt, nprobe, q_dis.data(), q_lists.data());

    // now run the search in this many threads
    int nproc = 3;

    for (int i = 0; i < nq; i++) {
        // one result table per thread
        std::vector<idx_t> I(k * nproc, -1);
        float default_dis = metric == METRIC_L2 ? HUGE_VAL : -HUGE_VAL;
        std::vector<float> D(k * nproc, default_dis);

        auto search_function = [index_ivf,
                                &I,
                                &D,
                                dt,
                                i,
                                nproc,
                                xqt,
                                nprobe,
                                &q_dis,
                                &q_lists](int rank) {
            const InvertedLists* il = index_ivf->invlists;

            // object that does the scanning and distance computations.
            std::unique_ptr<InvertedListScanner> scanner(
                    index_ivf->get_InvertedListScanner());

            idx_t* local_I = I.data() + rank * k;
            float* local_D = D.data() + rank * k;

            scanner->set_query(xqt + i * dt);

            for (int j = rank; j < nprobe; j += nproc) {
                int list_no = q_lists[i * nprobe + j];
                if (list_no < 0)
                    continue;
                scanner->set_list(list_no, q_dis[i * nprobe + j]);

                scanner->scan_codes(
                        il->list_size(list_no),
                        InvertedLists::ScopedCodes(il, list_no).get(),
                        InvertedLists::ScopedIds(il, list_no).get(),
                        local_D,
                        local_I,
                        k);
            }
        };

        // start the threads. Threads are numbered rank=0..nproc-1 (a la MPI)
        // thread rank takes care of inverted lists
        // rank, rank+nproc, rank+2*nproc,...
        std::vector<std::thread> threads;
        for (int rank = 0; rank < nproc; rank++) {
            threads.emplace_back(search_function, rank);
        }

        // join threads, merge heaps
        for (int rank = 0; rank < nproc; rank++) {
            threads[rank].join();
            if (rank == 0)
                continue; // nothing to merge
            // merge into the first result
            if (metric == METRIC_L2) {
                maxheap_addn(
                        k,
                        D.data(),
                        I.data(),
                        D.data() + rank * k,
                        I.data() + rank * k,
                        k);
            } else {
                minheap_addn(
                        k,
                        D.data(),
                        I.data(),
                        D.data() + rank * k,
                        I.data() + rank * k,
                        k);
            }
        }

        // re-order heap
        if (metric == METRIC_L2) {
            maxheap_reorder(k, D.data(), I.data());
        } else {
            minheap_reorder(k, D.data(), I.data());
        }

        // check that we have the same results as the reference search
        for (int j = 0; j < k; j++) {
            EXPECT_EQ(I[j], ref_I[i * k + j]);
        }
    }
}
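
// Note: the per-thread heaps can be merged with maxheap_addn / minheap_addn
// because each thread scans a disjoint subset of the probed lists, so the
// union of the nproc partial top-k heaps contains the global top-k.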

} // namespace

TEST(TestLowLevelIVF, ThreadedSearch) {
    test_threaded_search("IVF32,Flat", METRIC_L2);
}
66
packages/leann-backend-hnsw/third_party/faiss/tests/test_mem_leak.cpp
vendored
Normal file
@@ -0,0 +1,66 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <faiss/IndexFlat.h>
#include <faiss/IndexIVFFlat.h>
#include <faiss/utils/random.h>
#include <faiss/utils/utils.h>

#include <gtest/gtest.h>

using namespace faiss;

TEST(TestMemoryLeak, ivfflat) {
    size_t num_tfidf_faiss_cells = 20;
    size_t max_tfidf_features = 500;

    IndexFlatIP quantizer(max_tfidf_features);
    IndexIVFFlat tfidf_faiss_index(
            &quantizer, max_tfidf_features, num_tfidf_faiss_cells);

    std::vector<float> dense_matrix(5000 * max_tfidf_features);
    float_rand(dense_matrix.data(), dense_matrix.size(), 123);

    tfidf_faiss_index.train(5000, dense_matrix.data());
    tfidf_faiss_index.add(5000, dense_matrix.data());

    int N1 = 1000;
    int N2 = 10000;

    std::vector<float> ent_substr_tfidfs_list(N1 * max_tfidf_features);
    float_rand(
            ent_substr_tfidfs_list.data(), ent_substr_tfidfs_list.size(), 1234);

    for (int bs : {1, 4, 16}) {
        size_t m0 = get_mem_usage_kb();
        double t0 = getmillisecs();

        for (int i = 0; i < N2; i++) {
            std::vector<idx_t> I(10 * bs);
            std::vector<float> D(10 * bs);

            tfidf_faiss_index.search(
                    bs,
                    ent_substr_tfidfs_list.data() +
                            (i % (N1 - bs + 1)) * max_tfidf_features,
                    10,
                    D.data(),
                    I.data());
            if (i % 100 == 0) {
                printf("[%.2f s] BS %d %d: %ld kB %.2f bytes/it\r",
                       (getmillisecs() - t0) / 1000,
                       bs,
                       i,
                       get_mem_usage_kb(),
                       (get_mem_usage_kb() - m0) * 1024.0 / (i + 1));
                fflush(stdout);
            }
        }
        printf("\n");
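        // the check below asserts that the average memory growth per search
        // iteration stays below 50 * bs bytes, i.e. that repeated searches
        // do not keep accumulating memory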
        EXPECT_GE(50 * bs, (get_mem_usage_kb() - m0) * 1024.0 / N2);
    }
}
246
packages/leann-backend-hnsw/third_party/faiss/tests/test_merge.cpp
vendored
Normal file
@@ -0,0 +1,246 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <cstdio>
#include <random>

#include <gtest/gtest.h>

#include <faiss/IVFlib.h>
#include <faiss/IndexFlat.h>
#include <faiss/IndexIVFFlat.h>
#include <faiss/IndexPreTransform.h>
#include <faiss/MetaIndexes.h>
#include <faiss/invlists/OnDiskInvertedLists.h>

#include "test_util.h"

namespace {

pthread_mutex_t temp_file_mutex = PTHREAD_MUTEX_INITIALIZER;

typedef faiss::idx_t idx_t;

// parameters to use for the test
int d = 64;
size_t nb = 1000;
size_t nq = 100;
int nindex = 4;
int k = 10;
int nlist = 40;
int shard_size = nb / nindex;

struct CommonData {
    std::vector<float> database;
    std::vector<float> queries;
    std::vector<idx_t> ids;
    faiss::IndexFlatL2 quantizer;

    CommonData() : database(nb * d), queries(nq * d), ids(nb), quantizer(d) {
        std::mt19937 rng;
        std::uniform_real_distribution<> distrib;
        for (size_t i = 0; i < nb * d; i++) {
            database[i] = distrib(rng);
        }
        for (size_t i = 0; i < nq * d; i++) {
            queries[i] = distrib(rng);
        }
        for (int i = 0; i < nb; i++) {
            ids[i] = 123 + 456 * i;
        }
        { // just to train the quantizer
            faiss::IndexIVFFlat iflat(&quantizer, d, nlist);
            iflat.train(nb, database.data());
        }
    }
};

CommonData cd;

std::string temp_filename_template = "/tmp/faiss_tmp_XXXXXX";

/// perform a search on shards, then merge and search again and
/// compare results.
int compare_merged(
        faiss::IndexShards* index_shards,
        bool shift_ids,
        bool standard_merge = true) {
    std::vector<idx_t> refI(k * nq);
    std::vector<float> refD(k * nq);

    index_shards->search(nq, cd.queries.data(), k, refD.data(), refI.data());
    Tempfilename filename(&temp_file_mutex, temp_filename_template);

    std::vector<idx_t> newI(k * nq);
    std::vector<float> newD(k * nq);

    if (standard_merge) {
        for (int i = 1; i < nindex; i++) {
            faiss::ivflib::merge_into(
                    index_shards->at(0), index_shards->at(i), shift_ids);
        }

        index_shards->syncWithSubIndexes();
    } else {
        std::vector<const faiss::InvertedLists*> lists;
        faiss::IndexIVF* index0 = nullptr;
        size_t ntotal = 0;
        for (int i = 0; i < nindex; i++) {
            auto index_ivf =
                    dynamic_cast<faiss::IndexIVF*>(index_shards->at(i));
            assert(index_ivf);
            if (i == 0) {
                index0 = index_ivf;
            }
            lists.push_back(index_ivf->invlists);
            ntotal += index_ivf->ntotal;
        }

        auto il = new faiss::OnDiskInvertedLists(
                index0->nlist, index0->code_size, filename.c_str());

        il->merge_from_multiple(lists.data(), lists.size(), shift_ids);

        index0->replace_invlists(il, true);
        index0->ntotal = ntotal;
    }
    // search only on the first index
    index_shards->at(0)->search(
            nq, cd.queries.data(), k, newD.data(), newI.data());

    size_t ndiff = 0;
    bool adjust_ids = shift_ids && !standard_merge;
    for (size_t i = 0; i < k * nq; i++) {
        // the on-disk merge shifts the ids by a multiple of shard_size,
        // so undo the shift before comparing with the reference result
        idx_t new_id = adjust_ids ? newI[i] % shard_size : newI[i];
        if (refI[i] != new_id) {
            ndiff++;
        }
    }

    return ndiff;
}
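
// Note: compare_merged exercises two merge paths: the generic
// ivflib::merge_into() that empties each shard into shard 0, and the
// OnDiskInvertedLists::merge_from_multiple() path that concatenates the
// inverted lists of all shards into a single on-disk file; both are
// validated by re-running the same queries on the merged index.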

} // namespace

// test on IVFFlat with implicit numbering
TEST(MERGE, merge_flat_no_ids) {
    faiss::IndexShards index_shards(d);
    index_shards.own_indices = true;
    for (int i = 0; i < nindex; i++) {
        index_shards.add_shard(
                new faiss::IndexIVFFlat(&cd.quantizer, d, nlist));
    }
    EXPECT_TRUE(index_shards.is_trained);
    index_shards.add(nb, cd.database.data());
    size_t prev_ntotal = index_shards.ntotal;
    int ndiff = compare_merged(&index_shards, true);
    EXPECT_EQ(prev_ntotal, index_shards.ntotal);
    EXPECT_EQ(0, ndiff);
}

// test on IVFFlat, explicit ids
TEST(MERGE, merge_flat) {
    faiss::IndexShards index_shards(d, false, false);
    index_shards.own_indices = true;

    for (int i = 0; i < nindex; i++) {
        index_shards.add_shard(
                new faiss::IndexIVFFlat(&cd.quantizer, d, nlist));
    }

    EXPECT_TRUE(index_shards.is_trained);
    index_shards.add_with_ids(nb, cd.database.data(), cd.ids.data());
    int ndiff = compare_merged(&index_shards, false);
    EXPECT_GE(0, ndiff);
}

// test on IVFFlat and a VectorTransform
TEST(MERGE, merge_flat_vt) {
    faiss::IndexShards index_shards(d, false, false);
    index_shards.own_indices = true;

    // here we have to retrain because of the VectorTransform
    faiss::RandomRotationMatrix rot(d, d);
    rot.init(1234);
    faiss::IndexFlatL2 quantizer(d);

    { // just to train the quantizer
        faiss::IndexIVFFlat iflat(&quantizer, d, nlist);
        faiss::IndexPreTransform ipt(&rot, &iflat);
        ipt.train(nb, cd.database.data());
    }

    for (int i = 0; i < nindex; i++) {
        faiss::IndexPreTransform* ipt = new faiss::IndexPreTransform(
                new faiss::RandomRotationMatrix(rot),
                new faiss::IndexIVFFlat(&quantizer, d, nlist));
        ipt->own_fields = true;
        index_shards.add_shard(ipt);
    }
    EXPECT_TRUE(index_shards.is_trained);
    index_shards.add_with_ids(nb, cd.database.data(), cd.ids.data());
    size_t prev_ntotal = index_shards.ntotal;
    int ndiff = compare_merged(&index_shards, false);
    EXPECT_EQ(prev_ntotal, index_shards.ntotal);
    EXPECT_GE(0, ndiff);
}

// put the merged invfile on disk
TEST(MERGE, merge_flat_ondisk) {
    faiss::IndexShards index_shards(d, false, false);
    index_shards.own_indices = true;
    Tempfilename filename(&temp_file_mutex, temp_filename_template);

    for (int i = 0; i < nindex; i++) {
        auto ivf = new faiss::IndexIVFFlat(&cd.quantizer, d, nlist);
        if (i == 0) {
            auto il = new faiss::OnDiskInvertedLists(
                    ivf->nlist, ivf->code_size, filename.c_str());
            ivf->replace_invlists(il, true);
        }
        index_shards.add_shard(ivf);
    }

    EXPECT_TRUE(index_shards.is_trained);
    index_shards.add_with_ids(nb, cd.database.data(), cd.ids.data());
    int ndiff = compare_merged(&index_shards, false);

    EXPECT_EQ(ndiff, 0);
}

// now use the ondisk-specific merge
TEST(MERGE, merge_flat_ondisk_2) {
    faiss::IndexShards index_shards(d, false, false);
    index_shards.own_indices = true;

    for (int i = 0; i < nindex; i++) {
        index_shards.add_shard(
                new faiss::IndexIVFFlat(&cd.quantizer, d, nlist));
    }
    EXPECT_TRUE(index_shards.is_trained);
    index_shards.add_with_ids(nb, cd.database.data(), cd.ids.data());
    int ndiff = compare_merged(&index_shards, false, false);
    EXPECT_GE(0, ndiff);
}

// now use the ondisk-specific merge with shifted ids
TEST(MERGE, merge_flat_ondisk_3) {
    faiss::IndexShards index_shards(d, false, false);
    index_shards.own_indices = true;

    std::vector<idx_t> ids;
    for (int i = 0; i < nb; ++i) {
        int id = i % shard_size;
        ids.push_back(id);
    }
    for (int i = 0; i < nindex; i++) {
        index_shards.add_shard(
                new faiss::IndexIVFFlat(&cd.quantizer, d, nlist));
    }
    EXPECT_TRUE(index_shards.is_trained);
    index_shards.add_with_ids(nb, cd.database.data(), ids.data());
    int ndiff = compare_merged(&index_shards, true, false);
    EXPECT_GE(0, ndiff);
}
265
packages/leann-backend-hnsw/third_party/faiss/tests/test_mmap.cpp
vendored
Normal file
@@ -0,0 +1,265 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <gtest/gtest.h>

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <fstream>
#include <iostream>
#include <random>
#include <vector>

#include <faiss/IndexBinaryFlat.h>
#include <faiss/IndexFlat.h>
#include <faiss/impl/io.h>
#include <faiss/index_io.h>

namespace {

std::vector<float> make_data(const size_t n, const size_t d, size_t seed) {
    std::vector<float> database(n * d);
    std::mt19937 rng(seed);
    std::uniform_real_distribution<float> distrib;

    for (size_t i = 0; i < n * d; i++) {
        database[i] = distrib(rng);
    }
    return database;
}

std::vector<uint8_t> make_binary_data(
        const size_t n,
        const size_t d,
        size_t seed) {
    std::vector<uint8_t> database(n * d);
    std::mt19937 rng(seed);
    // instantiate the distribution over int rather than uint8_t: the
    // standard does not allow character types as the IntType parameter
    std::uniform_int_distribution<int> distrib(0, 255);

    for (size_t i = 0; i < n * d; i++) {
        database[i] = (uint8_t)distrib(rng);
    }
    return database;
}

} // namespace

// the logic is the following:
// 1. generate two flatcodes-based indices, Index1 and Index2
// 2. serialize both indices into std::vector<> buffers, Buf1 and Buf2
// 3. save Buf1 into a temporary file, File1
// 4. deserialize Index1 using the mmap feature on File1 into Index1MM
// 5. ensure that Index1MM acts as Index2 if we write the data from Buf2
//    on top of the existing File1
// 6. ensure that Index1MM acts as Index1 if we write the data from Buf1
//    on top of the existing File1 again
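//
// If steps 5 and 6 hold, the mmap-loaded index must still be reading its
// codes from the file rather than from a private copy made at load time,
// which is what the IO_FLAG_MMAP_IFC flag used below is meant to provide.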

TEST(TestMmap, mmap_flatcodes) {
    // generate data
    const size_t nt = 1000;
    const size_t nq = 10;
    const size_t d = 32;
    const size_t k = 25;

    std::vector<float> xt1 = make_data(nt, d, 123);
    std::vector<float> xt2 = make_data(nt, d, 456);
    std::vector<float> xq = make_data(nq, d, 789);

    // ensure that the data is different
    ASSERT_NE(xt1, xt2);

    // make index1 and create reference results
    faiss::IndexFlatL2 index1(d);
    index1.train(nt, xt1.data());
    index1.add(nt, xt1.data());

    std::vector<float> ref_dis_1(k * nq);
    std::vector<faiss::idx_t> ref_ids_1(k * nq);
    index1.search(nq, xq.data(), k, ref_dis_1.data(), ref_ids_1.data());

    // make index2 and create reference results
    faiss::IndexFlatL2 index2(d);
    index2.train(nt, xt2.data());
    index2.add(nt, xt2.data());

    std::vector<float> ref_dis_2(k * nq);
    std::vector<faiss::idx_t> ref_ids_2(k * nq);
    index2.search(nq, xq.data(), k, ref_dis_2.data(), ref_ids_2.data());

    // ensure that the results are different
    ASSERT_NE(ref_dis_1, ref_dis_2);
    ASSERT_NE(ref_ids_1, ref_ids_2);

    // serialize both in the form of vectors
    faiss::VectorIOWriter wr1;
    faiss::write_index(&index1, &wr1);

    faiss::VectorIOWriter wr2;
    faiss::write_index(&index2, &wr2);

    // generate a temporary file and write index1 into it
    std::string tmpname = std::tmpnam(nullptr);

    {
        std::ofstream ofs(tmpname);
        ofs.write((const char*)wr1.data.data(), wr1.data.size());
    }

    // create a mmap index
    std::unique_ptr<faiss::Index> index1mm(
            faiss::read_index(tmpname.c_str(), faiss::IO_FLAG_MMAP_IFC));

    ASSERT_NE(index1mm, nullptr);

    // perform a search
    std::vector<float> cand_dis_1(k * nq);
    std::vector<faiss::idx_t> cand_ids_1(k * nq);
    index1mm->search(nq, xq.data(), k, cand_dis_1.data(), cand_ids_1.data());

    // match vs ref1
    ASSERT_EQ(ref_ids_1, cand_ids_1);
    ASSERT_EQ(ref_dis_1, cand_dis_1);

    // ok now, overwrite the internals of the file without recreating it
    {
        std::ofstream ofs(tmpname);
        ofs.seekp(0, std::ios::beg);

        ofs.write((const char*)wr2.data.data(), wr2.data.size());
    }

    // perform a search
    std::vector<float> cand_dis_2(k * nq);
    std::vector<faiss::idx_t> cand_ids_2(k * nq);
    index1mm->search(nq, xq.data(), k, cand_dis_2.data(), cand_ids_2.data());

    // match vs ref2
    ASSERT_EQ(ref_ids_2, cand_ids_2);
    ASSERT_EQ(ref_dis_2, cand_dis_2);

    // write back data1
    {
        std::ofstream ofs(tmpname);
        ofs.seekp(0, std::ios::beg);

        ofs.write((const char*)wr1.data.data(), wr1.data.size());
    }

    // perform a search
    std::vector<float> cand_dis_3(k * nq);
    std::vector<faiss::idx_t> cand_ids_3(k * nq);
    index1mm->search(nq, xq.data(), k, cand_dis_3.data(), cand_ids_3.data());

    // match vs ref1
    ASSERT_EQ(ref_ids_1, cand_ids_3);
    ASSERT_EQ(ref_dis_1, cand_dis_3);
}

TEST(TestMmap, mmap_binary_flatcodes) {
    // generate data
    const size_t nt = 1000;
    const size_t nq = 10;
    // in bits
    const size_t d = 64;
    // in bytes
    const size_t d8 = (d + 7) / 8;
    const size_t k = 25;

    std::vector<uint8_t> xt1 = make_binary_data(nt, d8, 123);
    std::vector<uint8_t> xt2 = make_binary_data(nt, d8, 456);
    std::vector<uint8_t> xq = make_binary_data(nq, d8, 789);

    // ensure that the data is different
    ASSERT_NE(xt1, xt2);

    // make index1 and create reference results
    faiss::IndexBinaryFlat index1(d);
    index1.train(nt, xt1.data());
    index1.add(nt, xt1.data());

    std::vector<int32_t> ref_dis_1(k * nq);
    std::vector<faiss::idx_t> ref_ids_1(k * nq);
    index1.search(nq, xq.data(), k, ref_dis_1.data(), ref_ids_1.data());

    // make index2 and create reference results
    faiss::IndexBinaryFlat index2(d);
    index2.train(nt, xt2.data());
    index2.add(nt, xt2.data());

    std::vector<int32_t> ref_dis_2(k * nq);
    std::vector<faiss::idx_t> ref_ids_2(k * nq);
    index2.search(nq, xq.data(), k, ref_dis_2.data(), ref_ids_2.data());

    // ensure that the results are different
    ASSERT_NE(ref_dis_1, ref_dis_2);
    ASSERT_NE(ref_ids_1, ref_ids_2);

    // serialize both in the form of vectors
    faiss::VectorIOWriter wr1;
    faiss::write_index_binary(&index1, &wr1);

    faiss::VectorIOWriter wr2;
    faiss::write_index_binary(&index2, &wr2);

    // generate a temporary file and write index1 into it
    std::string tmpname = std::tmpnam(nullptr);

    {
        std::ofstream ofs(tmpname);
        ofs.write((const char*)wr1.data.data(), wr1.data.size());
    }

    // create a mmap index
    std::unique_ptr<faiss::IndexBinary> index1mm(
            faiss::read_index_binary(tmpname.c_str(), faiss::IO_FLAG_MMAP_IFC));

    ASSERT_NE(index1mm, nullptr);

    // perform a search
    std::vector<int32_t> cand_dis_1(k * nq);
    std::vector<faiss::idx_t> cand_ids_1(k * nq);
    index1mm->search(nq, xq.data(), k, cand_dis_1.data(), cand_ids_1.data());

    // match vs ref1
    ASSERT_EQ(ref_ids_1, cand_ids_1);
    ASSERT_EQ(ref_dis_1, cand_dis_1);

    // ok now, overwrite the internals of the file without recreating it
    {
        std::ofstream ofs(tmpname);
        ofs.seekp(0, std::ios::beg);

        ofs.write((const char*)wr2.data.data(), wr2.data.size());
    }

    // perform a search
    std::vector<int32_t> cand_dis_2(k * nq);
    std::vector<faiss::idx_t> cand_ids_2(k * nq);
    index1mm->search(nq, xq.data(), k, cand_dis_2.data(), cand_ids_2.data());

    // match vs ref2
    ASSERT_EQ(ref_ids_2, cand_ids_2);
    ASSERT_EQ(ref_dis_2, cand_dis_2);

    // write back data1
    {
        std::ofstream ofs(tmpname);
        ofs.seekp(0, std::ios::beg);

        ofs.write((const char*)wr1.data.data(), wr1.data.size());
    }

    // perform a search
    std::vector<int32_t> cand_dis_3(k * nq);
    std::vector<faiss::idx_t> cand_ids_3(k * nq);
    index1mm->search(nq, xq.data(), k, cand_dis_3.data(), cand_ids_3.data());

    // match vs ref1
    ASSERT_EQ(ref_ids_1, cand_ids_3);
    ASSERT_EQ(ref_dis_1, cand_dis_3);
}
14
packages/leann-backend-hnsw/third_party/faiss/tests/test_omp_threads.cpp
vendored
Normal file
@@ -0,0 +1,14 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <gtest/gtest.h>

#include <faiss/utils/utils.h>

TEST(Threading, openmp) {
    EXPECT_TRUE(faiss::check_openmp());
}
206
packages/leann-backend-hnsw/third_party/faiss/tests/test_ondisk_ivf.cpp
vendored
Normal file
@@ -0,0 +1,206 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <cstdio>
#include <cstdlib>
#include <random>

#include <unistd.h>

#include <pthread.h>
#include <unordered_map>

#include <gtest/gtest.h>

#include <faiss/IndexFlat.h>
#include <faiss/IndexIVFFlat.h>
#include <faiss/index_io.h>
#include <faiss/invlists/OnDiskInvertedLists.h>
#include <faiss/utils/random.h>

namespace {

struct Tempfilename {
    static pthread_mutex_t mutex;

    std::string filename = "/tmp/faiss_tmp_XXXXXX";

    Tempfilename() {
        pthread_mutex_lock(&mutex);
        int fd = mkstemp(&filename[0]);
        close(fd);
        pthread_mutex_unlock(&mutex);
    }

    ~Tempfilename() {
        // access() returns 0 if the file exists; only unlink it then
        if (access(filename.c_str(), F_OK) == 0) {
            unlink(filename.c_str());
        }
    }

    const char* c_str() {
        return filename.c_str();
    }
};

pthread_mutex_t Tempfilename::mutex = PTHREAD_MUTEX_INITIALIZER;

} // namespace

TEST(ONDISK, make_invlists) {
    int nlist = 100;
    int code_size = 32;
    int nadd = 1000000;
    std::unordered_map<int, int> listnos;

    Tempfilename filename;

    faiss::OnDiskInvertedLists ivf(nlist, code_size, filename.c_str());

    {
        std::vector<uint8_t> code(32);
        std::mt19937 rng;
        std::uniform_real_distribution<> distrib;
        for (int i = 0; i < nadd; i++) {
            double d = distrib(rng);
            int list_no = int(nlist * d * d); // skewed distribution
            int* ar = (int*)code.data();
            ar[0] = i;
            ar[1] = list_no;
            ivf.add_entry(list_no, i, code.data());
            listnos[i] = list_no;
        }
    }

    int ntot = 0;
    for (int i = 0; i < nlist; i++) {
        int size = ivf.list_size(i);
        const faiss::idx_t* ids = ivf.get_ids(i);
        const uint8_t* codes = ivf.get_codes(i);
        for (int j = 0; j < size; j++) {
            faiss::idx_t id = ids[j];
            const int* ar = (const int*)&codes[code_size * j];
            EXPECT_EQ(ar[0], id);
            EXPECT_EQ(ar[1], i);
            EXPECT_EQ(listnos[id], i);
            ntot++;
        }
    }
    EXPECT_EQ(ntot, nadd);
}
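
// Note: each code stores its own id and list number in its first 8 bytes
// (ar[0] = i, ar[1] = list_no), so the verification loop above can check,
// for every entry of every list, that the codes were stored alongside the
// right ids and in the list they were assigned to.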

TEST(ONDISK, test_add) {
    int d = 8;
    int nlist = 30, nq = 200, nb = 1500, k = 10;
    faiss::IndexFlatL2 quantizer(d);
    {
        std::vector<float> x(d * nlist);
        faiss::float_rand(x.data(), d * nlist, 12345);
        quantizer.add(nlist, x.data());
    }
    std::vector<float> xb(d * nb);
    faiss::float_rand(xb.data(), d * nb, 23456);

    faiss::IndexIVFFlat index(&quantizer, d, nlist);
    index.add(nb, xb.data());

    std::vector<float> xq(d * nq);
    faiss::float_rand(xq.data(), d * nq, 34567);

    std::vector<float> ref_D(nq * k);
    std::vector<faiss::idx_t> ref_I(nq * k);

    index.search(nq, xq.data(), k, ref_D.data(), ref_I.data());

    Tempfilename filename, filename2;

    // test add + search
    {
        faiss::IndexIVFFlat index2(&quantizer, d, nlist);

        faiss::OnDiskInvertedLists ivf(
                index.nlist, index.code_size, filename.c_str());

        index2.replace_invlists(&ivf);

        index2.add(nb, xb.data());

        std::vector<float> new_D(nq * k);
        std::vector<faiss::idx_t> new_I(nq * k);

        index2.search(nq, xq.data(), k, new_D.data(), new_I.data());

        EXPECT_EQ(ref_D, new_D);
        EXPECT_EQ(ref_I, new_I);

        write_index(&index2, filename2.c_str());
    }

    // test io
    {
        faiss::Index* index3 = faiss::read_index(filename2.c_str());

        std::vector<float> new_D(nq * k);
        std::vector<faiss::idx_t> new_I(nq * k);

        index3->search(nq, xq.data(), k, new_D.data(), new_I.data());

        EXPECT_EQ(ref_D, new_D);
        EXPECT_EQ(ref_I, new_I);

        delete index3;
    }
}

// WARNING: this test will run multithreaded only in opt mode
TEST(ONDISK, make_invlists_threaded) {
    int nlist = 100;
    int code_size = 32;
    int nadd = 1000000;

    Tempfilename filename;

    faiss::OnDiskInvertedLists ivf(nlist, code_size, filename.c_str());

    std::vector<int> list_nos(nadd);

    std::mt19937 rng;
    std::uniform_real_distribution<> distrib;
    for (int i = 0; i < nadd; i++) {
        double d = distrib(rng);
        list_nos[i] = int(nlist * d * d); // skewed distribution
    }

#pragma omp parallel
    {
        std::vector<uint8_t> code(32);
#pragma omp for
        for (int i = 0; i < nadd; i++) {
            int list_no = list_nos[i];
            int* ar = (int*)code.data();
            ar[0] = i;
            ar[1] = list_no;
            ivf.add_entry(list_no, i, code.data());
        }
    }

    int ntot = 0;
    for (int i = 0; i < nlist; i++) {
        int size = ivf.list_size(i);
        const faiss::idx_t* ids = ivf.get_ids(i);
        const uint8_t* codes = ivf.get_codes(i);
        for (int j = 0; j < size; j++) {
            faiss::idx_t id = ids[j];
            const int* ar = (const int*)&codes[code_size * j];
            EXPECT_EQ(ar[0], id);
            EXPECT_EQ(ar[1], i);
            EXPECT_EQ(list_nos[id], i);
            ntot++;
        }
    }
    EXPECT_EQ(ntot, nadd);
}
194
packages/leann-backend-hnsw/third_party/faiss/tests/test_pairs_decoding.cpp
vendored
Normal file
@@ -0,0 +1,194 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <cstdio>
#include <cstdlib>

#include <memory>
#include <random>
#include <vector>

#include <gtest/gtest.h>

#include <faiss/IVFlib.h>
#include <faiss/IndexIVF.h>
#include <faiss/VectorTransform.h>
#include <faiss/index_factory.h>

namespace {

typedef faiss::idx_t idx_t;

/*************************************************************
 * Test utils
 *************************************************************/

// dimension of the vectors to index
int d = 64;

// size of the database we plan to index
size_t nb = 8000;

// nb of queries
size_t nq = 200;

std::mt19937 rng;

std::vector<float> make_data(size_t n) {
    std::vector<float> database(n * d);
    std::uniform_real_distribution<> distrib;
    for (size_t i = 0; i < n * d; i++) {
        database[i] = distrib(rng);
    }
    return database;
}

std::unique_ptr<faiss::Index> make_index(
        const char* index_type,
        const std::vector<float>& x) {
    auto index =
            std::unique_ptr<faiss::Index>(faiss::index_factory(d, index_type));
    index->train(nb, x.data());
    index->add(nb, x.data());
    return index;
}

/*************************************************************
 * Test functions for a given index type
 *************************************************************/

bool test_search_centroid(const char* index_key) {
    std::vector<float> xb = make_data(nb); // database vectors
    auto index = make_index(index_key, xb);

    /* First test: find the centroids associated to the database
       vectors and make sure that each vector does indeed appear in
       the inverted list corresponding to its centroid */

    std::vector<idx_t> centroid_ids(nb);
    faiss::ivflib::search_centroid(
            index.get(), xb.data(), nb, centroid_ids.data());

    const faiss::IndexIVF* ivf = faiss::ivflib::extract_index_ivf(index.get());

    for (int i = 0; i < nb; i++) {
        bool found = false;
        int list_no = centroid_ids[i];
        int list_size = ivf->invlists->list_size(list_no);
        auto* list = ivf->invlists->get_ids(list_no);

        for (int j = 0; j < list_size; j++) {
            if (list[j] == i) {
                found = true;
                break;
            }
        }
        if (!found)
            return false;
    }
    return true;
}
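
// returns 0 on success, 1 if the results differ from a standard search,
// and 2 if a returned id is missing from the inverted list that
// search_and_return_centroids reports for it (see the checks below)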
int test_search_and_return_centroids(const char* index_key) {
    std::vector<float> xb = make_data(nb); // database vectors
    auto index = make_index(index_key, xb);

    std::vector<idx_t> centroid_ids(nb);
    faiss::ivflib::search_centroid(
            index.get(), xb.data(), nb, centroid_ids.data());

    faiss::IndexIVF* ivf = faiss::ivflib::extract_index_ivf(index.get());
    ivf->nprobe = 4;

    std::vector<float> xq = make_data(nq); // query vectors

    int k = 5;

    // compute a reference search result

    std::vector<idx_t> refI(nq * k);
    std::vector<float> refD(nq * k);
    index->search(nq, xq.data(), k, refD.data(), refI.data());

    // compute the search result

    std::vector<idx_t> newI(nq * k);
    std::vector<float> newD(nq * k);

    std::vector<idx_t> query_centroid_ids(nq);
    std::vector<idx_t> result_centroid_ids(nq * k);

    faiss::ivflib::search_and_return_centroids(
            index.get(),
            nq,
            xq.data(),
            k,
            newD.data(),
            newI.data(),
            query_centroid_ids.data(),
            result_centroid_ids.data());

    // first verify that we have the same result as the standard search

    if (newI != refI) {
        return 1;
    }

    // then check if the result ids are indeed in the inverted list
    // they are supposed to be in

    for (int i = 0; i < nq * k; i++) {
        int list_no = result_centroid_ids[i];
        int result_no = newI[i];

        if (result_no < 0)
            continue;

        bool found = false;

        int list_size = ivf->invlists->list_size(list_no);
        auto* list = ivf->invlists->get_ids(list_no);

        for (int j = 0; j < list_size; j++) {
            if (list[j] == result_no) {
                found = true;
                break;
            }
        }
        if (!found)
            return 2;
    }
    return 0;
}

} // namespace

/*************************************************************
 * Test entry points
 *************************************************************/

TEST(testSearchCentroid, IVFFlat) {
    bool ok = test_search_centroid("IVF32,Flat");
    EXPECT_TRUE(ok);
}

TEST(testSearchCentroid, PCAIVFFlat) {
    bool ok = test_search_centroid("PCA16,IVF32,Flat");
    EXPECT_TRUE(ok);
}

TEST(testSearchAndReturnCentroids, IVFFlat) {
    int err = test_search_and_return_centroids("IVF32,Flat");
    EXPECT_NE(err, 1);
    EXPECT_NE(err, 2);
}

TEST(testSearchAndReturnCentroids, PCAIVFFlat) {
    int err = test_search_and_return_centroids("PCA16,IVF32,Flat");
    EXPECT_NE(err, 1);
    EXPECT_NE(err, 2);
}
287
packages/leann-backend-hnsw/third_party/faiss/tests/test_params_override.cpp
vendored
Normal file
@@ -0,0 +1,287 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <cstdio>
#include <cstdlib>

#include <memory>
#include <random>
#include <vector>

#include <gtest/gtest.h>

#include <faiss/AutoTune.h>
#include <faiss/IVFlib.h>
#include <faiss/IndexBinaryIVF.h>
#include <faiss/IndexIVF.h>
#include <faiss/clone_index.h>
#include <faiss/impl/AuxIndexStructures.h>
#include <faiss/impl/IDSelector.h>
#include <faiss/index_factory.h>

using namespace faiss;

namespace {

// dimension of the vectors to index
int d = 32;

// size of the database we plan to index
size_t nb = 1000;

// nb of queries
size_t nq = 200;

std::mt19937 rng;

std::vector<float> make_data(size_t n) {
    std::vector<float> database(n * d);
    std::uniform_real_distribution<> distrib;
    for (size_t i = 0; i < n * d; i++) {
        database[i] = distrib(rng);
    }
    return database;
}

std::unique_ptr<Index> make_index(
        const char* index_type,
        MetricType metric,
        const std::vector<float>& x) {
    assert(x.size() % d == 0);
    idx_t nb = x.size() / d;
    std::unique_ptr<Index> index(index_factory(d, index_type, metric));
    index->train(nb, x.data());
    index->add(nb, x.data());
    return index;
}

std::vector<idx_t> search_index(Index* index, const float* xq) {
    int k = 10;
    std::vector<idx_t> I(k * nq);
    std::vector<float> D(k * nq);
    index->search(nq, xq, k, D.data(), I.data());
    return I;
}

std::vector<idx_t> search_index_with_params(
        Index* index,
        const float* xq,
        IVFSearchParameters* params) {
    int k = 10;
    std::vector<idx_t> I(k * nq);
    std::vector<float> D(k * nq);
    ivflib::search_with_parameters(
            index, nq, xq, k, D.data(), I.data(), params);
    return I;
}

/*************************************************************
 * Test functions for a given index type
 *************************************************************/

int test_params_override(const char* index_key, MetricType metric) {
    std::vector<float> xb = make_data(nb); // database vectors
    auto index = make_index(index_key, metric, xb);
    // index->train(nb, xb.data());
    // index->add(nb, xb.data());
    std::vector<float> xq = make_data(nq);
    ParameterSpace ps;
    ps.set_index_parameter(index.get(), "nprobe", 2);
    auto res2ref = search_index(index.get(), xq.data());
    ps.set_index_parameter(index.get(), "nprobe", 9);
    auto res9ref = search_index(index.get(), xq.data());
    ps.set_index_parameter(index.get(), "nprobe", 1);

    IVFSearchParameters params;
    params.max_codes = 0;
    params.nprobe = 2;
    auto res2new = search_index_with_params(index.get(), xq.data(), &params);
    params.nprobe = 9;
    auto res9new = search_index_with_params(index.get(), xq.data(), &params);

    if (res2ref != res2new)
        return 2;

    if (res9ref != res9new)
        return 9;

    return 0;
}
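
// note that the index-level nprobe is pinned to 1 before the parameterized
// searches, so the results can only match the references if the per-call
// IVFSearchParameters actually override the index state; the return value
// names the nprobe setting that failed (2 or 9)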

/*************************************************************
 * Test subsets
 *************************************************************/

int test_selector(const char* index_key) {
    std::vector<float> xb = make_data(nb); // database vectors
    std::vector<float> xq = make_data(nq);
    ParameterSpace ps;

    std::vector<float> sub_xb;
    std::vector<idx_t> kept;
    for (idx_t i = 0; i < nb; i++) {
        if (i % 10 == 2) {
            kept.push_back(i);
            sub_xb.insert(
                    sub_xb.end(), xb.begin() + i * d, xb.begin() + (i + 1) * d);
        }
    }

    // full index
    auto index = make_index(index_key, METRIC_L2, xb);
    ps.set_index_parameter(index.get(), "nprobe", 3);

    // restricted index
    std::unique_ptr<Index> sub_index(clone_index(index.get()));
    sub_index->reset();
    sub_index->add_with_ids(kept.size(), sub_xb.data(), kept.data());

    auto ref_result = search_index(sub_index.get(), xq.data());

    IVFSearchParameters params;
    params.max_codes = 0;
    params.nprobe = 3;
    IDSelectorBatch sel(kept.size(), kept.data());
    params.sel = &sel;
    auto new_result = search_index_with_params(index.get(), xq.data(), &params);

    if (ref_result != new_result) {
        return 1;
    }

    return 0;
}
|
||||
|
||||
} // namespace
|
||||
|
||||
/*************************************************************
|
||||
* Test entry points
|
||||
*************************************************************/
|
||||
|
||||
TEST(TPO, IVFFlat) {
|
||||
int err1 = test_params_override("IVF32,Flat", METRIC_L2);
|
||||
EXPECT_EQ(err1, 0);
|
||||
int err2 = test_params_override("IVF32,Flat", METRIC_INNER_PRODUCT);
|
||||
EXPECT_EQ(err2, 0);
|
||||
}
|
||||
|
||||
TEST(TPO, IVFPQ) {
|
||||
int err1 = test_params_override("IVF32,PQ8np", METRIC_L2);
|
||||
EXPECT_EQ(err1, 0);
|
||||
int err2 = test_params_override("IVF32,PQ8np", METRIC_INNER_PRODUCT);
|
||||
EXPECT_EQ(err2, 0);
|
||||
}
|
||||
|
||||
TEST(TPO, IVFSQ) {
|
||||
int err1 = test_params_override("IVF32,SQ8", METRIC_L2);
|
||||
EXPECT_EQ(err1, 0);
|
||||
int err2 = test_params_override("IVF32,SQ8", METRIC_INNER_PRODUCT);
|
||||
EXPECT_EQ(err2, 0);
|
||||
}
|
||||
|
||||
TEST(TPO, IVFFlatPP) {
|
||||
int err1 = test_params_override("PCA16,IVF32,SQ8", METRIC_L2);
|
||||
EXPECT_EQ(err1, 0);
|
||||
int err2 = test_params_override("PCA16,IVF32,SQ8", METRIC_INNER_PRODUCT);
|
||||
EXPECT_EQ(err2, 0);
|
||||
}
|
||||
|
||||
TEST(TSEL, IVFFlat) {
|
||||
int err = test_selector("PCA16,IVF32,Flat");
|
||||
EXPECT_EQ(err, 0);
|
||||
}
|
||||
|
||||
TEST(TSEL, IVFFPQ) {
|
||||
int err = test_selector("PCA16,IVF32,PQ4x8np");
|
||||
EXPECT_EQ(err, 0);
|
||||
}
|
||||
|
||||
TEST(TSEL, IVFFSQ) {
|
||||
int err = test_selector("PCA16,IVF32,SQ8");
|
||||
EXPECT_EQ(err, 0);
|
||||
}
|
||||
|
||||
/*************************************************************
|
||||
* Same for binary indexes
|
||||
*************************************************************/
|
||||
|
||||
std::vector<uint8_t> make_data_binary(size_t n) {
|
||||
std::vector<uint8_t> database(n * d / 8);
|
||||
std::uniform_int_distribution<> distrib;
|
||||
for (size_t i = 0; i < n * d / 8; i++) {
|
||||
database[i] = distrib(rng);
|
||||
}
|
||||
return database;
|
||||
}
|
||||
|
||||
std::unique_ptr<IndexBinaryIVF> make_index(
|
||||
const char* index_type,
|
||||
const std::vector<uint8_t>& x) {
|
||||
auto index = std::unique_ptr<IndexBinaryIVF>(
|
||||
dynamic_cast<IndexBinaryIVF*>(index_binary_factory(d, index_type)));
|
||||
index->train(nb, x.data());
|
||||
index->add(nb, x.data());
|
||||
return index;
|
||||
}
|
||||
|
||||
std::vector<idx_t> search_index(IndexBinaryIVF* index, const uint8_t* xq) {
|
||||
int k = 10;
|
||||
std::vector<idx_t> I(k * nq);
|
||||
std::vector<int32_t> D(k * nq);
|
||||
index->search(nq, xq, k, D.data(), I.data());
|
||||
return I;
|
||||
}
|
||||
|
||||
std::vector<idx_t> search_index_with_params(
|
||||
IndexBinaryIVF* index,
|
||||
const uint8_t* xq,
|
||||
IVFSearchParameters* params) {
|
||||
int k = 10;
|
||||
std::vector<idx_t> I(k * nq);
|
||||
std::vector<int32_t> D(k * nq);
|
||||
|
||||
std::vector<idx_t> Iq(params->nprobe * nq);
|
||||
std::vector<int32_t> Dq(params->nprobe * nq);
|
||||
|
||||
index->quantizer->search(nq, xq, params->nprobe, Dq.data(), Iq.data());
|
||||
index->search_preassigned(
|
||||
nq, xq, k, Iq.data(), Dq.data(), D.data(), I.data(), false, params);
|
||||
return I;
|
||||
}
|
||||
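
// This helper mirrors what IndexBinaryIVF::search presumably does under the
// hood: coarse-quantize the queries with the quantizer, then score only the
// preassigned inverted lists via search_preassigned(). Spelling it out here
// lets the test pass per-call IVFSearchParameters explicitly.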

int test_params_override_binary(const char* index_key) {
    std::vector<uint8_t> xb = make_data_binary(nb); // database vectors
    auto index = make_index(index_key, xb);
    // make_index() has already trained the index and added the vectors
    // (as in the float variant above), so repeating it here would only
    // duplicate the database:
    // index->train(nb, xb.data());
    // index->add(nb, xb.data());
    std::vector<uint8_t> xq = make_data_binary(nq);
    index->nprobe = 2;
    auto res2ref = search_index(index.get(), xq.data());
    index->nprobe = 9;
    auto res9ref = search_index(index.get(), xq.data());
    index->nprobe = 1;

    IVFSearchParameters params;
    params.max_codes = 0;
    params.nprobe = 2;
    auto res2new = search_index_with_params(index.get(), xq.data(), &params);
    params.nprobe = 9;
    auto res9new = search_index_with_params(index.get(), xq.data(), &params);

    if (res2ref != res2new)
        return 2;

    if (res9ref != res9new)
        return 9;

    return 0;
}

TEST(TPOB, IVF) {
    int err1 = test_params_override_binary("BIVF32");
    EXPECT_EQ(err1, 0);
}
33
packages/leann-backend-hnsw/third_party/faiss/tests/test_partitioning.cpp
vendored
Normal file
@@ -0,0 +1,33 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <gtest/gtest.h>

#include <faiss/utils/AlignedTable.h>
#include <faiss/utils/partitioning.h>

using namespace faiss;

typedef AlignedTable<uint16_t> AlignedTableUint16;

// TODO: This test fails when Faiss is compiled with
// GCC 13.2 from conda-forge with AVX2 enabled. This may be
// a GCC bug that needs to be investigated further.
// As of 16-AUG-2023 the Faiss conda packages are built
// with GCC 11.2, so the published binaries are not affected.
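
// Why every bucket holds exactly 64 entries (our arithmetic, assuming
// simd_histogram_16(data, n, min, shift, hist) bins each value v as
// (v - min) >> shift): the table holds i * 64 for i in [0, 1024), shift 12
// makes each of the 16 bins 4096 wide, and 4096 / 64 = 64 values per bin.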
TEST(TestPartitioning, TestPartitioningBigRange) {
    auto n = 1024;
    AlignedTableUint16 tab(n);
    for (auto i = 0; i < n; i++) {
        tab[i] = i * 64;
    }
    int32_t hist[16]{};
    simd_histogram_16(tab.get(), n, 0, 12, hist);
    for (auto i = 0; i < 16; i++) {
        ASSERT_EQ(hist[i], 64);
    }
}
145
packages/leann-backend-hnsw/third_party/faiss/tests/test_pq_encoding.cpp
vendored
Normal file
@@ -0,0 +1,145 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <iostream>
#include <memory>
#include <vector>

#include <gtest/gtest.h>

#include <faiss/IndexPQFastScan.h>
#include <faiss/impl/ProductQuantizer.h>
#include <faiss/impl/pq4_fast_scan.h>

namespace {

const std::vector<uint64_t> random_vector(size_t s) {
    std::vector<uint64_t> v(s, 0);
    for (size_t i = 0; i < s; ++i) {
        v[i] = rand();
    }

    return v;
}

const std::vector<float> random_vector_float(size_t s) {
    std::vector<float> v(s, 0);
    for (size_t i = 0; i < s; ++i) {
        v[i] = rand();
    }

    return v;
}

} // namespace
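
// PQEncoderGeneric packs successive nbits-wide codes into one contiguous bit
// stream, so ceil(nsubcodes * nbits / 8) bytes are enough; the first test
// sizes its buffer as (nsubcodes * maxbits + 7) / 8, the worst case. As a
// worked example (our numbers, not from the source): nbits = 24 and
// nsubcodes = 97 need (97 * 24 + 7) / 8 = 291 bytes.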

TEST(PQEncoderGeneric, encode) {
    const int nsubcodes = 97;
    const int minbits = 1;
    const int maxbits = 24;
    const std::vector<uint64_t> values = random_vector(nsubcodes);

    for (int nbits = minbits; nbits <= maxbits; ++nbits) {
        std::cerr << "nbits = " << nbits << std::endl;

        const uint64_t mask = (1ull << nbits) - 1;
        std::unique_ptr<uint8_t[]> codes(
                new uint8_t[(nsubcodes * maxbits + 7) / 8]);

        // NOTE(hoss): Necessary scope to ensure trailing bits are
        // flushed to mem.
        {
            faiss::PQEncoderGeneric encoder(codes.get(), nbits);
            for (const auto& v : values) {
                encoder.encode(v & mask);
            }
        }

        faiss::PQDecoderGeneric decoder(codes.get(), nbits);
        for (int i = 0; i < nsubcodes; ++i) {
            uint64_t v = decoder.decode();
            EXPECT_EQ(values[i] & mask, v);
        }
    }
}

TEST(PQEncoder8, encode) {
    const int nsubcodes = 100;
    const std::vector<uint64_t> values = random_vector(nsubcodes);
    const uint64_t mask = 0xFF;
    std::unique_ptr<uint8_t[]> codes(new uint8_t[nsubcodes]);

    faiss::PQEncoder8 encoder(codes.get(), 8);
    for (const auto& v : values) {
        encoder.encode(v & mask);
    }

    faiss::PQDecoder8 decoder(codes.get(), 8);
    for (int i = 0; i < nsubcodes; ++i) {
        uint64_t v = decoder.decode();
        EXPECT_EQ(values[i] & mask, v);
    }
}

TEST(PQEncoder16, encode) {
    const int nsubcodes = 100;
    const std::vector<uint64_t> values = random_vector(nsubcodes);
    const uint64_t mask = 0xFFFF;
    std::unique_ptr<uint8_t[]> codes(new uint8_t[2 * nsubcodes]);

    faiss::PQEncoder16 encoder(codes.get(), 16);
    for (const auto& v : values) {
        encoder.encode(v & mask);
    }

    faiss::PQDecoder16 decoder(codes.get(), 16);
    for (int i = 0; i < nsubcodes; ++i) {
        uint64_t v = decoder.decode();
        EXPECT_EQ(values[i] & mask, v);
    }
}

TEST(PQFastScan, set_packed_element) {
    int d = 20, ntotal = 1000, M = 5, nbits = 4;
    const std::vector<float> ds = random_vector_float(ntotal * d);
    faiss::IndexPQFastScan index(d, M, nbits);
    index.train(ntotal, ds.data());
    index.add(ntotal, ds.data());

    for (int j = 0; j < 10; j++) {
        int vector_id = rand() % ntotal;
        std::vector<uint8_t> old(ntotal * M);
        std::vector<uint8_t> code(M);
        for (int i = 0; i < ntotal; i++) {
            for (int sq = 0; sq < M; sq++) {
                old[i * M + sq] = faiss::pq4_get_packed_element(
                        index.codes.data(), index.bbs, M, i, sq);
            }
        }
        for (int sq = 0; sq < M; sq++) {
            faiss::pq4_set_packed_element(
                    index.codes.data(),
                    ((old[vector_id * M + sq] + 3) % 16),
                    index.bbs,
                    M,
                    vector_id,
                    sq);
        }
        for (int i = 0; i < ntotal; i++) {
            for (int sq = 0; sq < M; sq++) {
                uint8_t newcode = faiss::pq4_get_packed_element(
                        index.codes.data(), index.bbs, M, i, sq);
                uint8_t oldcode = old[i * M + sq];
                if (i == vector_id) {
                    EXPECT_EQ(newcode, (oldcode + 3) % 16);
                } else {
                    EXPECT_EQ(newcode, oldcode);
                }
            }
        }
    }
}
264
packages/leann-backend-hnsw/third_party/faiss/tests/test_simdlib.cpp
vendored
Normal file
@@ -0,0 +1,264 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <gtest/gtest.h>

#include <faiss/utils/simdlib.h>

using namespace faiss;
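
// cmplt_and_blend_inplace(values, indices, lowestValues, lowestIndices)
// keeps, lane by lane, the smaller of the candidate and the current value
// together with the index it came from: a running elementwise min with
// index tracking. (That is our reading of the assertions below, not a spec.)
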
TEST(TestSIMDLib, TestCmpltAndBlendInplace) {
    simd8float32 lowestValues(0, 1, 2, 3, 4, 5, 6, 7);
    simd8uint32 lowestIndices(0, 1, 2, 3, 4, 5, 6, 7);

    simd8float32 candidateValues0(5, 5, 5, 5, 5, 5, 5, 5);
    simd8uint32 candidateIndices0(10, 11, 12, 13, 14, 15, 16, 17);
    cmplt_and_blend_inplace(
            candidateValues0, candidateIndices0, lowestValues, lowestIndices);

    simd8float32 candidateValues1(6, 6, 6, 6, 6, 6, 6, 6);
    simd8uint32 candidateIndices1(20, 21, 22, 23, 24, 25, 26, 27);
    cmplt_and_blend_inplace(
            candidateValues1, candidateIndices1, lowestValues, lowestIndices);

    simd8float32 candidateValues2(0, 1, 2, 3, 4, 5, 5, 5);
    simd8uint32 candidateIndices2(30, 31, 32, 33, 34, 35, 36, 37);
    cmplt_and_blend_inplace(
            candidateValues2, candidateIndices2, lowestValues, lowestIndices);

    simd8float32 expectedValues(0, 1, 2, 3, 4, 5, 5, 5);
    simd8uint32 expectedIndices(0, 1, 2, 3, 4, 5, 16, 17);
    ASSERT_TRUE(lowestValues.is_same_as(expectedValues));
    ASSERT_TRUE(lowestIndices.is_same_as(expectedIndices));
}

TEST(TestSIMDLib, TestCmpltMinMaxFloat) {
    simd8float32 minValues(0, 0, 0, 0, 0, 0, 0, 0);
    simd8uint32 minIndices(0, 0, 0, 0, 0, 0, 0, 0);
    simd8float32 maxValues(0, 0, 0, 0, 0, 0, 0, 0);
    simd8uint32 maxIndices(0, 0, 0, 0, 0, 0, 0, 0);

    simd8float32 candidateValues0(5, 5, 5, 5, 5, 5, 5, 5);
    simd8uint32 candidateIndices0(10, 11, 12, 13, 14, 15, 16, 17);
    simd8float32 currentValues0(0, 1, 2, 3, 4, 5, 6, 7);
    simd8uint32 currentIndices0(0, 1, 2, 3, 4, 5, 6, 7);

    cmplt_min_max_fast(
            candidateValues0,
            candidateIndices0,
            currentValues0,
            currentIndices0,
            minValues,
            minIndices,
            maxValues,
            maxIndices);

    simd8float32 expectedMinValues(0, 1, 2, 3, 4, 5, 5, 5);
    simd8uint32 expectedMinIndices(0, 1, 2, 3, 4, 5, 16, 17);
    ASSERT_TRUE(minValues.is_same_as(expectedMinValues));
    ASSERT_TRUE(minIndices.is_same_as(expectedMinIndices));

    simd8float32 expectedMaxValues(5, 5, 5, 5, 5, 5, 6, 7);
    // the result is not 10,11,12,13,14,5,6,7 because this is the _fast version
    simd8uint32 expectedMaxIndices(10, 11, 12, 13, 14, 15, 6, 7);
    ASSERT_TRUE(maxValues.is_same_as(expectedMaxValues));
    ASSERT_TRUE(maxIndices.is_same_as(expectedMaxIndices));
}

TEST(TestSIMDLib, TestCmpltMinMaxInt) {
    simd8uint32 minValues(0, 0, 0, 0, 0, 0, 0, 0);
    simd8uint32 minIndices(0, 0, 0, 0, 0, 0, 0, 0);
    simd8uint32 maxValues(0, 0, 0, 0, 0, 0, 0, 0);
    simd8uint32 maxIndices(0, 0, 0, 0, 0, 0, 0, 0);

    simd8uint32 candidateValues0(5, 5, 5, 5, 5, 5, 5, 5);
    simd8uint32 candidateIndices0(10, 11, 12, 13, 14, 15, 16, 17);
    simd8uint32 currentValues0(0, 1, 2, 3, 4, 5, 6, 7);
    simd8uint32 currentIndices0(0, 1, 2, 3, 4, 5, 6, 7);

    cmplt_min_max_fast(
            candidateValues0,
            candidateIndices0,
            currentValues0,
            currentIndices0,
            minValues,
            minIndices,
            maxValues,
            maxIndices);

    simd8uint32 expectedMinValues(0, 1, 2, 3, 4, 5, 5, 5);
    simd8uint32 expectedMinIndices(0, 1, 2, 3, 4, 5, 16, 17);
    ASSERT_TRUE(minValues.is_same_as(expectedMinValues));
    ASSERT_TRUE(minIndices.is_same_as(expectedMinIndices));

    simd8uint32 expectedMaxValues(5, 5, 5, 5, 5, 5, 6, 7);
    // the result is not 10,11,12,13,14,5,6,7 because this is the _fast version
    simd8uint32 expectedMaxIndices(10, 11, 12, 13, 14, 15, 6, 7);
    ASSERT_TRUE(maxValues.is_same_as(expectedMaxValues));
    ASSERT_TRUE(maxIndices.is_same_as(expectedMaxIndices));
}

TEST(TestSIMDLib, TestCmpltMinMaxInt16) {
    simd16uint16 minValues(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
    simd16uint16 minIndices(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
    simd16uint16 maxValues(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
    simd16uint16 maxIndices(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);

    simd16uint16 candidateValues0(
            5, 5, 5, 5, 5, 5, 5, 5,
            1005, 1005, 1005, 1005, 1005, 1005, 1005, 1005);
    simd16uint16 candidateIndices0(
            10, 11, 12, 13, 14, 15, 16, 17,
            1010, 1011, 1012, 1013, 1014, 1015, 1016, 1017);
    simd16uint16 currentValues0(
            0, 1, 2, 3, 4, 5, 6, 7,
            1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007);
    simd16uint16 currentIndices0(
            0, 1, 2, 3, 4, 5, 6, 7,
            1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007);

    cmplt_min_max_fast(
            candidateValues0,
            candidateIndices0,
            currentValues0,
            currentIndices0,
            minValues,
            minIndices,
            maxValues,
            maxIndices);

    simd16uint16 expectedMinValues(
            0, 1, 2, 3, 4, 5, 5, 5,
            1000, 1001, 1002, 1003, 1004, 1005, 1005, 1005);
    simd16uint16 expectedMinIndices(
            0, 1, 2, 3, 4, 5, 16, 17,
            1000, 1001, 1002, 1003, 1004, 1005, 1016, 1017);
    ASSERT_TRUE(minValues.is_same_as(expectedMinValues));
    ASSERT_TRUE(minIndices.is_same_as(expectedMinIndices));

    simd16uint16 expectedMaxValues(
            5, 5, 5, 5, 5, 5, 6, 7,
            1005, 1005, 1005, 1005, 1005, 1005, 1006, 1007);
    // the result is not 10,11,12,13,14,5,6,7 because this is the _fast version
    simd16uint16 expectedMaxIndices(
            10, 11, 12, 13, 14, 15, 6, 7,
            1010, 1011, 1012, 1013, 1014, 1015, 1006, 1007);
    ASSERT_TRUE(maxValues.is_same_as(expectedMaxValues));
    ASSERT_TRUE(maxIndices.is_same_as(expectedMaxIndices));
}
205
packages/leann-backend-hnsw/third_party/faiss/tests/test_sliding_ivf.cpp
vendored
Normal file
@@ -0,0 +1,205 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <cstdio>
#include <cstdlib>

#include <memory>
#include <random>
#include <vector>

#include <gtest/gtest.h>

#include <faiss/AutoTune.h>
#include <faiss/IVFlib.h>
#include <faiss/IndexIVF.h>
#include <faiss/clone_index.h>
#include <faiss/index_factory.h>

using namespace faiss;

// dimension of the vectors to index
int d = 32;

// nb of training vectors
size_t nt = 5000;

// number of database points added per window step
size_t nb = 1000;

// nb of queries
size_t nq = 200;

int total_size = 40;
int window_size = 10;

std::vector<float> make_data(size_t n) {
    std::vector<float> database(n * d);
    std::mt19937 rng;
    std::uniform_real_distribution<> distrib;
    for (size_t i = 0; i < n * d; i++) {
        database[i] = distrib(rng);
    }
    return database;
}

std::unique_ptr<Index> make_trained_index(const char* index_type) {
    auto index = std::unique_ptr<Index>(index_factory(d, index_type));
    auto xt = make_data(nt * d);
    index->train(nt, xt.data());
    ParameterSpace().set_index_parameter(index.get(), "nprobe", 4);
    return index;
}

std::vector<idx_t> search_index(Index* index, const float* xq) {
    int k = 10;
    std::vector<idx_t> I(k * nq);
    std::vector<float> D(k * nq);
    index->search(nq, xq, k, D.data(), I.data());
    return I;
}

/*************************************************************
 * Test functions for a given index type
 *************************************************************/

// make a few slices of indexes that can be merged
void make_index_slices(
        const Index* trained_index,
        std::vector<std::unique_ptr<Index>>& sub_indexes) {
    for (int i = 0; i < total_size; i++) {
        sub_indexes.emplace_back(clone_index(trained_index));

        Index* index = sub_indexes.back().get();

        auto xb = make_data(nb * d);
        std::vector<faiss::idx_t> ids(nb);
        std::mt19937 rng;
        std::uniform_int_distribution<> distrib;
        for (int j = 0; j < nb; j++) {
            ids[j] = distrib(rng);
        }
        index->add_with_ids(nb, xb.data(), ids.data());
    }
}

// build merged index explicitly at sliding window position i
Index* make_merged_index(
        const Index* trained_index,
        const std::vector<std::unique_ptr<Index>>& sub_indexes,
        int i) {
    Index* merged_index = clone_index(trained_index);
    for (int j = i - window_size + 1; j <= i; j++) {
        if (j < 0 || j >= total_size)
            continue;
        std::unique_ptr<Index> sub_index(clone_index(sub_indexes[j].get()));
        IndexIVF* ivf0 = ivflib::extract_index_ivf(merged_index);
        IndexIVF* ivf1 = ivflib::extract_index_ivf(sub_index.get());
        ivf0->merge_from(*ivf1, 0);
        merged_index->ntotal = ivf0->ntotal;
    }
    return merged_index;
}
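
// For reference (our arithmetic): at window position i the merged index
// covers slices max(0, i - window_size + 1) .. min(i, total_size - 1), i.e.
// at most the window_size most recent slices.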

int test_sliding_window(const char* index_key) {
    std::unique_ptr<Index> trained_index = make_trained_index(index_key);

    // make the index slices
    std::vector<std::unique_ptr<Index>> sub_indexes;

    make_index_slices(trained_index.get(), sub_indexes);

    // now slide over the windows
    std::unique_ptr<Index> index(clone_index(trained_index.get()));
    ivflib::SlidingIndexWindow window(index.get());

    auto xq = make_data(nq * d);

    for (int i = 0; i < total_size + window_size; i++) {
        // update the index
        window.step(
                i < total_size ? sub_indexes[i].get() : nullptr,
                i >= window_size);

        auto new_res = search_index(index.get(), xq.data());

        std::unique_ptr<Index> merged_index(
                make_merged_index(trained_index.get(), sub_indexes, i));

        auto ref_res = search_index(merged_index.get(), xq.data());

        EXPECT_EQ(ref_res.size(), new_res.size());

        EXPECT_EQ(ref_res, new_res);
    }
    return 0;
}

int test_sliding_invlists(const char* index_key) {
    std::unique_ptr<Index> trained_index = make_trained_index(index_key);

    // make the index slices
    std::vector<std::unique_ptr<Index>> sub_indexes;

    make_index_slices(trained_index.get(), sub_indexes);

    // now slide over the windows
    std::unique_ptr<Index> index(clone_index(trained_index.get()));
    IndexIVF* index_ivf = ivflib::extract_index_ivf(index.get());

    auto xq = make_data(nq * d);

    for (int i = 0; i < total_size + window_size; i++) {
        // update the index
        std::vector<const InvertedLists*> ils;
        for (int j = i - window_size + 1; j <= i; j++) {
            if (j < 0 || j >= total_size)
                continue;
            ils.push_back(
                    ivflib::extract_index_ivf(sub_indexes[j].get())->invlists);
        }
        if (ils.size() == 0)
            continue;

        ConcatenatedInvertedLists* ci =
                new ConcatenatedInvertedLists(ils.size(), ils.data());

        // will be deleted by the index
        index_ivf->replace_invlists(ci, true);

        auto new_res = search_index(index.get(), xq.data());

        std::unique_ptr<Index> merged_index(
                make_merged_index(trained_index.get(), sub_indexes, i));

        auto ref_res = search_index(merged_index.get(), xq.data());

        EXPECT_EQ(ref_res.size(), new_res.size());
        EXPECT_EQ(ref_res, new_res);
    }
    return 0;
}

/*************************************************************
 * Test entry points
 *************************************************************/

TEST(SlidingWindow, IVFFlat) {
    test_sliding_window("IVF32,Flat");
}

TEST(SlidingWindow, PCAIVFFlat) {
    test_sliding_window("PCA24,IVF32,Flat");
}

TEST(SlidingInvlists, IVFFlat) {
    test_sliding_invlists("IVF32,Flat");
}

TEST(SlidingInvlists, PCAIVFFlat) {
    test_sliding_invlists("PCA24,IVF32,Flat");
}
260
packages/leann-backend-hnsw/third_party/faiss/tests/test_threaded_index.cpp
vendored
Normal file
@@ -0,0 +1,260 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <faiss/IndexReplicas.h>
#include <faiss/IndexShards.h>
#include <faiss/impl/ThreadedIndex.h>

#include <gtest/gtest.h>
#include <chrono>
#include <memory>
#include <thread>
#include <vector>

namespace {

struct TestException : public std::exception {};

using idx_t = faiss::idx_t;

struct MockIndex : public faiss::Index {
    explicit MockIndex(idx_t d) : faiss::Index(d) {
        resetMock();
    }

    void resetMock() {
        flag = false;
        nCalled = 0;
        xCalled = nullptr;
        kCalled = 0;
        distancesCalled = nullptr;
        labelsCalled = nullptr;
    }

    void add(idx_t n, const float* x) override {
        nCalled = n;
        xCalled = x;
    }

    void search(
            idx_t n,
            const float* x,
            idx_t k,
            float* distances,
            idx_t* labels,
            const faiss::SearchParameters* params) const override {
        FAISS_THROW_IF_NOT(!params);
        nCalled = n;
        xCalled = x;
        kCalled = k;
        distancesCalled = distances;
        labelsCalled = labels;
    }

    void reset() override {}

    bool flag;

    mutable idx_t nCalled;
    mutable const float* xCalled;
    mutable idx_t kCalled;
    mutable float* distancesCalled;
    mutable idx_t* labelsCalled;
};

template <typename IndexT>
struct MockThreadedIndex : public faiss::ThreadedIndex<IndexT> {
    using idx_t = faiss::idx_t;

    explicit MockThreadedIndex(bool threaded)
            : faiss::ThreadedIndex<IndexT>(threaded) {}

    void add(idx_t, const float*) override {}
    void search(
            idx_t,
            const float*,
            idx_t,
            float*,
            idx_t*,
            const faiss::SearchParameters*) const override {}
    void reset() override {}
};

} // namespace
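
// The tests below rely on what we take to be the runOnIndex contract: a
// single failing lambda has its exception rethrown as-is, several failures
// are aggregated into one faiss::FaissException, and in both cases the
// remaining lambdas still run to completion.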

TEST(ThreadedIndex, SingleException) {
    std::vector<std::unique_ptr<MockIndex>> idxs;

    for (int i = 0; i < 3; ++i) {
        idxs.emplace_back(new MockIndex(1));
    }

    auto fn = [](int i, MockIndex* index) {
        if (i == 1) {
            throw TestException();
        } else {
            std::this_thread::sleep_for(std::chrono::milliseconds(i * 250));

            index->flag = true;
        }
    };

    // Try with threading and without
    for (bool threaded : {true, false}) {
        // clear flags
        for (auto& idx : idxs) {
            idx->resetMock();
        }

        MockThreadedIndex<MockIndex> ti(threaded);
        for (auto& idx : idxs) {
            ti.addIndex(idx.get());
        }

        // The second index should throw
        EXPECT_THROW(ti.runOnIndex(fn), TestException);

        // Index 0 and 2 should have processed
        EXPECT_TRUE(idxs[0]->flag);
        EXPECT_TRUE(idxs[2]->flag);
    }
}

TEST(ThreadedIndex, MultipleException) {
    std::vector<std::unique_ptr<MockIndex>> idxs;

    for (int i = 0; i < 3; ++i) {
        idxs.emplace_back(new MockIndex(1));
    }

    auto fn = [](int i, MockIndex* index) {
        if (i < 2) {
            throw TestException();
        } else {
            std::this_thread::sleep_for(std::chrono::milliseconds(i * 250));

            index->flag = true;
        }
    };

    // Try with threading and without
    for (bool threaded : {true, false}) {
        // clear flags
        for (auto& idx : idxs) {
            idx->resetMock();
        }

        MockThreadedIndex<MockIndex> ti(threaded);
        for (auto& idx : idxs) {
            ti.addIndex(idx.get());
        }

        // Multiple indices threw an exception that was aggregated into a
        // FaissException
        EXPECT_THROW(ti.runOnIndex(fn), faiss::FaissException);

        // Index 2 should have processed
        EXPECT_TRUE(idxs[2]->flag);
    }
}

TEST(ThreadedIndex, TestReplica) {
    int numReplicas = 5;
    int n = 10 * numReplicas;
    int d = 3;
    int k = 6;

    // Try with threading and without
    for ([[maybe_unused]] const bool threaded : {true, false}) {
        std::vector<std::unique_ptr<MockIndex>> idxs;
        faiss::IndexReplicas replica(d);

        for (int i = 0; i < numReplicas; ++i) {
            idxs.emplace_back(new MockIndex(d));
            replica.addIndex(idxs.back().get());
        }

        std::vector<float> x(n * d);
        std::vector<float> distances(n * k);
        std::vector<faiss::idx_t> labels(n * k);

        replica.add(n, x.data());

        for (int i = 0; i < idxs.size(); ++i) {
            EXPECT_EQ(idxs[i]->nCalled, n);
            EXPECT_EQ(idxs[i]->xCalled, x.data());
        }

        for (auto& idx : idxs) {
            idx->resetMock();
        }

        replica.search(n, x.data(), k, distances.data(), labels.data());

        for (int i = 0; i < idxs.size(); ++i) {
            auto perReplica = n / idxs.size();

            EXPECT_EQ(idxs[i]->nCalled, perReplica);
            EXPECT_EQ(idxs[i]->xCalled, x.data() + i * perReplica * d);
            EXPECT_EQ(idxs[i]->kCalled, k);
            EXPECT_EQ(
                    idxs[i]->distancesCalled,
                    distances.data() + (i * perReplica) * k);
            EXPECT_EQ(
                    idxs[i]->labelsCalled,
                    labels.data() + (i * perReplica) * k);
        }
    }
}

TEST(ThreadedIndex, TestShards) {
    int numShards = 7;
    int d = 3;
    int n = 10 * numShards;
    int k = 6;

    // Try with threading and without
    for (bool threaded : {true, false}) {
        std::vector<std::unique_ptr<MockIndex>> idxs;
        faiss::IndexShards shards(d, threaded);

        for (int i = 0; i < numShards; ++i) {
            idxs.emplace_back(new MockIndex(d));
            shards.addIndex(idxs.back().get());
        }

        std::vector<float> x(n * d);
        std::vector<float> distances(n * k);
        std::vector<faiss::idx_t> labels(n * k);

        shards.add(n, x.data());

        for (int i = 0; i < idxs.size(); ++i) {
            auto perShard = n / idxs.size();

            EXPECT_EQ(idxs[i]->nCalled, perShard);
            EXPECT_EQ(idxs[i]->xCalled, x.data() + i * perShard * d);
        }

        for (auto& idx : idxs) {
            idx->resetMock();
        }

        shards.search(n, x.data(), k, distances.data(), labels.data());

        for (int i = 0; i < idxs.size(); ++i) {
            EXPECT_EQ(idxs[i]->nCalled, n);
            EXPECT_EQ(idxs[i]->xCalled, x.data());
            EXPECT_EQ(idxs[i]->kCalled, k);
            // There is a temporary buffer used for shards
            EXPECT_EQ(
                    idxs[i]->distancesCalled,
                    idxs[0]->distancesCalled + i * k * n);
            EXPECT_EQ(idxs[i]->labelsCalled, idxs[0]->labelsCalled + i * k * n);
        }
    }
}
147
packages/leann-backend-hnsw/third_party/faiss/tests/test_transfer_invlists.cpp
vendored
Normal file
@@ -0,0 +1,147 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <cstdio>
#include <cstdlib>
#include <memory>

#include <gtest/gtest.h>

#include <faiss/AutoTune.h>
#include <faiss/IVFlib.h>
#include <faiss/IndexIVFFlat.h>
#include <faiss/clone_index.h>
#include <faiss/impl/io.h>
#include <faiss/index_factory.h>
#include <faiss/index_io.h>
#include <faiss/utils/random.h>

namespace {

// parameters to use for the test
int d = 64;
size_t nb = 1000;
size_t nq = 100;
size_t nt = 500;
int k = 10;
int nlist = 40;

using namespace faiss;

typedef faiss::idx_t idx_t;

std::vector<float> get_data(size_t nb, int seed) {
    std::vector<float> x(nb * d);
    float_randn(x.data(), nb * d, seed);
    return x;
}

void test_index_type(const char* factory_string) {
    // transfer inverted lists in nslice slices
    int nslice = 3;

    /****************************************************************
     * trained reference index
     ****************************************************************/

    std::unique_ptr<Index> trained(index_factory(d, factory_string));

    {
        auto xt = get_data(nt, 123);
        trained->train(nt, xt.data());
    }

    // sample nq query vectors to check if results are the same
    auto xq = get_data(nq, 818);

    /****************************************************************
     * source index
     ***************************************************************/
    std::unique_ptr<Index> src_index(clone_index(trained.get()));

    { // add some data to source index
        auto xb = get_data(nb, 245);
        src_index->add(nb, xb.data());
    }

    ParameterSpace().set_index_parameter(src_index.get(), "nprobe", 4);

    // remember reference search result on source index
    std::vector<idx_t> Iref(nq * k);
    std::vector<float> Dref(nq * k);
    src_index->search(nq, xq.data(), k, Dref.data(), Iref.data());

    /****************************************************************
     * destination index -- should be replaced by source index
     ***************************************************************/

    std::unique_ptr<Index> dst_index(clone_index(trained.get()));

    { // initial state: filled in with some garbage
        int nb2 = nb + 10;
        auto xb = get_data(nb2, 366);
        dst_index->add(nb2, xb.data());
    }

    std::vector<idx_t> Inew(nq * k);
    std::vector<float> Dnew(nq * k);

    ParameterSpace().set_index_parameter(dst_index.get(), "nprobe", 4);

    // transfer from source to destination in nslice slices
    for (int sl = 0; sl < nslice; sl++) {
        // so far, the indexes are different
        dst_index->search(nq, xq.data(), k, Dnew.data(), Inew.data());
        EXPECT_TRUE(Iref != Inew);
        EXPECT_TRUE(Dref != Dnew);

        // range of inverted list indices to transfer
        long i0 = sl * nlist / nslice;
        long i1 = (sl + 1) * nlist / nslice;
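        // e.g. with nlist = 40 and nslice = 3, the three ranges are
        // [0, 13), [13, 26) and [26, 40) (integer division)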

        std::vector<uint8_t> data_to_transfer;
        {
            std::unique_ptr<ArrayInvertedLists> il(
                    ivflib::get_invlist_range(src_index.get(), i0, i1));
            // serialize inverted lists
            VectorIOWriter wr;
            write_InvertedLists(il.get(), &wr);
            data_to_transfer.swap(wr.data);
        }

        // transfer data here from source machine to dest machine

        {
            VectorIOReader reader;
            reader.data.swap(data_to_transfer);

            // deserialize inverted lists
            std::unique_ptr<ArrayInvertedLists> il(
                    dynamic_cast<ArrayInvertedLists*>(
                            read_InvertedLists(&reader)));

            // swap inverted lists. Block searches here!
            { ivflib::set_invlist_range(dst_index.get(), i0, i1, il.get()); }
        }
    }
    EXPECT_EQ(dst_index->ntotal, src_index->ntotal);

    // now, the indexes are the same
    dst_index->search(nq, xq.data(), k, Dnew.data(), Inew.data());
    EXPECT_TRUE(Iref == Inew);
    EXPECT_TRUE(Dref == Dnew);
}

} // namespace

TEST(TRANS, IVFFlat) {
    test_index_type("IVF40,Flat");
}

TEST(TRANS, IVFFlatPreproc) {
    test_index_type("PCAR32,IVF40,Flat");
}
38
packages/leann-backend-hnsw/third_party/faiss/tests/test_util.h
vendored
Normal file
@@ -0,0 +1,38 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

#ifndef FAISS_TEST_UTIL_H
#define FAISS_TEST_UTIL_H

#include <faiss/IndexIVFPQ.h>
#include <unistd.h>

struct Tempfilename {
    pthread_mutex_t* mutex;
    std::string filename;

    Tempfilename(pthread_mutex_t* mutex, std::string filename_template) {
        this->mutex = mutex;
        this->filename = filename_template;
        pthread_mutex_lock(mutex);
        int fd = mkstemp(&this->filename[0]);
        close(fd);
        pthread_mutex_unlock(mutex);
    }

    ~Tempfilename() {
        // remove the file if it still exists (access() returns 0 on success)
        if (access(filename.c_str(), F_OK) == 0) {
            unlink(filename.c_str());
        }
    }

    const char* c_str() {
        return filename.c_str();
    }
};
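
// Illustrative usage (a sketch, not from this header; the trailing XXXXXX
// is required by mkstemp):
//
//     pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
//     Tempfilename tmpfile(&mutex, "/tmp/faiss_test_XXXXXX");
//     faiss::write_index(&index, tmpfile.c_str());
//
// The mutex serializes mkstemp calls across threads, and the file is removed
// when the object goes out of scope.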

#endif // FAISS_TEST_UTIL_H
19
packages/leann-backend-hnsw/third_party/faiss/tests/test_utils.cpp
vendored
Normal file
@@ -0,0 +1,19 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <gtest/gtest.h>

#include <faiss/Index.h>
#include <faiss/utils/utils.h>

TEST(TestUtils, get_version) {
    std::string version = std::to_string(FAISS_VERSION_MAJOR) + "." +
            std::to_string(FAISS_VERSION_MINOR) + "." +
            std::to_string(FAISS_VERSION_PATCH);

    EXPECT_EQ(version, faiss::get_version());
}
243
packages/leann-backend-hnsw/third_party/faiss/tests/test_zerocopy.cpp
vendored
Normal file
@@ -0,0 +1,243 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <gtest/gtest.h>

#include <cstddef>
#include <cstdint>
#include <random>
#include <vector>

#include <faiss/IndexBinaryFlat.h>
#include <faiss/IndexFlat.h>
#include <faiss/impl/io.h>
#include <faiss/impl/zerocopy_io.h>
#include <faiss/index_io.h>

namespace {

std::vector<float> make_data(const size_t n, const size_t d, size_t seed) {
    std::vector<float> database(n * d);
    std::mt19937 rng(seed);
    std::uniform_real_distribution<float> distrib;

    for (size_t i = 0; i < n * d; i++) {
        database[i] = distrib(rng);
    }
    return database;
}

std::vector<uint8_t> make_binary_data(
        const size_t n,
        const size_t d,
        size_t seed) {
    std::vector<uint8_t> database(n * d);
    std::mt19937 rng(seed);
    std::uniform_int_distribution<uint8_t> distrib(0, 255);

    for (size_t i = 0; i < n * d; i++) {
        database[i] = distrib(rng);
    }
    return database;
}

} // namespace

// the logic is the following:
// 1. generate two flatcodes-based indices, Index1 and Index2
// 2. serialize both indices into std::vector<> buffers, Buf1 and Buf2
// 3. deserialize Index1 using zero-copy feature on Buf1 into Index1ZC
// 4. ensure that Index1ZC acts as Index2 if we write the data from Buf2
//    on top of the existing Buf1
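// Step 4 works only because (as we understand it) ZeroCopyIOReader hands out
// pointers into the caller's buffer instead of copying, so the deserialized
// index aliases `buffer` and observes bytes written into it later, provided
// the buffer is never moved or resized.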

TEST(TestZeroCopy, zerocopy_flatcodes) {
    // generate data
    const size_t nt = 1000;
    const size_t nq = 10;
    const size_t d = 32;
    const size_t k = 25;

    std::vector<float> xt1 = make_data(nt, d, 123);
    std::vector<float> xt2 = make_data(nt, d, 456);
    std::vector<float> xq = make_data(nq, d, 789);

    // ensure that the data is different
    ASSERT_NE(xt1, xt2);

    // make index1 and create reference results
    faiss::IndexFlatL2 index1(d);
    index1.train(nt, xt1.data());
    index1.add(nt, xt1.data());

    std::vector<float> ref_dis_1(k * nq);
    std::vector<faiss::idx_t> ref_ids_1(k * nq);
    index1.search(nq, xq.data(), k, ref_dis_1.data(), ref_ids_1.data());

    // make index2 and create reference results
    faiss::IndexFlatL2 index2(d);
    index2.train(nt, xt2.data());
    index2.add(nt, xt2.data());

    std::vector<float> ref_dis_2(k * nq);
    std::vector<faiss::idx_t> ref_ids_2(k * nq);
    index2.search(nq, xq.data(), k, ref_dis_2.data(), ref_ids_2.data());

    // ensure that the results are different
    ASSERT_NE(ref_dis_1, ref_dis_2);
    ASSERT_NE(ref_ids_1, ref_ids_2);

    // serialize both in the form of vectors
    faiss::VectorIOWriter wr1;
    faiss::write_index(&index1, &wr1);

    faiss::VectorIOWriter wr2;
    faiss::write_index(&index2, &wr2);

    ASSERT_EQ(wr1.data.size(), wr2.data.size());

    // clone a buffer
    std::vector<uint8_t> buffer = wr1.data;

    // create a zero-copy index
    faiss::ZeroCopyIOReader reader(buffer.data(), buffer.size());
    std::unique_ptr<faiss::Index> index1zc(faiss::read_index(&reader));

    ASSERT_NE(index1zc, nullptr);

    // perform a search
    std::vector<float> cand_dis_1(k * nq);
    std::vector<faiss::idx_t> cand_ids_1(k * nq);
    index1zc->search(nq, xq.data(), k, cand_dis_1.data(), cand_ids_1.data());

    // match vs ref1
    ASSERT_EQ(ref_ids_1, cand_ids_1);
    ASSERT_EQ(ref_dis_1, cand_dis_1);

    // overwrite buffer without moving it
    for (size_t i = 0; i < buffer.size(); i++) {
        buffer[i] = wr2.data[i];
    }

    // perform a search
    std::vector<float> cand_dis_2(k * nq);
    std::vector<faiss::idx_t> cand_ids_2(k * nq);
    index1zc->search(nq, xq.data(), k, cand_dis_2.data(), cand_ids_2.data());

    // match vs ref2
    ASSERT_EQ(ref_ids_2, cand_ids_2);
    ASSERT_EQ(ref_dis_2, cand_dis_2);

    // overwrite again
    for (size_t i = 0; i < buffer.size(); i++) {
        buffer[i] = wr1.data[i];
    }

    // perform a search
    std::vector<float> cand_dis_3(k * nq);
    std::vector<faiss::idx_t> cand_ids_3(k * nq);
    index1zc->search(nq, xq.data(), k, cand_dis_3.data(), cand_ids_3.data());

    // match vs ref1
    ASSERT_EQ(ref_ids_1, cand_ids_3);
    ASSERT_EQ(ref_dis_1, cand_dis_3);
}

TEST(TestZeroCopy, zerocopy_binary_flatcodes) {
    // generate data
    const size_t nt = 1000;
    const size_t nq = 10;
    // in bits
    const size_t d = 64;
    // in bytes
    const size_t d8 = (d + 7) / 8;
    const size_t k = 25;

    std::vector<uint8_t> xt1 = make_binary_data(nt, d8, 123);
    std::vector<uint8_t> xt2 = make_binary_data(nt, d8, 456);
    std::vector<uint8_t> xq = make_binary_data(nq, d8, 789);

    // ensure that the data is different
    ASSERT_NE(xt1, xt2);

    // make index1 and create reference results
    faiss::IndexBinaryFlat index1(d);
    index1.train(nt, xt1.data());
    index1.add(nt, xt1.data());

    std::vector<int32_t> ref_dis_1(k * nq);
    std::vector<faiss::idx_t> ref_ids_1(k * nq);
    index1.search(nq, xq.data(), k, ref_dis_1.data(), ref_ids_1.data());

    // make index2 and create reference results
    faiss::IndexBinaryFlat index2(d);
    index2.train(nt, xt2.data());
    index2.add(nt, xt2.data());

    std::vector<int32_t> ref_dis_2(k * nq);
    std::vector<faiss::idx_t> ref_ids_2(k * nq);
    index2.search(nq, xq.data(), k, ref_dis_2.data(), ref_ids_2.data());

    // ensure that the results are different
    ASSERT_NE(ref_dis_1, ref_dis_2);
    ASSERT_NE(ref_ids_1, ref_ids_2);

    // serialize both in the form of vectors
    faiss::VectorIOWriter wr1;
    faiss::write_index_binary(&index1, &wr1);

    faiss::VectorIOWriter wr2;
    faiss::write_index_binary(&index2, &wr2);

    ASSERT_EQ(wr1.data.size(), wr2.data.size());

    // clone a buffer
    std::vector<uint8_t> buffer = wr1.data;

    // create a zero-copy index
    faiss::ZeroCopyIOReader reader(buffer.data(), buffer.size());
    std::unique_ptr<faiss::IndexBinary> index1zc(
            faiss::read_index_binary(&reader));

    ASSERT_NE(index1zc, nullptr);

    // perform a search
    std::vector<int32_t> cand_dis_1(k * nq);
    std::vector<faiss::idx_t> cand_ids_1(k * nq);
    index1zc->search(nq, xq.data(), k, cand_dis_1.data(), cand_ids_1.data());

    // match vs ref1
    ASSERT_EQ(ref_ids_1, cand_ids_1);
    ASSERT_EQ(ref_dis_1, cand_dis_1);

    // overwrite buffer without moving it
    for (size_t i = 0; i < buffer.size(); i++) {
        buffer[i] = wr2.data[i];
    }

    // perform a search
    std::vector<int32_t> cand_dis_2(k * nq);
    std::vector<faiss::idx_t> cand_ids_2(k * nq);
    index1zc->search(nq, xq.data(), k, cand_dis_2.data(), cand_ids_2.data());

    // match vs ref2
    ASSERT_EQ(ref_ids_2, cand_ids_2);
    ASSERT_EQ(ref_dis_2, cand_dis_2);

    // overwrite again
    for (size_t i = 0; i < buffer.size(); i++) {
        buffer[i] = wr1.data[i];
    }

    // perform a search
    std::vector<int32_t> cand_dis_3(k * nq);
    std::vector<faiss::idx_t> cand_ids_3(k * nq);
    index1zc->search(nq, xq.data(), k, cand_dis_3.data(), cand_ids_3.data());

    // match vs ref1
    ASSERT_EQ(ref_ids_1, cand_ids_3);
    ASSERT_EQ(ref_dis_1, cand_dis_3);
}
427
packages/leann-backend-hnsw/third_party/faiss/tests/torch_test_contrib.py
vendored
Normal file
@@ -0,0 +1,427 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
#
|
||||
# This source code is licensed under the MIT license found in the
|
||||
# LICENSE file in the root directory of this source tree.
|
||||
|
||||
import torch # usort: skip
|
||||
import unittest # usort: skip
|
||||
import numpy as np # usort: skip
|
||||
|
||||
import faiss # usort: skip
|
||||
import faiss.contrib.torch_utils # usort: skip
|
||||
from faiss.contrib import datasets
|
||||
from faiss.contrib.torch import clustering, quantization
|
||||
|
||||
|
||||
|
||||
|
||||
class TestTorchUtilsCPU(unittest.TestCase):
|
||||
# tests add, search
|
||||
def test_lookup(self):
|
||||
d = 128
|
||||
index = faiss.IndexFlatL2(d)
|
||||
|
||||
# Add to CPU index with torch CPU
|
||||
xb_torch = torch.rand(10000, d)
|
||||
index.add(xb_torch)
|
||||
|
||||
# Test reconstruct
|
||||
y_torch = index.reconstruct(10)
|
||||
self.assertTrue(torch.equal(y_torch, xb_torch[10]))
|
||||
|
||||
# Add to CPU index with numpy CPU
|
||||
xb_np = torch.rand(500, d).numpy()
|
||||
index.add(xb_np)
|
||||
self.assertEqual(index.ntotal, 10500)
|
||||
|
||||
y_np = np.zeros(d, dtype=np.float32)
|
||||
index.reconstruct(10100, y_np)
|
||||
self.assertTrue(np.array_equal(y_np, xb_np[100]))
|
||||
|
||||
# Search with np cpu
|
||||
xq_torch = torch.rand(10, d, dtype=torch.float32)
|
||||
d_np, I_np = index.search(xq_torch.numpy(), 5)
|
||||
|
||||
# Search with torch cpu
|
||||
d_torch, I_torch = index.search(xq_torch, 5)
|
||||
|
||||
# The two should be equivalent
|
||||
self.assertTrue(np.array_equal(d_np, d_torch.numpy()))
|
||||
self.assertTrue(np.array_equal(I_np, I_torch.numpy()))
|
||||
|
||||
# Search with np cpu using pre-allocated arrays
|
||||
d_np_input = np.zeros((10, 5), dtype=np.float32)
|
||||
I_np_input = np.zeros((10, 5), dtype=np.int64)
|
||||
index.search(xq_torch.numpy(), 5, d_np_input, I_np_input)
|
||||
|
||||
self.assertTrue(np.array_equal(d_np, d_np_input))
|
||||
self.assertTrue(np.array_equal(I_np, I_np_input))
|
||||
|
||||
# Search with torch cpu using pre-allocated arrays
|
||||
d_torch_input = torch.zeros(10, 5, dtype=torch.float32)
|
||||
I_torch_input = torch.zeros(10, 5, dtype=torch.int64)
|
||||
index.search(xq_torch, 5, d_torch_input, I_torch_input)
|
||||
|
||||
self.assertTrue(np.array_equal(d_torch_input.numpy(), d_np))
|
||||
self.assertTrue(np.array_equal(I_torch_input.numpy(), I_np))
|
||||
|
||||
# tests train, add_with_ids
|
||||
def test_train_add_with_ids(self):
|
||||
d = 32
|
||||
nlist = 5
|
||||
|
||||
quantizer = faiss.IndexFlatL2(d)
|
||||
index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_L2)
|
||||
xb = torch.rand(1000, d, dtype=torch.float32)
|
||||
index.train(xb)
|
||||
|
||||
# Test add_with_ids with torch cpu
|
||||
ids = torch.arange(1000, 1000 + xb.shape[0], dtype=torch.int64)
|
||||
index.add_with_ids(xb, ids)
|
||||
_, I = index.search(xb[10:20], 1)
|
||||
self.assertTrue(torch.equal(I.view(10), ids[10:20]))
|
||||
|
||||
# Test add_with_ids with numpy
|
||||
index.reset()
|
||||
index.train(xb.numpy())
|
||||
index.add_with_ids(xb.numpy(), ids.numpy())
|
||||
_, I = index.search(xb.numpy()[10:20], 1)
|
||||
self.assertTrue(np.array_equal(I.reshape(10), ids.numpy()[10:20]))
|
||||
|
||||
# tests reconstruct, reconstruct_n
|
||||
def test_reconstruct(self):
|
||||
d = 32
|
||||
index = faiss.IndexFlatL2(d)
|
||||
|
||||
xb = torch.rand(100, d, dtype=torch.float32)
|
||||
index.add(xb)
|
||||
|
||||
# Test reconstruct with torch cpu (native return)
|
||||
y = index.reconstruct(7)
|
||||
self.assertTrue(torch.equal(xb[7], y))
|
||||
|
||||
# Test reconstruct with numpy output provided
|
||||
y = np.empty(d, dtype=np.float32)
|
||||
index.reconstruct(11, y)
|
||||
self.assertTrue(np.array_equal(xb.numpy()[11], y))
|
||||
|
||||
# Test reconstruct with torch cpu output providesd
|
||||
y = torch.empty(d, dtype=torch.float32)
|
||||
index.reconstruct(12, y)
|
||||
self.assertTrue(torch.equal(xb[12], y))
|
||||
|
||||
# Test reconstruct_n with torch cpu (native return)
|
||||
y = index.reconstruct_n(10, 10)
|
||||
self.assertTrue(torch.equal(xb[10:20], y))
|
||||
|
||||
# Test reconstruct with numpy output provided
|
||||
y = np.empty((10, d), dtype=np.float32)
|
||||
index.reconstruct_n(20, 10, y)
|
||||
self.assertTrue(np.array_equal(xb.cpu().numpy()[20:30], y))
|
||||
|
||||
# Test reconstruct_n with torch cpu output provided
|
||||
y = torch.empty(10, d, dtype=torch.float32)
|
||||
index.reconstruct_n(40, 10, y)
|
||||
self.assertTrue(torch.equal(xb[40:50].cpu(), y))
|
||||
|
||||
# tests assign
|
||||
def test_assign(self):
|
||||
d = 32
|
||||
index = faiss.IndexFlatL2(d)
|
||||
xb = torch.rand(1000, d, dtype=torch.float32)
|
||||
index.add(xb)
|
||||
|
||||
index_ref = faiss.IndexFlatL2(d)
|
||||
index_ref.add(xb.numpy())
|
||||
|
||||
# Test assign with native cpu output
|
||||
xq = torch.rand(10, d, dtype=torch.float32)
|
||||
labels = index.assign(xq, 5)
|
||||
labels_ref = index_ref.assign(xq.cpu(), 5)
|
||||
|
||||
self.assertTrue(torch.equal(labels, labels_ref))
|
||||
|
||||
# Test assign with np input
|
||||
labels = index.assign(xq.numpy(), 5)
|
||||
labels_ref = index_ref.assign(xq.numpy(), 5)
|
||||
self.assertTrue(np.array_equal(labels, labels_ref))
|
||||
|
||||
# Test assign with numpy output provided
|
||||
labels = np.empty((xq.shape[0], 5), dtype='int64')
|
||||
index.assign(xq.numpy(), 5, labels)
|
||||
self.assertTrue(np.array_equal(labels, labels_ref))
|
||||
|
||||
# Test assign with torch cpu output provided
|
||||
labels = torch.empty(xq.shape[0], 5, dtype=torch.int64)
|
||||
index.assign(xq, 5, labels)
|
||||
labels_ref = index_ref.assign(xq, 5)
|
||||
self.assertTrue(torch.equal(labels, labels_ref))
|
||||
|
||||
# tests remove_ids
|
||||
def test_remove_ids(self):
|
||||
# only implemented for cpu index + numpy at the moment
|
||||
d = 32
|
||||
quantizer = faiss.IndexFlatL2(d)
|
||||
index = faiss.IndexIVFFlat(quantizer, d, 5)
|
||||
index.make_direct_map()
|
||||
index.set_direct_map_type(faiss.DirectMap.Hashtable)
|
||||
|
||||
xb = torch.rand(1000, d, dtype=torch.float32)
|
||||
ids = torch.arange(1000, 1000 + xb.shape[0], dtype=torch.int64)
|
||||
index.train(xb)
|
||||
index.add_with_ids(xb, ids)
|
||||
|
||||
ids_remove = np.array([1010], dtype=np.int64)
|
||||
index.remove_ids(ids_remove)
|
||||
|
||||
# We should find this
|
||||
y = index.reconstruct(1011)
|
||||
self.assertTrue(np.array_equal(xb[11].numpy(), y))
|
||||
|
||||
# We should not find this
|
||||
with self.assertRaises(RuntimeError):
|
||||
y = index.reconstruct(1010)
|
||||
|
||||
# Torch not yet supported
|
||||
ids_remove = torch.tensor([1012], dtype=torch.int64)
|
||||
with self.assertRaises(AssertionError):
|
||||
index.remove_ids(ids_remove)
|
||||
|
||||
# tests update_vectors
|
||||
def test_update_vectors(self):
|
||||
d = 32
|
||||
quantizer_np = faiss.IndexFlatL2(d)
|
||||
index_np = faiss.IndexIVFFlat(quantizer_np, d, 5)
|
||||
index_np.make_direct_map()
|
||||
index_np.set_direct_map_type(faiss.DirectMap.Hashtable)
|
||||
|
||||
quantizer_torch = faiss.IndexFlatL2(d)
|
||||
index_torch = faiss.IndexIVFFlat(quantizer_torch, d, 5)
|
||||
index_torch.make_direct_map()
|
||||
index_torch.set_direct_map_type(faiss.DirectMap.Hashtable)
|
||||
|
||||
xb = torch.rand(1000, d, dtype=torch.float32)
|
||||
ids = torch.arange(1000, 1000 + xb.shape[0], dtype=torch.int64)
|
||||
|
||||
index_np.train(xb.numpy())
|
||||
index_np.add_with_ids(xb.numpy(), ids.numpy())
|
||||
|
||||
index_torch.train(xb)
|
||||
index_torch.add_with_ids(xb, ids)
|
||||
|
||||
xb_up = torch.rand(10, d, dtype=torch.float32)
|
||||
ids_up = ids[0:10]
|
||||
|
||||
index_np.update_vectors(ids_up.numpy(), xb_up.numpy())
|
||||
index_torch.update_vectors(ids_up, xb_up)
|
||||
|
||||
xq = torch.rand(10, d, dtype=torch.float32)
|
||||
|
||||
D_np, I_np = index_np.search(xq.numpy(), 5)
|
||||
D_torch, I_torch = index_torch.search(xq, 5)
|
||||
|
||||
self.assertTrue(np.array_equal(D_np, D_torch.numpy()))
|
||||
self.assertTrue(np.array_equal(I_np, I_torch.numpy()))
|
||||
|

    # tests range_search
    def test_range_search(self):
        torch.manual_seed(10)
        d = 32
        index = faiss.IndexFlatL2(d)
        xb = torch.rand(100, d, dtype=torch.float32)
        index.add(xb)

        # torch cpu as ground truth
        thresh = 2.9
        xq = torch.rand(10, d, dtype=torch.float32)
        lims, D, I = index.range_search(xq, thresh)

        # compare against np
        lims_np, D_np, I_np = index.range_search(xq.numpy(), thresh)

        self.assertTrue(np.array_equal(lims.numpy(), lims_np))
        self.assertTrue(np.array_equal(D.numpy(), D_np))
        self.assertTrue(np.array_equal(I.numpy(), I_np))
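
        # Note (added): range_search returns results in CSR-like form; lims has
        # nq + 1 entries and the matches for query i are
        # D[lims[i]:lims[i + 1]], I[lims[i]:lims[i + 1]].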

    # tests search_and_reconstruct
    def test_search_and_reconstruct(self):
        d = 32
        nlist = 10
        M = 4
        k = 5
        quantizer = faiss.IndexFlatL2(d)
        index = faiss.IndexIVFPQ(quantizer, d, nlist, M, 4)

        xb = torch.rand(1000, d, dtype=torch.float32)
        index.train(xb)

        # different set
        xb = torch.rand(500, d, dtype=torch.float32)
        index.add(xb)

        # torch cpu as ground truth
        xq = torch.rand(10, d, dtype=torch.float32)
        D, I, R = index.search_and_reconstruct(xq, k)

        # compare against numpy
        D_np, I_np, R_np = index.search_and_reconstruct(xq.numpy(), k)

        self.assertTrue(np.array_equal(D.numpy(), D_np))
        self.assertTrue(np.array_equal(I.numpy(), I_np))
        self.assertTrue(np.array_equal(R.numpy(), R_np))

        # numpy output buffers provided
        D_input = np.zeros((xq.shape[0], k), dtype=np.float32)
        I_input = np.zeros((xq.shape[0], k), dtype=np.int64)
        R_input = np.zeros((xq.shape[0], k, d), dtype=np.float32)

        index.search_and_reconstruct(xq.numpy(), k, D_input, I_input, R_input)

        self.assertTrue(np.array_equal(D.numpy(), D_input))
        self.assertTrue(np.array_equal(I.numpy(), I_input))
        self.assertTrue(np.array_equal(R.numpy(), R_input))

        # torch output buffers provided
        D_input = torch.zeros(xq.shape[0], k, dtype=torch.float32)
        I_input = torch.zeros(xq.shape[0], k, dtype=torch.int64)
        R_input = torch.zeros(xq.shape[0], k, d, dtype=torch.float32)

        index.search_and_reconstruct(xq, k, D_input, I_input, R_input)

        self.assertTrue(torch.equal(D, D_input))
        self.assertTrue(torch.equal(I, I_input))
        self.assertTrue(torch.equal(R, R_input))

    def test_search_preassigned(self):
        ds = datasets.SyntheticDataset(32, 1000, 100, 10)
        index = faiss.index_factory(32, "IVF20,PQ4np")
        index.train(ds.get_train())
        index.add(ds.get_database())
        index.nprobe = 4
        Dref, Iref = index.search(ds.get_queries(), 10)
        quantizer = faiss.clone_index(index.quantizer)

        # mutilate the index's quantizer: if search_preassigned consulted it,
        # the results below would not match the reference
        index.quantizer.reset()
        index.quantizer.add(np.zeros((20, 32), dtype='float32'))

        # test numpy codepath
        Dq, Iq = quantizer.search(ds.get_queries(), 4)
        Dref2, Iref2 = index.search_preassigned(ds.get_queries(), 10, Iq, Dq)
        np.testing.assert_array_equal(Iref, Iref2)
        np.testing.assert_array_equal(Dref, Dref2)

        # test torch codepath
        xq = torch.from_numpy(ds.get_queries())
        Dq, Iq = quantizer.search(xq, 4)
        Dref2, Iref2 = index.search_preassigned(xq, 10, Iq, Dq)
        np.testing.assert_array_equal(Iref, Iref2.numpy())
        np.testing.assert_array_equal(Dref, Dref2.numpy())

    # tests sa_encode, sa_decode
    def test_sa_encode_decode(self):
        d = 16
        index = faiss.IndexScalarQuantizer(d, faiss.ScalarQuantizer.QT_8bit)

        xb = torch.rand(1000, d, dtype=torch.float32)
        index.train(xb)

        # torch cpu as ground truth
        nq = 10
        xq = torch.rand(nq, d, dtype=torch.float32)
        encoded_torch = index.sa_encode(xq)

        # numpy cpu
        encoded_np = index.sa_encode(xq.numpy())

        self.assertTrue(np.array_equal(encoded_torch.numpy(), encoded_np))

        decoded_torch = index.sa_decode(encoded_torch)
        decoded_np = index.sa_decode(encoded_np)

        self.assertTrue(torch.equal(decoded_torch, torch.from_numpy(decoded_np)))

        # torch cpu as output parameter
        encoded_torch_param = torch.zeros(nq, d, dtype=torch.uint8)
        index.sa_encode(xq, encoded_torch_param)

        # fixed: the original compared encoded_torch with itself
        self.assertTrue(torch.equal(encoded_torch, encoded_torch_param))

        decoded_torch_param = torch.zeros(nq, d, dtype=torch.float32)
        index.sa_decode(encoded_torch, decoded_torch_param)

        self.assertTrue(torch.equal(decoded_torch, decoded_torch_param))

        # np as output parameter
        encoded_np_param = np.zeros((nq, d), dtype=np.uint8)
        index.sa_encode(xq.numpy(), encoded_np_param)

        self.assertTrue(np.array_equal(encoded_torch.numpy(), encoded_np_param))

        decoded_np_param = np.zeros((nq, d), dtype=np.float32)
        index.sa_decode(encoded_np_param, decoded_np_param)

        self.assertTrue(np.array_equal(decoded_np, decoded_np_param))
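
        # Size note (added): for QT_8bit the code is one byte per dimension, so
        # the uint8 buffers above are (nq, d) and index.sa_code_size() == d.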

    def test_non_contiguous(self):
        d = 128
        index = faiss.IndexFlatL2(d)

        xb = torch.rand(d, 100).transpose(0, 1)

        with self.assertRaises(AssertionError):
            index.add(xb)

        # disabled since we now accept non-contiguous arrays
        # with self.assertRaises(ValueError):
        #     index.add(xb.numpy())
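
        # Workaround sketch (added, not part of the original test): a
        # non-contiguous torch tensor can be added via a contiguous copy, e.g.
        #   index.add(xb.contiguous())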


class TestClustering(unittest.TestCase):

    def test_python_kmeans(self):
        """ Test the python implementation of kmeans """
        ds = datasets.SyntheticDataset(32, 10000, 0, 0)
        x = ds.get_train()

        # bad distribution to stress-test split code
        xt = x[:10000].copy()
        xt[:5000] = x[0]

        km_ref = faiss.Kmeans(ds.d, 100, niter=10)
        km_ref.train(xt)
        err = faiss.knn(xt, km_ref.centroids, 1)[0].sum()

        xt_torch = torch.from_numpy(xt)
        data = clustering.DatasetAssign(xt_torch)
        centroids = clustering.kmeans(100, data, 10)
        centroids = centroids.numpy()
        err2 = faiss.knn(xt, centroids, 1)[0].sum()

        # the torch implementation should be close to the reference
        # (typical values: err ~ 33498.332, err2 ~ 33380.477)
        self.assertLess(err2, err * 1.1)


class TestQuantization(unittest.TestCase):
    def test_python_product_quantization(self):
        """ Test the python implementation of product quantization """
        d = 64
        n = 10000
        cs = 4  # number of sub-quantizers for the reference PQ
        nbits = 8
        M = 4  # number of sub-quantizers for the pytorch PQ
        x = np.random.random(size=(n, d)).astype('float32')
        pq = faiss.ProductQuantizer(d, cs, nbits)
        pq.train(x)
        codes = pq.compute_codes(x)
        x2 = pq.decode(codes)
        diff = ((x - x2)**2).sum()
        # vs pure pytorch impl
        xt = torch.from_numpy(x)
        my_pq = quantization.ProductQuantizer(d, M, nbits)
        my_pq.train(xt)
        my_codes = my_pq.encode(xt)
        xt2 = my_pq.decode(my_codes)
        my_diff = ((xt - xt2)**2).sum()
        self.assertLess(abs(diff - my_diff), 100)
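
        # Worked size check (added): each code packs cs sub-indices of nbits
        # bits, i.e. 4 * 8 / 8 = 4 bytes per vector here, so
        # codes.shape == (n, 4).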
374
packages/leann-backend-hnsw/third_party/faiss/tests/torch_test_neural_net.py
vendored
Normal file
@@ -0,0 +1,374 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import torch  # usort: skip
from torch import nn  # usort: skip
import unittest  # usort: skip
import numpy as np  # usort: skip

import faiss  # usort: skip

from faiss.contrib import datasets  # usort: skip
from faiss.contrib.inspect_tools import get_additive_quantizer_codebooks  # usort: skip


class TestLayer(unittest.TestCase):

    @torch.no_grad()
    def test_Embedding(self):
        """ verify that the Faiss Embedding works the same as in Pytorch """
        torch.manual_seed(123)

        emb = nn.Embedding(40, 50)
        idx = torch.randint(40, (25, ))
        ref_batch = emb(idx)

        emb2 = faiss.Embedding(emb)
        idx2 = faiss.Int32Tensor2D(idx[:, None].to(dtype=torch.int32))
        new_batch = emb2(idx2)

        new_batch = new_batch.numpy()
        np.testing.assert_allclose(ref_batch.numpy(), new_batch, atol=2e-6)

    @torch.no_grad()
    def do_test_Linear(self, bias):
        """ verify that the Faiss Linear works the same as in Pytorch """
        torch.manual_seed(123)
        linear = nn.Linear(50, 40, bias=bias)
        x = torch.randn(25, 50)
        ref_y = linear(x)

        linear2 = faiss.Linear(linear)
        x2 = faiss.Tensor2D(x)
        y = linear2(x2)
        np.testing.assert_allclose(ref_y.numpy(), y.numpy(), atol=2e-6)

    def test_Linear(self):
        self.do_test_Linear(True)

    def test_Linear_nobias(self):
        self.do_test_Linear(False)


######################################################
# QINCo Pytorch implementation copied from
# https://github.com/facebookresearch/Qinco/blob/main/model_qinco.py
#
# The implementation is copied here to avoid introducing an additional
# dependency.
######################################################


def pairwise_distances(a, b):
    anorms = (a**2).sum(-1)
    bnorms = (b**2).sum(-1)
    return anorms[:, None] + bnorms - 2 * a @ b.T


def compute_batch_distances(a, b):
    anorms = (a**2).sum(-1)
    bnorms = (b**2).sum(-1)
    return (
        anorms.unsqueeze(-1) + bnorms.unsqueeze(1) - 2 * torch.bmm(a, b.transpose(2, 1))
    )
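
# Added note (not in the original file): both helpers rely on the expansion
# ||a - b||^2 = ||a||^2 + ||b||^2 - 2 <a, b>. A quick sanity check against
# torch.cdist:
#   a, b = torch.randn(5, 8), torch.randn(3, 8)
#   assert torch.allclose(pairwise_distances(a, b), torch.cdist(a, b) ** 2,
#                         atol=1e-4)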


def assign_batch_multiple(x, zqs):
    bs, d = x.shape
    bs, K, d = zqs.shape

    L2distances = compute_batch_distances(x.unsqueeze(1), zqs).squeeze(1)  # [bs x K]
    idx = torch.argmin(L2distances, dim=1).unsqueeze(1)  # [bs x 1]
    quantized = torch.gather(zqs, dim=1, index=idx.unsqueeze(-1).repeat(1, 1, d))
    return idx.squeeze(1), quantized.squeeze(1)


def assign_to_codebook(x, c, bs=16384):
    nq, d = x.shape
    nb, d2 = c.shape
    assert d == d2
    if nq * nb < bs * bs:
        # small enough to represent the whole distance table
        dis = pairwise_distances(x, c)
        return dis.argmin(1)

    # otherwise tile computation to avoid OOM
    res = torch.empty((nq,), dtype=torch.int64, device=x.device)
    cnorms = (c**2).sum(1)
    for i in range(0, nq, bs):
        xnorms = (x[i : i + bs] ** 2).sum(1, keepdim=True)
        for j in range(0, nb, bs):
            dis = xnorms + cnorms[j : j + bs] - 2 * x[i : i + bs] @ c[j : j + bs].T
            dmini, imini = dis.min(1)
            if j == 0:
                dmin = dmini
                imin = imini
            else:
                # keep the running minimum across codebook tiles; imini is
                # relative to the tile, hence the offset by j
                (mask,) = torch.where(dmini < dmin)
                dmin[mask] = dmini[mask]
                imin[mask] = imini[mask] + j
        res[i : i + bs] = imin
    return res
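
# Usage sketch (added; all names are defined above): nearest-centroid
# assignment, equivalent to the untiled path for small inputs
#   x, c = torch.randn(100, 16), torch.randn(32, 16)
#   assert torch.equal(assign_to_codebook(x, c),
#                      pairwise_distances(x, c).argmin(1))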


class QINCoStep(nn.Module):
    """
    One quantization step for QINCo.
    Contains the codebook, concatenation block, and residual blocks
    """

    def __init__(self, d, K, L, h):
        nn.Module.__init__(self)

        self.d, self.K, self.L, self.h = d, K, L, h

        self.codebook = nn.Embedding(K, d)
        self.MLPconcat = nn.Linear(2 * d, d)

        self.residual_blocks = []
        for l in range(L):
            residual_block = nn.Sequential(
                nn.Linear(d, h, bias=False), nn.ReLU(), nn.Linear(h, d, bias=False)
            )
            self.add_module(f"residual_block{l}", residual_block)
            self.residual_blocks.append(residual_block)

    def decode(self, xhat, codes):
        zqs = self.codebook(codes)
        cc = torch.concatenate((zqs, xhat), 1)
        zqs = zqs + self.MLPconcat(cc)

        for residual_block in self.residual_blocks:
            zqs = zqs + residual_block(zqs)

        return zqs

    def encode(self, xhat, x):
        # we are trying out the whole codebook
        zqs = self.codebook.weight
        K, d = zqs.shape
        bs, d = xhat.shape

        # repeat so that they are of size bs * K
        zqs_r = zqs.repeat(bs, 1, 1).reshape(bs * K, d)
        xhat_r = xhat.reshape(bs, 1, d).repeat(1, K, 1).reshape(bs * K, d)

        # pass on batch of size bs * K
        cc = torch.concatenate((zqs_r, xhat_r), 1)
        zqs_r = zqs_r + self.MLPconcat(cc)

        for residual_block in self.residual_blocks:
            zqs_r = zqs_r + residual_block(zqs_r)

        # possible next steps
        zqs_r = zqs_r.reshape(bs, K, d) + xhat.reshape(bs, 1, d)
        codes, xhat_next = assign_batch_multiple(x, zqs_r)

        return codes, xhat_next - xhat
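
# Added note: QINCoStep.encode runs the same network as decode, but on a batch
# of size bs * K that enumerates every codebook entry, then keeps the argmin;
# this is why encode and decode agree on the selected code up to float error.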


class QINCo(nn.Module):
    """
    QINCo quantizer, built from a chain of residual quantization steps
    """

    def __init__(self, d, K, L, M, h):
        nn.Module.__init__(self)

        self.d, self.K, self.L, self.M, self.h = d, K, L, M, h

        self.codebook0 = nn.Embedding(K, d)

        self.steps = []
        for m in range(1, M):
            step = QINCoStep(d, K, L, h)
            self.add_module(f"step{m}", step)
            self.steps.append(step)

    def decode(self, codes):
        xhat = self.codebook0(codes[:, 0])
        for i, step in enumerate(self.steps):
            xhat = xhat + step.decode(xhat, codes[:, i + 1])
        return xhat

    def encode(self, x, code0=None):
        """
        Encode a batch of vectors x to codes of length M.
        If this function is called from IVF-QINCo, codes are 1 index longer,
        due to the first index being the IVF index, and codebook0 is the IVF codebook.
        """
        M = len(self.steps) + 1
        bs, d = x.shape
        codes = torch.zeros(bs, M, dtype=int, device=x.device)

        if code0 is None:
            # at IVF training time, the code0 is fixed (and precomputed)
            code0 = assign_to_codebook(x, self.codebook0.weight)

        codes[:, 0] = code0
        xhat = self.codebook0.weight[code0]

        for i, step in enumerate(self.steps):
            codes[:, i + 1], toadd = step.encode(xhat, x)
            xhat = xhat + toadd

        return codes, xhat
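
# Round-trip sketch (added, not part of the original tests; all names are
# defined above):
#   qinco = QINCo(d=16, K=20, L=2, M=3, h=8)
#   x = torch.randn(10, 16)
#   codes, xhat = qinco.encode(x)   # codes: (10, 3), xhat: the reconstruction
#   assert torch.allclose(qinco.decode(codes), xhat, atol=1e-5)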


######################################################
# QINCo tests
######################################################


def copy_QINCoStep(step):
    step2 = faiss.QINCoStep(step.d, step.K, step.L, step.h)
    step2.codebook.from_torch(step.codebook)
    step2.MLPconcat.from_torch(step.MLPconcat)

    for l in range(step.L):
        src = step.residual_blocks[l]
        dest = step2.get_residual_block(l)
        dest.linear1.from_torch(src[0])
        dest.linear2.from_torch(src[2])
    return step2


class TestQINCoStep(unittest.TestCase):
    @torch.no_grad()
    def test_decode(self):
        torch.manual_seed(123)
        step = QINCoStep(d=16, K=20, L=2, h=8)

        codes = torch.randint(0, 20, (10, ))
        xhat = torch.randn(10, 16)
        ref_decode = step.decode(xhat, codes)

        # step2 = copy_QINCoStep(step)
        step2 = faiss.QINCoStep(step)
        codes2 = faiss.Int32Tensor2D(codes[:, None].to(dtype=torch.int32))

        np.testing.assert_array_equal(
            step.codebook(codes).numpy(),
            step2.codebook(codes2).numpy()
        )

        xhat2 = faiss.Tensor2D(xhat)
        # xhat2 = faiss.Tensor2D(len(codes), step2.d)

        new_decode = step2.decode(xhat2, codes2)

        np.testing.assert_allclose(
            ref_decode.numpy(),
            new_decode.numpy(),
            atol=2e-6
        )

    @torch.no_grad()
    def test_encode(self):
        torch.manual_seed(123)
        step = QINCoStep(d=16, K=20, L=2, h=8)

        # create plausible x for testing starting from actual codes
        codes = torch.randint(0, 20, (10, ))
        xhat = torch.zeros(10, 16)
        x = step.decode(xhat, codes)
        del codes
        ref_codes, toadd = step.encode(xhat, x)

        step2 = copy_QINCoStep(step)
        xhat2 = faiss.Tensor2D(xhat)
        x2 = faiss.Tensor2D(x)
        toadd2 = faiss.Tensor2D(10, 16)

        new_codes = step2.encode(xhat2, x2, toadd2)

        np.testing.assert_allclose(
            ref_codes.numpy(),
            new_codes.numpy().ravel(),
            atol=2e-6
        )
        np.testing.assert_allclose(toadd.numpy(), toadd2.numpy(), atol=2e-6)


class TestQINCo(unittest.TestCase):

    @torch.no_grad()
    def test_decode(self):
        torch.manual_seed(123)
        qinco = QINCo(d=16, K=20, L=2, M=3, h=8)
        codes = torch.randint(0, 20, (10, 3))
        x_ref = qinco.decode(codes)

        qinco2 = faiss.QINCo(qinco)
        codes2 = faiss.Int32Tensor2D(codes.to(dtype=torch.int32))
        x_new = qinco2.decode(codes2)

        np.testing.assert_allclose(x_ref.numpy(), x_new.numpy(), atol=2e-6)

    @torch.no_grad()
    def test_encode(self):
        torch.manual_seed(123)
        qinco = QINCo(d=16, K=20, L=2, M=3, h=8)
        codes = torch.randint(0, 20, (10, 3))
        x = qinco.decode(codes)
        del codes

        ref_codes, _ = qinco.encode(x)

        qinco2 = faiss.QINCo(qinco)
        x2 = faiss.Tensor2D(x)

        new_codes = qinco2.encode(x2)

        np.testing.assert_allclose(ref_codes.numpy(), new_codes.numpy(), atol=2e-6)


######################################################
# Test index
######################################################


class TestIndexQINCo(unittest.TestCase):

    def test_search(self):
        """
        We can't train QINCo with just Faiss, so we train an RQ and reuse its
        codebooks in a QINCo with L = 0 residual blocks
        """
        ds = datasets.SyntheticDataset(32, 1000, 100, 0)

        # prepare reference quantizer
        M = 5
        index_ref = faiss.index_factory(ds.d, "RQ5x4")
        rq = index_ref.rq
        # rq = faiss.ResidualQuantizer(ds.d, M, 4)
        rq.train_type = faiss.ResidualQuantizer.Train_default
        rq.max_beam_size = 1  # beam search not implemented for QINCo (yet)
        index_ref.train(ds.get_train())
        codebooks = get_additive_quantizer_codebooks(rq)

        # convert to QINCo index
        qinco_index = faiss.IndexQINCo(ds.d, M, 4, 0, ds.d)
        qinco = qinco_index.qinco
        qinco.codebook0.from_array(codebooks[0])
        for i in range(1, qinco.M):
            step = qinco.get_step(i - 1)
            step.codebook.from_array(codebooks[i])
            # MLPconcat left at zero -- it's added to the backbone
        qinco_index.is_trained = True

        # verify that the encoding gives the same results
        ref_codes = rq.compute_codes(ds.get_database())
        ref_decoded = rq.decode(ref_codes)
        new_decoded = qinco_index.sa_decode(ref_codes)
        np.testing.assert_allclose(ref_decoded, new_decoded, atol=2e-6)

        new_codes = qinco_index.sa_encode(ds.get_database())
        np.testing.assert_array_equal(ref_codes, new_codes)

        # verify that search gives the same results
        Dref, Iref = index_ref.search(ds.get_queries(), 5)
        Dnew, Inew = qinco_index.search(ds.get_queries(), 5)

        np.testing.assert_array_equal(Iref, Inew)
        np.testing.assert_allclose(Dref, Dnew, atol=2e-6)