Initial commit
packages/leann-backend-hnsw/third_party/faiss/demos/CMakeLists.txt (vendored, normal file, 25 lines)
@@ -0,0 +1,25 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

add_executable(demo_imi_flat EXCLUDE_FROM_ALL demo_imi_flat.cpp)
target_link_libraries(demo_imi_flat PRIVATE faiss)

add_executable(demo_imi_pq EXCLUDE_FROM_ALL demo_imi_pq.cpp)
target_link_libraries(demo_imi_pq PRIVATE faiss)

add_executable(demo_ivfpq_indexing EXCLUDE_FROM_ALL demo_ivfpq_indexing.cpp)
target_link_libraries(demo_ivfpq_indexing PRIVATE faiss)

add_executable(demo_nndescent EXCLUDE_FROM_ALL demo_nndescent.cpp)
target_link_libraries(demo_nndescent PRIVATE faiss)

add_executable(demo_sift1M EXCLUDE_FROM_ALL demo_sift1M.cpp)
target_link_libraries(demo_sift1M PRIVATE faiss)

add_executable(demo_weighted_kmeans EXCLUDE_FROM_ALL demo_weighted_kmeans.cpp)
target_link_libraries(demo_weighted_kmeans PRIVATE faiss)

add_executable(demo_residual_quantizer EXCLUDE_FROM_ALL demo_residual_quantizer.cpp)
target_link_libraries(demo_residual_quantizer PRIVATE faiss)
packages/leann-backend-hnsw/third_party/faiss/demos/README.md (vendored, normal file, 28 lines)
@@ -0,0 +1,28 @@
Demos for a few Faiss functionalities
=====================================


demo_auto_tune.py
-----------------

Demonstrates the auto-tuning functionality of Faiss.


demo_ondisk_ivf.py
------------------

Shows how to construct a Faiss index that stores the inverted file
data on disk, e.g. when it does not fit in RAM. The script works on a
small dataset (sift1M) for demonstration and proceeds in stages:

0: train on the dataset

1-4: build 4 indexes, each containing 1/4 of the dataset. This can be
     done in parallel on several machines

5: merge the 4 indexes into one that is written directly to disk
   (it need not fit in RAM)

6: load and test the index
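Editor's note: the stage numbers in the README above correspond to the single command-line argument of demo_ondisk_ivf.py (added later in this commit). As a minimal driver sketch, assuming the script is located at demos/demo_ondisk_ivf.py and the sift1M/ data directory is present in the working directory:

# Hypothetical driver for demo_ondisk_ivf.py; stage numbers follow the README above.
import subprocess
import sys

SCRIPT = "demos/demo_ondisk_ivf.py"  # assumed location; adjust to your checkout

def run_stage(stage):
    # each stage runs as an independent process, as the README describes
    subprocess.run([sys.executable, SCRIPT, str(stage)], check=True)

run_stage(0)                 # train on the dataset
for stage in (1, 2, 3, 4):   # build 4 partial indexes (could run on 4 machines)
    run_stage(stage)
run_stage(5)                 # merge the 4 indexes directly onto disk
run_stage(6)                 # load the merged index and test it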
packages/leann-backend-hnsw/third_party/faiss/demos/demo_auto_tune.py (vendored, executable file, 169 lines)
@@ -0,0 +1,169 @@
#!/usr/bin/env python2
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

from __future__ import print_function
import os
import time
import numpy as np

try:
    import matplotlib
    matplotlib.use('Agg')
    from matplotlib import pyplot
    graphical_output = True
except ImportError:
    graphical_output = False

import faiss

#################################################################
# Small I/O functions
#################################################################

def ivecs_read(fname):
    a = np.fromfile(fname, dtype="int32")
    d = a[0]
    return a.reshape(-1, d + 1)[:, 1:].copy()

def fvecs_read(fname):
    return ivecs_read(fname).view('float32')


def plot_OperatingPoints(ops, nq, **kwargs):
    ops = ops.optimal_pts
    n = ops.size() * 2 - 1
    pyplot.plot([ops.at( i // 2).perf for i in range(n)],
                [ops.at((i + 1) // 2).t / nq * 1000 for i in range(n)],
                **kwargs)


#################################################################
# prepare common data for all indexes
#################################################################


t0 = time.time()

print("load data")

xt = fvecs_read("sift1M/sift_learn.fvecs")
xb = fvecs_read("sift1M/sift_base.fvecs")
xq = fvecs_read("sift1M/sift_query.fvecs")

d = xt.shape[1]

print("load GT")

gt = ivecs_read("sift1M/sift_groundtruth.ivecs")
gt = gt.astype('int64')
k = gt.shape[1]

print("prepare criterion")

# criterion = 1-recall at 1
crit = faiss.OneRecallAtRCriterion(xq.shape[0], 1)
crit.set_groundtruth(None, gt)
crit.nnn = k

# indexes that are useful when there is no limitation on memory usage
unlimited_mem_keys = [
    "IMI2x10,Flat", "IMI2x11,Flat",
    "IVF4096,Flat", "IVF16384,Flat",
    "PCA64,IMI2x10,Flat"]

# memory limited to 16 bytes / vector
keys_mem_16 = [
    'IMI2x10,PQ16', 'IVF4096,PQ16',
    'IMI2x10,PQ8+8', 'OPQ16_64,IMI2x10,PQ16'
]

# limited to 32 bytes / vector
keys_mem_32 = [
    'IMI2x10,PQ32', 'IVF4096,PQ32', 'IVF16384,PQ32',
    'IMI2x10,PQ16+16',
    'OPQ32,IVF4096,PQ32', 'IVF4096,PQ16+16', 'OPQ16,IMI2x10,PQ16+16'
]

# indexes that can run on the GPU
keys_gpu = [
    "PCA64,IVF4096,Flat",
    "PCA64,Flat", "Flat", "IVF4096,Flat", "IVF16384,Flat",
    "IVF4096,PQ32"]


keys_to_test = unlimited_mem_keys
use_gpu = False


if use_gpu:
    # if this fails, it means that the GPU version was not compiled
    assert faiss.StandardGpuResources, \
        "Faiss was not compiled with GPU support, or loading _swigfaiss_gpu.so failed"
    res = faiss.StandardGpuResources()
    dev_no = 0

# remember results from other index types
op_per_key = []


# keep track of optimal operating points seen so far
op = faiss.OperatingPoints()


for index_key in keys_to_test:

    print("============ key", index_key)

    # make the index described by the key
    index = faiss.index_factory(d, index_key)

    if use_gpu:
        # transfer to GPU (may be partial)
        index = faiss.index_cpu_to_gpu(res, dev_no, index)
        params = faiss.GpuParameterSpace()
    else:
        params = faiss.ParameterSpace()

    params.initialize(index)

    print("[%.3f s] train & add" % (time.time() - t0))

    index.train(xt)
    index.add(xb)

    print("[%.3f s] explore op points" % (time.time() - t0))

    # find operating points for this index
    opi = params.explore(index, xq, crit)

    print("[%.3f s] result operating points:" % (time.time() - t0))
    opi.display()

    # update best operating points so far
    op.merge_with(opi, index_key + " ")

    op_per_key.append((index_key, opi))


if graphical_output:
    # graphical output (to tmp/ subdirectory)

    fig = pyplot.figure(figsize=(12, 9))
    pyplot.xlabel("1-recall at 1")
    pyplot.ylabel("search time (ms/query, %d threads)" % faiss.omp_get_max_threads())
    pyplot.gca().set_yscale('log')
    pyplot.grid()
    for i2, opi2 in op_per_key:
        plot_OperatingPoints(opi2, crit.nq, label = i2, marker = 'o')
    # plot_OperatingPoints(op, crit.nq, label = 'best', marker = 'o', color = 'r')
    pyplot.legend(loc=2)
    fig.savefig('tmp/demo_auto_tune.png')


print("[%.3f s] final result:" % (time.time() - t0))

op.display()
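Editor's note: the ParameterSpace / OneRecallAtRCriterion calls used in demo_auto_tune.py can also be exercised without downloading sift1M. A minimal sketch on synthetic data, with the sizes and the "IVF64,Flat" factory key chosen arbitrarily:

# Minimal auto-tuning sketch on random data (sizes and index key are arbitrary).
import numpy as np
import faiss

d, nb, nq, k = 32, 10000, 100, 10
rs = np.random.RandomState(123)
xb = rs.rand(nb, d).astype('float32')
xq = rs.rand(nq, d).astype('float32')

# brute-force ground truth for the criterion
flat = faiss.IndexFlatL2(d)
flat.add(xb)
_, gt = flat.search(xq, k)

crit = faiss.OneRecallAtRCriterion(nq, 1)
crit.set_groundtruth(None, gt.astype('int64'))
crit.nnn = k

index = faiss.index_factory(d, "IVF64,Flat")
ps = faiss.ParameterSpace()
ps.initialize(index)

index.train(xb)
index.add(xb)

ops = ps.explore(index, xq, crit)   # sweeps the index parameters, records (perf, time) points
ops.display()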
packages/leann-backend-hnsw/third_party/faiss/demos/demo_client_server_ivf.py (vendored, executable file, 91 lines)
@@ -0,0 +1,91 @@
#!/usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import sys
import numpy as np
import faiss

from faiss.contrib.client_server import run_index_server, ClientIndex


#################################################################
# Small I/O functions
#################################################################


def ivecs_read(fname):
    a = np.fromfile(fname, dtype='int32')
    d = a[0]
    return a.reshape(-1, d + 1)[:, 1:].copy()


def fvecs_read(fname):
    return ivecs_read(fname).view('float32')


#################################################################
# Main program
#################################################################

stage = int(sys.argv[1])

tmpdir = '/tmp/'

if stage == 0:
    # train the index
    xt = fvecs_read("sift1M/sift_learn.fvecs")
    index = faiss.index_factory(xt.shape[1], "IVF4096,Flat")
    print("training index")
    index.train(xt)
    print("write " + tmpdir + "trained.index")
    faiss.write_index(index, tmpdir + "trained.index")


if 1 <= stage <= 4:
    # add 1/4 of the database to 4 independent indexes
    bno = stage - 1
    xb = fvecs_read("sift1M/sift_base.fvecs")
    i0, i1 = int(bno * xb.shape[0] / 4), int((bno + 1) * xb.shape[0] / 4)
    index = faiss.read_index(tmpdir + "trained.index")
    print("adding vectors %d:%d" % (i0, i1))
    index.add_with_ids(xb[i0:i1], np.arange(i0, i1))
    print("write " + tmpdir + "block_%d.index" % bno)
    faiss.write_index(index, tmpdir + "block_%d.index" % bno)


machine_ports = [
    ('localhost', 12010),
    ('localhost', 12011),
    ('localhost', 12012),
    ('localhost', 12013),
]
v6 = False

if 5 <= stage <= 8:
    # load an index slice and launch index
    bno = stage - 5

    fname = tmpdir + "block_%d.index" % bno
    print("read " + fname)
    index = faiss.read_index(fname)

    port = machine_ports[bno][1]
    run_index_server(index, port, v6=v6)


if stage == 9:
    client_index = ClientIndex(machine_ports)
    print('index size:', client_index.ntotal)
    client_index.set_nprobe(16)

    # load query vectors and ground-truth
    xq = fvecs_read("sift1M/sift_query.fvecs")
    gt = ivecs_read("sift1M/sift_groundtruth.ivecs")

    D, I = client_index.search(xq, 5)

    recall_at_1 = (I[:, :1] == gt[:, :1]).sum() / float(xq.shape[0])
    print("recall@1: %.3f" % recall_at_1)
packages/leann-backend-hnsw/third_party/faiss/demos/demo_distributed_kmeans_torch.py (vendored, normal file, 173 lines)
@@ -0,0 +1,173 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import numpy as np

import torch
import torch.distributed

import faiss

import faiss.contrib.torch_utils
from faiss.contrib.torch import clustering
from faiss.contrib import datasets


class DatasetAssignDistributedGPU(clustering.DatasetAssign):
    """
    There is one instance per worker, each worker has a dataset shard.
    The non-master workers do not run through the k-means function, so some
    code has to run on them to keep the workers in sync.
    """

    def __init__(self, res, x, rank, nproc):
        clustering.DatasetAssign.__init__(self, x)
        self.res = res
        self.rank = rank
        self.nproc = nproc
        self.device = x.device

        n = len(x)
        sizes = torch.zeros(nproc, device=self.device, dtype=torch.int64)
        sizes[rank] = n
        torch.distributed.all_gather(
            [sizes[i:i + 1] for i in range(nproc)], sizes[rank:rank + 1])
        self.sizes = sizes.cpu().numpy()

        # begin & end of each shard
        self.cs = np.zeros(nproc + 1, dtype='int64')
        self.cs[1:] = np.cumsum(self.sizes)

    def count(self):
        return int(self.sizes.sum())

    def int_to_slaves(self, i):
        " broadcast an int to all workers "
        rank = self.rank
        tab = torch.zeros(1, device=self.device, dtype=torch.int64)
        if rank == 0:
            tab[0] = i
        else:
            assert i is None
        torch.distributed.broadcast(tab, 0)
        return tab.item()

    def get_subset(self, indices):
        rank = self.rank
        assert rank == 0 or indices is None

        len_indices = self.int_to_slaves(len(indices) if rank == 0 else None)

        if rank == 0:
            indices = torch.from_numpy(indices).to(self.device)
        else:
            indices = torch.zeros(
                len_indices, dtype=torch.int64, device=self.device)
        torch.distributed.broadcast(indices, 0)

        # select subset of indices

        i0, i1 = self.cs[rank], self.cs[rank + 1]

        mask = torch.logical_and(indices < i1, indices >= i0)
        output = torch.zeros(
            len_indices, self.x.shape[1],
            dtype=self.x.dtype, device=self.device)
        output[mask] = self.x[indices[mask] - i0]
        torch.distributed.reduce(output, 0)  # sum
        if rank == 0:
            return output
        else:
            return None

    def perform_search(self, centroids):
        assert False, "should not be called"

    def assign_to(self, centroids, weights=None):
        assert weights is None

        rank, nproc = self.rank, self.nproc
        assert rank == 0 or centroids is None
        nc = self.int_to_slaves(len(centroids) if rank == 0 else None)

        if rank != 0:
            centroids = torch.zeros(
                nc, self.x.shape[1], dtype=self.x.dtype, device=self.device)
        torch.distributed.broadcast(centroids, 0)

        # perform search
        D, I = faiss.knn_gpu(
            self.res, self.x, centroids, 1, device=self.device.index)

        I = I.ravel()
        D = D.ravel()

        sum_per_centroid = torch.zeros_like(centroids)
        if weights is None:
            sum_per_centroid.index_add_(0, I, self.x)
        else:
            sum_per_centroid.index_add_(0, I, self.x * weights[:, None])

        torch.distributed.reduce(sum_per_centroid, 0)

        if rank == 0:
            # gather does not support tensors of different sizes
            # should be implemented with point-to-point communication
            assert np.all(self.sizes == self.sizes[0])
            device = self.device
            all_I = torch.zeros(self.count(), dtype=I.dtype, device=device)
            all_D = torch.zeros(self.count(), dtype=D.dtype, device=device)
            torch.distributed.gather(
                I, [all_I[self.cs[r]:self.cs[r + 1]] for r in range(nproc)],
                dst=0,
            )
            torch.distributed.gather(
                D, [all_D[self.cs[r]:self.cs[r + 1]] for r in range(nproc)],
                dst=0,
            )
            return all_I.cpu().numpy(), all_D, sum_per_centroid
        else:
            torch.distributed.gather(I, None, dst=0)
            torch.distributed.gather(D, None, dst=0)
            return None


if __name__ == "__main__":

    torch.distributed.init_process_group(
        backend="nccl",
    )
    rank = torch.distributed.get_rank()
    nproc = torch.distributed.get_world_size()

    # the current version only supports shards of the same size
    ds = datasets.SyntheticDataset(32, 10000, 0, 0, seed=1234 + rank)
    x = ds.get_train()

    device = torch.device(f"cuda:{rank}")

    torch.cuda.set_device(device)
    x = torch.from_numpy(x).to(device)
    res = faiss.StandardGpuResources()

    da = DatasetAssignDistributedGPU(res, x, rank, nproc)

    k = 1000
    niter = 25

    if rank == 0:
        print(f"sizes = {da.sizes}")
        centroids, iteration_stats = clustering.kmeans(
            k, da, niter=niter, return_stats=True)
        print("clusters:", centroids.cpu().numpy())
    else:
        # make sure the iterations are aligned with master
        da.get_subset(None)

        for _ in range(niter):
            da.assign_to(None)

    torch.distributed.barrier()
    print("Done")
packages/leann-backend-hnsw/third_party/faiss/demos/demo_imi_flat.cpp (vendored, normal file, 155 lines)
@@ -0,0 +1,155 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <random>

#include <sys/time.h>

#include <faiss/IndexFlat.h>
#include <faiss/IndexIVFFlat.h>
#include <faiss/IndexPQ.h>

double elapsed() {
    struct timeval tv;
    gettimeofday(&tv, nullptr);
    return tv.tv_sec + tv.tv_usec * 1e-6;
}

int main() {
    double t0 = elapsed();

    // dimension of the vectors to index
    int d = 128;

    // size of the database we plan to index
    size_t nb = 1000 * 1000;

    // make a set of nt training vectors in the unit cube
    // (could be the database)
    size_t nt = 100 * 1000;

    //---------------------------------------------------------------
    // Define the core quantizer
    // We choose a multiple inverted index for faster training with less data
    // and because it usually offers the best accuracy/speed trade-offs
    //
    // We assume here that the lifespan of this coarse quantizer will cover
    // the lifespan of the inverted-file quantizer IndexIVFFlat below
    // With dynamic allocation, one may give the responsibility to free the
    // quantizer to the inverted-file index (with attribute do_delete_quantizer)
    //
    // Note: a regular clustering algorithm would be defined as:
    //       faiss::IndexFlatL2 coarse_quantizer (d);
    //
    // Use nhash=2 subquantizers to define the product coarse quantizer
    // Number of bits: we will have 2^nbits_coarse centroids per subquantizer
    //                 meaning (2^12)^nhash distinct inverted lists
    size_t nhash = 2;
    size_t nbits_subq = int(log2(nb + 1) / 2); // good choice in general
    size_t ncentroids = 1 << (nhash * nbits_subq); // total # of centroids

    faiss::MultiIndexQuantizer coarse_quantizer(d, nhash, nbits_subq);

    printf("IMI (%ld,%ld): %ld virtual centroids (target: %ld base vectors)",
           nhash,
           nbits_subq,
           ncentroids,
           nb);

    // the coarse quantizer should not be dealloced before the index
    // 4 = nb of bytes per code (d must be a multiple of this)
    // 8 = nb of bits per sub-code (almost always 8)
    faiss::MetricType metric = faiss::METRIC_L2; // can be METRIC_INNER_PRODUCT
    faiss::IndexIVFFlat index(&coarse_quantizer, d, ncentroids, metric);
    index.quantizer_trains_alone = true;

    // define the number of probes. 2048 is for high-dim, overkill in practice
    // Use 4-1024 depending on the speed/accuracy trade-off you want
    index.nprobe = 2048;

    std::mt19937 rng;
    std::uniform_real_distribution<> distrib;

    { // training
        printf("[%.3f s] Generating %ld vectors in %dD for training\n",
               elapsed() - t0,
               nt,
               d);

        std::vector<float> trainvecs(nt * d);
        for (size_t i = 0; i < nt * d; i++) {
            trainvecs[i] = distrib(rng);
        }

        printf("[%.3f s] Training the index\n", elapsed() - t0);
        index.verbose = true;
        index.train(nt, trainvecs.data());
    }

    size_t nq;
    std::vector<float> queries;

    { // populating the database
        printf("[%.3f s] Building a dataset of %ld vectors to index\n",
               elapsed() - t0,
               nb);

        std::vector<float> database(nb * d);
        for (size_t i = 0; i < nb * d; i++) {
            database[i] = distrib(rng);
        }

        printf("[%.3f s] Adding the vectors to the index\n", elapsed() - t0);

        index.add(nb, database.data());

        // remember a few elements from the database as queries
        int i0 = 1234;
        int i1 = 1244;

        nq = i1 - i0;
        queries.resize(nq * d);
        for (int i = i0; i < i1; i++) {
            for (int j = 0; j < d; j++) {
                queries[(i - i0) * d + j] = database[i * d + j];
            }
        }
    }

    { // searching the database
        int k = 5;
        printf("[%.3f s] Searching the %d nearest neighbors "
               "of %ld vectors in the index\n",
               elapsed() - t0,
               k,
               nq);

        std::vector<faiss::idx_t> nns(k * nq);
        std::vector<float> dis(k * nq);

        index.search(nq, queries.data(), k, dis.data(), nns.data());

        printf("[%.3f s] Query results (vector ids, then distances):\n",
               elapsed() - t0);

        for (int i = 0; i < nq; i++) {
            printf("query %2d: ", i);
            for (int j = 0; j < k; j++) {
                printf("%7ld ", nns[j + i * k]);
            }
            printf("\n dis: ");
            for (int j = 0; j < k; j++) {
                printf("%7g ", dis[j + i * k]);
            }
            printf("\n");
        }
    }
    return 0;
}
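Editor's note: the IMI-plus-IVFFlat construction that the C++ demo assembles by hand (a MultiIndexQuantizer with nhash=2 sub-quantizers feeding an IndexIVFFlat) can also be obtained from the factory strings used elsewhere in these demos, e.g. "IMI2x10,Flat". A small Python sketch, with dimension and data sizes chosen arbitrarily and a smaller "IMI2x8" key so that a modest training set suffices:

# Factory-string equivalent of the hand-built IMI + IVFFlat index (sizes arbitrary).
import numpy as np
import faiss

d = 128
rs = np.random.RandomState(0)
xt = rs.rand(20000, d).astype('float32')    # training vectors
xb = rs.rand(100000, d).astype('float32')   # database vectors

# "IMI2x8,Flat" = MultiIndexQuantizer with 2 sub-quantizers of 8 bits each
# (2^16 inverted lists), storing uncompressed vectors in the lists;
# the "2" and the "8" play the roles of nhash and nbits_subq in the C++ demo.
index = faiss.index_factory(d, "IMI2x8,Flat")
index.train(xt)
index.add(xb)

index.nprobe = 64                  # number of inverted lists visited per query
D, I = index.search(xb[:5], 5)     # database vectors should find themselves first
print(I[:, 0])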
packages/leann-backend-hnsw/third_party/faiss/demos/demo_imi_pq.cpp (vendored, normal file, 207 lines)
@@ -0,0 +1,207 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <random>

#include <sys/time.h>

#include <faiss/IndexFlat.h>
#include <faiss/IndexIVFPQ.h>
#include <faiss/IndexPQ.h>
#include <faiss/index_io.h>

double elapsed() {
    struct timeval tv;
    gettimeofday(&tv, nullptr);
    return tv.tv_sec + tv.tv_usec * 1e-6;
}

int main() {
    double t0 = elapsed();

    // dimension of the vectors to index
    int d = 64;

    // size of the database we plan to index
    size_t nb = 1000 * 1000;
    size_t add_bs = 10000; // size of the blocks to add

    // make a set of nt training vectors in the unit cube
    // (could be the database)
    size_t nt = 100 * 1000;

    //---------------------------------------------------------------
    // Define the core quantizer
    // We choose a multiple inverted index for faster training with less data
    // and because it usually offers the best accuracy/speed trade-offs
    //
    // We assume here that the lifespan of this coarse quantizer will cover
    // the lifespan of the inverted-file quantizer IndexIVFFlat below
    // With dynamic allocation, one may give the responsibility to free the
    // quantizer to the inverted-file index (with attribute do_delete_quantizer)
    //
    // Note: a regular clustering algorithm would be defined as:
    //       faiss::IndexFlatL2 coarse_quantizer (d);
    //
    // Use nhash=2 subquantizers to define the product coarse quantizer
    // Number of bits: we will have 2^nbits_coarse centroids per subquantizer
    //                 meaning (2^12)^nhash distinct inverted lists
    //
    // The parameter bytes_per_code is determined by the memory
    // constraint, the dataset will use nb * (bytes_per_code + 8)
    // bytes.
    //
    // The parameter nbits_subq is determined by the size of the dataset to
    // index.
    //
    size_t nhash = 2;
    size_t nbits_subq = 9;
    size_t ncentroids = 1 << (nhash * nbits_subq); // total # of centroids
    int bytes_per_code = 16;

    faiss::MultiIndexQuantizer coarse_quantizer(d, nhash, nbits_subq);

    printf("IMI (%ld,%ld): %ld virtual centroids (target: %ld base vectors)",
           nhash,
           nbits_subq,
           ncentroids,
           nb);

    // the coarse quantizer should not be dealloced before the index
    // 4 = nb of bytes per code (d must be a multiple of this)
    // 8 = nb of bits per sub-code (almost always 8)
    faiss::IndexIVFPQ index(
            &coarse_quantizer, d, ncentroids, bytes_per_code, 8);
    index.quantizer_trains_alone = true;

    // define the number of probes. 2048 is for high-dim, overkill in practice
    // Use 4-1024 depending on the speed/accuracy trade-off you want
    index.nprobe = 2048;

    std::mt19937 rng;
    std::uniform_real_distribution<> distrib;

    { // training.

        // The distribution of the training vectors should be the same
        // as the database vectors. It could be a sub-sample of the
        // database vectors, if sampling is not biased. Here we just
        // randomly generate the vectors.

        printf("[%.3f s] Generating %ld vectors in %dD for training\n",
               elapsed() - t0,
               nt,
               d);

        std::vector<float> trainvecs(nt * d);
        for (size_t i = 0; i < nt; i++) {
            for (size_t j = 0; j < d; j++) {
                trainvecs[i * d + j] = distrib(rng);
            }
        }

        printf("[%.3f s] Training the index\n", elapsed() - t0);
        index.verbose = true;
        index.train(nt, trainvecs.data());
    }

    // the index can be re-loaded later with
    // faiss::Index * idx = faiss::read_index("/tmp/trained_index.faissindex");
    faiss::write_index(&index, "/tmp/trained_index.faissindex");

    size_t nq;
    std::vector<float> queries;

    { // populating the database
        printf("[%.3f s] Building a dataset of %ld vectors to index\n",
               elapsed() - t0,
               nb);

        std::vector<float> database(nb * d);
        std::vector<faiss::idx_t> ids(nb);
        for (size_t i = 0; i < nb; i++) {
            for (size_t j = 0; j < d; j++) {
                database[i * d + j] = distrib(rng);
            }
            ids[i] = 8760000000L + i;
        }

        printf("[%.3f s] Adding the vectors to the index\n", elapsed() - t0);

        for (size_t begin = 0; begin < nb; begin += add_bs) {
            size_t end = std::min(begin + add_bs, nb);
            index.add_with_ids(
                    end - begin,
                    database.data() + d * begin,
                    ids.data() + begin);
        }

        // remember a few elements from the database as queries
        int i0 = 1234;
        int i1 = 1244;

        nq = i1 - i0;
        queries.resize(nq * d);
        for (int i = i0; i < i1; i++) {
            for (int j = 0; j < d; j++) {
                queries[(i - i0) * d + j] = database[i * d + j];
            }
        }
    }

    // A few notes on the internal format of the index:
    //
    // - the posting lists for PQ codes are index.codes, which is a
    //   std::vector < std::vector<uint8_t> >
    //   if n is the length of posting list #i, codes[i] has length
    //   bytes_per_code * n
    //
    // - the corresponding ids are stored in index.ids
    //
    // - given a vector float *x, finding which k centroids are
    //   closest to it (i.e. to find the nearest neighbors) can be done with
    //
    //   faiss::idx_t *centroid_ids = new faiss::idx_t[k];
    //   float *distances = new float[k];
    //   index.quantizer->search (1, x, k, distances, centroid_ids);
    //

    faiss::write_index(&index, "/tmp/populated_index.faissindex");

    { // searching the database
        int k = 5;
        printf("[%.3f s] Searching the %d nearest neighbors "
               "of %ld vectors in the index\n",
               elapsed() - t0,
               k,
               nq);

        std::vector<faiss::idx_t> nns(k * nq);
        std::vector<float> dis(k * nq);

        index.search(nq, queries.data(), k, dis.data(), nns.data());

        printf("[%.3f s] Query results (vector ids, then distances):\n",
               elapsed() - t0);

        for (int i = 0; i < nq; i++) {
            printf("query %2d: ", i);
            for (int j = 0; j < k; j++) {
                printf("%7ld ", nns[j + i * k]);
            }
            printf("\n dis: ");
            for (int j = 0; j < k; j++) {
                printf("%7g ", dis[j + i * k]);
            }
            printf("\n");
        }
    }
    return 0;
}
packages/leann-backend-hnsw/third_party/faiss/demos/demo_ivfpq_indexing.cpp (vendored, normal file, 146 lines)
@@ -0,0 +1,146 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <random>

#include <sys/time.h>

#include <faiss/IndexFlat.h>
#include <faiss/IndexIVFPQ.h>
#include <faiss/index_io.h>

double elapsed() {
    struct timeval tv;
    gettimeofday(&tv, nullptr);
    return tv.tv_sec + tv.tv_usec * 1e-6;
}

int main() {
    double t0 = elapsed();

    // dimension of the vectors to index
    int d = 128;

    // size of the database we plan to index
    size_t nb = 200 * 1000;

    // make a set of nt training vectors in the unit cube
    // (could be the database)
    size_t nt = 100 * 1000;

    // make the index object and train it
    faiss::IndexFlatL2 coarse_quantizer(d);

    // a reasonable number of centroids to index nb vectors
    int ncentroids = int(4 * sqrt(nb));

    // the coarse quantizer should not be dealloced before the index
    // 4 = nb of bytes per code (d must be a multiple of this)
    // 8 = nb of bits per sub-code (almost always 8)
    faiss::IndexIVFPQ index(&coarse_quantizer, d, ncentroids, 4, 8);

    std::mt19937 rng;

    { // training
        printf("[%.3f s] Generating %ld vectors in %dD for training\n",
               elapsed() - t0,
               nt,
               d);

        std::vector<float> trainvecs(nt * d);
        std::uniform_real_distribution<> distrib;
        for (size_t i = 0; i < nt * d; i++) {
            trainvecs[i] = distrib(rng);
        }

        printf("[%.3f s] Training the index\n", elapsed() - t0);
        index.verbose = true;

        index.train(nt, trainvecs.data());
    }

    { // I/O demo
        const char* outfilename = "/tmp/index_trained.faissindex";
        printf("[%.3f s] storing the pre-trained index to %s\n",
               elapsed() - t0,
               outfilename);

        write_index(&index, outfilename);
    }

    size_t nq;
    std::vector<float> queries;

    { // populating the database
        printf("[%.3f s] Building a dataset of %ld vectors to index\n",
               elapsed() - t0,
               nb);

        std::vector<float> database(nb * d);
        std::uniform_real_distribution<> distrib;
        for (size_t i = 0; i < nb * d; i++) {
            database[i] = distrib(rng);
        }

        printf("[%.3f s] Adding the vectors to the index\n", elapsed() - t0);

        index.add(nb, database.data());

        printf("[%.3f s] imbalance factor: %g\n",
               elapsed() - t0,
               index.invlists->imbalance_factor());

        // remember a few elements from the database as queries
        int i0 = 1234;
        int i1 = 1243;

        nq = i1 - i0;
        queries.resize(nq * d);
        for (int i = i0; i < i1; i++) {
            for (int j = 0; j < d; j++) {
                queries[(i - i0) * d + j] = database[i * d + j];
            }
        }
    }

    { // searching the database
        int k = 5;
        printf("[%.3f s] Searching the %d nearest neighbors "
               "of %ld vectors in the index\n",
               elapsed() - t0,
               k,
               nq);

        std::vector<faiss::idx_t> nns(k * nq);
        std::vector<float> dis(k * nq);

        index.search(nq, queries.data(), k, dis.data(), nns.data());

        printf("[%.3f s] Query results (vector ids, then distances):\n",
               elapsed() - t0);

        for (int i = 0; i < nq; i++) {
            printf("query %2d: ", i);
            for (int j = 0; j < k; j++) {
                printf("%7ld ", nns[j + i * k]);
            }
            printf("\n dis: ");
            for (int j = 0; j < k; j++) {
                printf("%7g ", dis[j + i * k]);
            }
            printf("\n");
        }

        printf("note that the nearest neighbor is not at "
               "distance 0 due to quantization errors\n");
    }

    return 0;
}
packages/leann-backend-hnsw/third_party/faiss/demos/demo_nndescent.cpp (vendored, normal file, 88 lines)
@@ -0,0 +1,88 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <chrono>
#include <cstdio>
#include <cstdlib>
#include <random>

#include <faiss/IndexFlat.h>
#include <faiss/IndexNNDescent.h>

using namespace std::chrono;

int main(void) {
    // dimension of the vectors to index
    int d = 64;
    int K = 64;

    // size of the database we plan to index
    size_t nb = 10000;

    std::mt19937 rng(12345);

    // make the index object and train it
    faiss::IndexNNDescentFlat index(d, K, faiss::METRIC_L2);
    index.nndescent.S = 10;
    index.nndescent.R = 32;
    index.nndescent.L = K;
    index.nndescent.iter = 10;
    index.verbose = true;

    // generate labels by IndexFlat
    faiss::IndexFlat bruteforce(d, faiss::METRIC_L2);

    std::vector<float> database(nb * d);
    for (size_t i = 0; i < nb * d; i++) {
        database[i] = rng() % 1024;
    }

    { // populating the database
        index.add(nb, database.data());
        bruteforce.add(nb, database.data());
    }

    size_t nq = 1000;

    { // searching the database
        printf("Searching ...\n");
        index.nndescent.search_L = 50;

        std::vector<float> queries(nq * d);
        for (size_t i = 0; i < nq * d; i++) {
            queries[i] = rng() % 1024;
        }

        int k = 5;
        std::vector<faiss::idx_t> nns(k * nq);
        std::vector<faiss::idx_t> gt_nns(k * nq);
        std::vector<float> dis(k * nq);

        auto start = high_resolution_clock::now();
        index.search(nq, queries.data(), k, dis.data(), nns.data());
        auto end = high_resolution_clock::now();

        // find exact kNNs by brute force search
        bruteforce.search(nq, queries.data(), k, dis.data(), gt_nns.data());

        int recalls = 0;
        for (size_t i = 0; i < nq; ++i) {
            for (int n = 0; n < k; n++) {
                for (int m = 0; m < k; m++) {
                    if (nns[i * k + n] == gt_nns[i * k + m]) {
                        recalls += 1;
                    }
                }
            }
        }
        float recall = 1.0f * recalls / (k * nq);
        auto t = duration_cast<microseconds>(end - start).count();
        int qps = nq * 1.0f * 1000 * 1000 / t;

        printf("Recall@%d: %f, QPS: %d\n", k, recall, qps);
    }
}
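Editor's note: the NN-Descent knobs set in the C++ demo (S, R, L, iter, search_L) are also reachable from Python through the SWIG bindings. A small sketch under the assumption that the IndexNNDescentFlat attributes behave as in the C++ code above, with sizes copied from the demo:

# Python counterpart of the NN-Descent demo (parameters copied from the C++ code).
import numpy as np
import faiss

d, K, nb, nq, k = 64, 64, 10000, 1000, 5
rs = np.random.RandomState(12345)
xb = rs.rand(nb, d).astype('float32')
xq = rs.rand(nq, d).astype('float32')

index = faiss.IndexNNDescentFlat(d, K, faiss.METRIC_L2)
# graph-construction parameters, same values as in the C++ demo
index.nndescent.S = 10
index.nndescent.R = 32
index.nndescent.L = K
index.nndescent.iter = 10
index.add(xb)

index.nndescent.search_L = 50     # search-time candidate pool
D, I = index.search(xq, k)

# exact ground truth for a recall estimate
flat = faiss.IndexFlatL2(d)
flat.add(xb)
_, gt = flat.search(xq, k)
recall = np.mean([len(set(I[i]) & set(gt[i])) / k for i in range(nq)])
print("recall@%d: %.3f" % (k, recall))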
packages/leann-backend-hnsw/third_party/faiss/demos/demo_ondisk_ivf.py (vendored, executable file, 86 lines)
@@ -0,0 +1,86 @@
#!/usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import sys
import numpy as np
import faiss
from faiss.contrib.ondisk import merge_ondisk

#################################################################
# Small I/O functions
#################################################################


def ivecs_read(fname):
    a = np.fromfile(fname, dtype='int32')
    d = a[0]
    return a.reshape(-1, d + 1)[:, 1:].copy()


def fvecs_read(fname):
    return ivecs_read(fname).view('float32')


#################################################################
# Main program
#################################################################

stage = int(sys.argv[1])

tmpdir = '/tmp/'

if stage == 0:
    # train the index
    xt = fvecs_read("sift1M/sift_learn.fvecs")
    index = faiss.index_factory(xt.shape[1], "IVF4096,Flat")
    print("training index")
    index.train(xt)
    print("write " + tmpdir + "trained.index")
    faiss.write_index(index, tmpdir + "trained.index")


if 1 <= stage <= 4:
    # add 1/4 of the database to 4 independent indexes
    bno = stage - 1
    xb = fvecs_read("sift1M/sift_base.fvecs")
    i0, i1 = int(bno * xb.shape[0] / 4), int((bno + 1) * xb.shape[0] / 4)
    index = faiss.read_index(tmpdir + "trained.index")
    print("adding vectors %d:%d" % (i0, i1))
    index.add_with_ids(xb[i0:i1], np.arange(i0, i1))
    print("write " + tmpdir + "block_%d.index" % bno)
    faiss.write_index(index, tmpdir + "block_%d.index" % bno)

if stage == 5:

    print('loading trained index')
    # construct the output index
    index = faiss.read_index(tmpdir + "trained.index")

    block_fnames = [
        tmpdir + "block_%d.index" % bno
        for bno in range(4)
    ]

    merge_ondisk(index, block_fnames, tmpdir + "merged_index.ivfdata")

    print("write " + tmpdir + "populated.index")
    faiss.write_index(index, tmpdir + "populated.index")


if stage == 6:
    # perform a search from disk
    print("read " + tmpdir + "populated.index")
    index = faiss.read_index(tmpdir + "populated.index")
    index.nprobe = 16

    # load query vectors and ground-truth
    xq = fvecs_read("sift1M/sift_query.fvecs")
    gt = ivecs_read("sift1M/sift_groundtruth.ivecs")

    D, I = index.search(xq, 5)

    recall_at_1 = (I[:, :1] == gt[:, :1]).sum() / float(xq.shape[0])
    print("recall@1: %.3f" % recall_at_1)
packages/leann-backend-hnsw/third_party/faiss/demos/demo_qinco.py (vendored, normal file, 77 lines)
@@ -0,0 +1,77 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

"""
This demonstrates how to reproduce the QINCo paper results using the Faiss
QINCo implementation. The code loads the reference model because training
is not implemented in Faiss.

Prepare the data with

cd /tmp

# get the reference qinco code
git clone https://github.com/facebookresearch/Qinco.git

# get the data
wget https://dl.fbaipublicfiles.com/QINCo/datasets/bigann/bigann1M.bvecs

# get the model
wget https://dl.fbaipublicfiles.com/QINCo/models/bigann_8x8_L2.pt

"""

import numpy as np
from faiss.contrib.vecs_io import bvecs_mmap
import sys
import time
import torch
import faiss

# make sure pickle deserialization will work
sys.path.append("/tmp/Qinco")
import model_qinco

with torch.no_grad():

    qinco = torch.load("/tmp/bigann_8x8_L2.pt", weights_only=False)
    qinco.eval()
    # print(qinco)
    if True:
        torch.set_num_threads(1)
        faiss.omp_set_num_threads(1)

    x_base = bvecs_mmap("/tmp/bigann1M.bvecs")[:1000].astype('float32')
    x_scaled = torch.from_numpy(x_base) / qinco.db_scale

    t0 = time.time()
    codes, _ = qinco.encode(x_scaled)
    x_decoded_scaled = qinco.decode(codes)
    print(f"Pytorch encode {time.time() - t0:.3f} s")
    # multi-thread: 1.13s, single-thread: 7.744

    x_decoded = x_decoded_scaled.numpy() * qinco.db_scale

    err = ((x_decoded - x_base) ** 2).sum(1).mean()
    print("MSE=", err)  # = 14211.956, near the L=2 result in Fig 4 of the paper

    qinco2 = faiss.QINCo(qinco)
    t0 = time.time()
    codes2 = qinco2.encode(faiss.Tensor2D(x_scaled))
    x_decoded2 = qinco2.decode(codes2).numpy() * qinco.db_scale
    print(f"Faiss encode {time.time() - t0:.3f} s")
    # multi-thread: 3.2s, single-thread: 7.019

    # these tests don't work because there are outlier encodings
    # np.testing.assert_array_equal(codes.numpy(), codes2.numpy())
    # np.testing.assert_allclose(x_decoded, x_decoded2)

    ndiff = (codes.numpy() != codes2.numpy()).sum() / codes.numel()
    assert ndiff < 0.01
    ndiff = (((x_decoded - x_decoded2) ** 2).sum(1) > 1e-5).sum()
    assert ndiff / len(x_base) < 0.01

    err = ((x_decoded2 - x_base) ** 2).sum(1).mean()
    print("MSE=", err)  # = 14213.551
packages/leann-backend-hnsw/third_party/faiss/demos/demo_residual_quantizer.cpp (vendored, normal file, 297 lines)
@@ -0,0 +1,297 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <climits>
#include <cstdio>
#include <memory>

#include <faiss/IVFlib.h>
#include <faiss/IndexAdditiveQuantizer.h>
#include <faiss/IndexIVFAdditiveQuantizer.h>
#include <faiss/MetricType.h>
#include <faiss/utils/distances.h>
#include <faiss/utils/hamming.h>
#include <faiss/utils/random.h>
#include <faiss/utils/utils.h>

/* This demo file shows how to:
 * - use a DistanceComputer to compute distances with encoded vectors
 * - in the context of an IVF, how to split an additive quantizer into an
 *   AdditiveCoarseQuantizer and a ResidualQuantizer, in two different ways,
 *   with and without storing the prefix.
 */

int main() {
    /******************************************
     * Generate a test dataset
     ******************************************/
    using idx_t = faiss::idx_t;
    size_t d = 128;
    size_t nt = 10000;
    size_t nb = 10000;
    size_t nq = 100;
    double t0 = faiss::getmillisecs();

    auto tic = [t0]() {
        printf("[%.3f s] ", (faiss::getmillisecs() - t0) / 1000);
    };

    tic();
    printf("sampling dataset of %zd dim vectors, Q %zd B %zd T %zd\n",
           d,
           nq,
           nb,
           nt);

    std::vector<float> buf(d * (nq + nt + nb));
    faiss::rand_smooth_vectors(nq + nt + nb, d, buf.data(), 1234);
    const float* xt = buf.data();
    const float* xb = buf.data() + nt * d;
    const float* xq = buf.data() + (nt + nb) * d;

    idx_t k = 10;
    std::vector<idx_t> gt(k * nq);
    std::vector<float> unused(k * nq);
    tic();
    printf("compute ground truth, k=%zd\n", k);
    faiss::knn_L2sqr(xq, xb, d, nq, nb, k, unused.data(), gt.data());

    // a function to compute the accuracy
    auto accuracy = [&](const idx_t* I) {
        idx_t accu = 0;
        for (idx_t q = 0; q < nq; q++) {
            accu += faiss::ranklist_intersection_size(
                    k, gt.data() + q * k, k, I + q * k);
        }
        return double(accu) / (k * nq);
    };

    /******************************************
     * Prepare the residual quantizer
     ******************************************/

    faiss::ResidualQuantizer rq(
            d, 7, 6, faiss::AdditiveQuantizer::ST_norm_qint8);
    // do cheap and inaccurate training
    rq.cp.niter = 5;
    rq.max_beam_size = 5;
    rq.train_type = 0;
    tic();
    printf("training the residual quantizer beam_size=%d\n", rq.max_beam_size);
    rq.train(nt, xt);

    tic();
    printf("encoding the database, code_size=%zd\n", rq.code_size);
    size_t code_size = rq.code_size;
    std::vector<uint8_t> raw_codes(nb * code_size);
    rq.compute_codes(xb, raw_codes.data(), nb);

    /****************************************************************
     * Make an index that uses that residual quantizer
     * Verify that a distance computer gives the same distances
     ****************************************************************/
    {
        faiss::IndexResidualQuantizer index(
                rq.d, rq.nbits, faiss::METRIC_L2, rq.search_type);

        // override trained index
        index.rq = rq;
        index.is_trained = true;

        // override vectors
        index.codes = faiss::MaybeOwnedVector<uint8_t>(raw_codes);
        index.ntotal = nb;

        tic();
        printf("IndexResidualQuantizer ready, searching\n");

        std::vector<float> D(k * nq);
        std::vector<idx_t> I(k * nq);
        index.search(nq, xq, k, D.data(), I.data());

        tic();
        printf("Accuracy (intersection @ %zd): %.3f\n", k, accuracy(I.data()));
        std::unique_ptr<faiss::FlatCodesDistanceComputer> dc(
                index.get_FlatCodesDistanceComputer());

        float max_diff12 = 0, max_diff13 = 0;

        for (idx_t q = 0; q < nq; q++) {
            const float* query = xq + q * d;
            dc->set_query(query);
            for (int i = 0; i < k; i++) {
                // 3 ways of computing the same distance

                // distance returned by the index
                float dis1 = D[q * k + i];

                // distance returned by the DistanceComputer that accesses the
                // index
                idx_t db_index = I[q * k + i];
                float dis2 = (*dc)(db_index);

                // distance computer from a code that does not belong to the
                // index
                const uint8_t* code = raw_codes.data() + code_size * db_index;
                float dis3 = dc->distance_to_code(code);

                max_diff12 = std::max(std::abs(dis1 - dis2), max_diff12);
                max_diff13 = std::max(std::abs(dis1 - dis3), max_diff13);
            }
        }
        tic();
        printf("Max DistanceComputer discrepancy 1-2: %g 1-3: %g\n",
               max_diff12,
               max_diff13);
    }

    /****************************************************************
     * Make an IVF index that uses the first 2 levels as a coarse quantizer
     * The IVF codes contain the full code (ie. redundant with the coarse
     * quantizer code)
     ****************************************************************/
    {
        // build a coarse quantizer from the 2 first levels of the RQ
        std::vector<size_t> nbits(2);
        std::copy(rq.nbits.begin(), rq.nbits.begin() + 2, nbits.begin());
        faiss::ResidualCoarseQuantizer rcq(rq.d, nbits);

        // set the coarse quantizer from the 2 first quantizers
        rcq.rq.initialize_from(rq);
        rcq.is_trained = true;
        rcq.ntotal = (idx_t)1 << rcq.rq.tot_bits;

        // settings for exhaustive search in RCQ
        rcq.centroid_norms.resize(rcq.ntotal);
        rcq.aq->compute_centroid_norms(rcq.centroid_norms.data());
        rcq.beam_factor = -1.0; // use exact search
        size_t nlist = rcq.ntotal;
        tic();
        printf("RCQ nlist = %zd tot_bits=%zd\n", nlist, rcq.rq.tot_bits);

        // build a IVFResidualQuantizer from that
        faiss::IndexIVFResidualQuantizer index(
                &rcq, rcq.d, nlist, rq.nbits, faiss::METRIC_L2, rq.search_type);
        index.by_residual = false;
        index.rq = rq;
        index.is_trained = true;

        // there are 3 ways of filling up the index...
        for (std::string filled_with : {"add", "manual", "derived"}) {
            tic();
            printf("filling up the index with %s, code_size=%zd\n",
                   filled_with.c_str(),
                   index.code_size);

            index.reset();

            if (filled_with == "add") {
                // standard add method
                index.add(nb, xb);
            } else if (filled_with == "manual") {
                // compute inverted lists and add elements manually
                // fill in the inverted index manually
                faiss::InvertedLists& invlists = *index.invlists;

                // assign vectors to inverted lists
                std::vector<idx_t> listnos(nb);
                std::vector<float> unused(nb);
                rcq.search(nb, xb, 1, unused.data(), listnos.data());

                // populate inverted lists
                for (idx_t i = 0; i < nb; i++) {
                    invlists.add_entry(
                            listnos[i], i, &raw_codes[i * code_size]);
                }

                index.ntotal = nb;
            } else if (filled_with == "derived") {
                // Since we have the raw codes precomputed, their prefix is the
                // inverted list index, so let's use that.
                faiss::InvertedLists& invlists = *index.invlists;

                // populate inverted lists
                for (idx_t i = 0; i < nb; i++) {
                    const uint8_t* code = &raw_codes[i * code_size];
                    faiss::BitstringReader rd(code, code_size);
                    idx_t list_no =
                            rd.read(rcq.rq.tot_bits); // read the list number
                    invlists.add_entry(list_no, i, code);
                }

                index.ntotal = nb;
            }

            tic();
            printf("Index filled in\n");

            for (int nprobe : {1, 4, 16, 64, int(nlist)}) {
                printf("setting nprobe=%-4d", nprobe);

                index.nprobe = nprobe;
                std::vector<float> D(k * nq);
                std::vector<idx_t> I(k * nq);
                index.search(nq, xq, k, D.data(), I.data());

                tic();
                printf("Accuracy (intersection @ %zd): %.3f\n",
                       k,
                       accuracy(I.data()));
            }
        }
    }

    /****************************************************************
     * Make an IVF index that uses the first 2 levels as a coarse
     * quantizer, but this time does not store the code prefix from the index
     ****************************************************************/

    {
        // build a coarse quantizer from the 2 first levels of the RQ
        int nlevel = 2;

        std::unique_ptr<faiss::IndexIVFResidualQuantizer> index(
                faiss::ivflib::ivf_residual_from_quantizer(rq, nlevel));

        // there are 2 ways of filling up the index...
        for (std::string filled_with : {"add", "derived"}) {
            tic();
            printf("filling up the IVF index with %s, code_size=%zd\n",
                   filled_with.c_str(),
                   index->code_size);

            index->reset();

            if (filled_with == "add") {
                // standard add method
                index->add(nb, xb);
            } else if (filled_with == "derived") {
                faiss::ivflib::ivf_residual_add_from_flat_codes(
                        index.get(), nb, raw_codes.data(), rq.code_size);
            }

            tic();
            printf("Index filled in\n");

            for (int nprobe : {1, 4, 16, 64, int(index->nlist)}) {
                printf("setting nprobe=%-4d", nprobe);

                index->nprobe = nprobe;
                std::vector<float> D(k * nq);
                std::vector<idx_t> I(k * nq);
                index->search(nq, xq, k, D.data(), I.data());

                tic();
                printf("Accuracy (intersection @ %zd): %.3f\n",
                       k,
                       accuracy(I.data()));
            }
        }
    }

    return 0;
}
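Editor's note: the encode-then-search pattern at the start of the residual-quantizer demo has a compact Python counterpart through the standalone-codec interface (sa_encode/sa_decode). A minimal sketch with arbitrary sizes, using IndexResidualQuantizer with its default search type rather than the raw ResidualQuantizer class and the ST_norm_qint8 setting used in the C++ demo:

# Residual-quantizer round trip via the standalone codec API (sizes arbitrary).
import numpy as np
import faiss

d, M, nbits = 128, 7, 6        # 7 codebooks of 2^6 centroids, as in the C++ demo
nt, nb = 10000, 10000

rs = np.random.RandomState(1234)
xt = rs.rand(nt, d).astype('float32')
xb = rs.rand(nb, d).astype('float32')

index = faiss.IndexResidualQuantizer(d, M, nbits, faiss.METRIC_L2)
index.train(xt)

codes = index.sa_encode(xb)            # one compact code per vector
print("code size:", index.sa_code_size(), "bytes")
xrec = index.sa_decode(codes)          # approximate reconstruction
mse = ((xb - xrec) ** 2).sum(1).mean()
print("reconstruction MSE:", mse)

index.add(xb)
D, I = index.search(xb[:5], 5)         # search runs over the stored codes
print(I[:, 0])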
254
packages/leann-backend-hnsw/third_party/faiss/demos/demo_sift1M.cpp
vendored
Normal file
254
packages/leann-backend-hnsw/third_party/faiss/demos/demo_sift1M.cpp
vendored
Normal file
@@ -0,0 +1,254 @@
|
||||
/*
|
||||
* Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
*
|
||||
* This source code is licensed under the MIT license found in the
|
||||
* LICENSE file in the root directory of this source tree.
|
||||
*/
|
||||
|
||||
#include <cassert>
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
|
||||
#include <sys/stat.h>
|
||||
|
||||
#include <sys/time.h>
|
||||
|
||||
#include <faiss/AutoTune.h>
|
||||
#include <faiss/index_factory.h>
|
||||
|
||||
/**
|
||||
* To run this demo, please download the ANN_SIFT1M dataset from
|
||||
*
|
||||
* http://corpus-texmex.irisa.fr/
|
||||
*
|
||||
* and unzip it to the sudirectory sift1M.
|
||||
**/
|
||||
|
||||
/*****************************************************
|
||||
* I/O functions for fvecs and ivecs
|
||||
*****************************************************/
|
||||
|
||||
float* fvecs_read(const char* fname, size_t* d_out, size_t* n_out) {
|
||||
FILE* f = fopen(fname, "r");
|
||||
if (!f) {
|
||||
fprintf(stderr, "could not open %s\n", fname);
|
||||
perror("");
|
||||
abort();
|
||||
}
|
||||
int d;
|
||||
fread(&d, 1, sizeof(int), f);
|
||||
assert((d > 0 && d < 1000000) || !"unreasonable dimension");
|
||||
fseek(f, 0, SEEK_SET);
|
||||
struct stat st;
|
||||
fstat(fileno(f), &st);
|
||||
size_t sz = st.st_size;
|
||||
assert(sz % ((d + 1) * 4) == 0 || !"weird file size");
|
||||
size_t n = sz / ((d + 1) * 4);
|
||||
|
||||
*d_out = d;
|
||||
*n_out = n;
|
||||
float* x = new float[n * (d + 1)];
|
||||
size_t nr __attribute__((unused)) = fread(x, sizeof(float), n * (d + 1), f);
|
||||
assert(nr == n * (d + 1) || !"could not read whole file");
|
||||
|
||||
// shift array to remove row headers
|
||||
for (size_t i = 0; i < n; i++)
|
||||
memmove(x + i * d, x + 1 + i * (d + 1), d * sizeof(*x));
|
||||
|
||||
fclose(f);
|
||||
return x;
|
||||
}
|
||||
|
||||
// not very clean, but works as long as sizeof(int) == sizeof(float)
|
||||
int* ivecs_read(const char* fname, size_t* d_out, size_t* n_out) {
|
||||
return (int*)fvecs_read(fname, d_out, n_out);
|
||||
}
|
||||
|
||||
double elapsed() {
|
||||
struct timeval tv;
|
||||
gettimeofday(&tv, nullptr);
|
||||
return tv.tv_sec + tv.tv_usec * 1e-6;
|
||||
}
|
||||
|
||||
int main() {
|
||||
double t0 = elapsed();
|
||||
|
||||
// this is typically the fastest one.
|
||||
const char* index_key = "IVF4096,Flat";
|
||||
|
||||
// these ones have better memory usage
|
||||
// const char *index_key = "Flat";
|
||||
// const char *index_key = "PQ32";
|
||||
// const char *index_key = "PCA80,Flat";
|
||||
// const char *index_key = "IVF4096,PQ8+16";
|
||||
// const char *index_key = "IVF4096,PQ32";
|
||||
// const char *index_key = "IMI2x8,PQ32";
|
||||
// const char *index_key = "IMI2x8,PQ8+16";
|
||||
// const char *index_key = "OPQ16_64,IMI2x8,PQ8+16";
|
||||
|
||||
faiss::Index* index;
|
||||
|
||||
size_t d;
|
||||
|
||||
{
|
||||
printf("[%.3f s] Loading train set\n", elapsed() - t0);
|
||||
|
||||
size_t nt;
|
||||
float* xt = fvecs_read("sift1M/sift_learn.fvecs", &d, &nt);
|
||||
|
||||
printf("[%.3f s] Preparing index \"%s\" d=%ld\n",
|
||||
elapsed() - t0,
|
||||
index_key,
|
||||
d);
|
||||
index = faiss::index_factory(d, index_key);
|
||||
|
||||
printf("[%.3f s] Training on %ld vectors\n", elapsed() - t0, nt);
|
||||
|
||||
index->train(nt, xt);
|
||||
delete[] xt;
|
||||
}
|
||||
|
||||
{
|
||||
printf("[%.3f s] Loading database\n", elapsed() - t0);
|
||||
|
||||
size_t nb, d2;
|
||||
float* xb = fvecs_read("sift1M/sift_base.fvecs", &d2, &nb);
|
||||
assert(d == d2 || !"dataset does not have same dimension as train set");
|
||||
|
||||
printf("[%.3f s] Indexing database, size %ld*%ld\n",
|
||||
elapsed() - t0,
|
||||
nb,
|
||||
d);
|
||||
|
||||
index->add(nb, xb);
|
||||
|
||||
delete[] xb;
|
||||
}
|
||||
|
||||
size_t nq;
|
||||
float* xq;
|
||||
|
||||
{
|
||||
printf("[%.3f s] Loading queries\n", elapsed() - t0);
|
||||
|
||||
size_t d2;
|
||||
xq = fvecs_read("sift1M/sift_query.fvecs", &d2, &nq);
|
||||
assert(d == d2 || !"query does not have same dimension as train set");
|
||||
}
|
||||
|
||||
size_t k; // nb of results per query in the GT
|
||||
faiss::idx_t* gt; // nq * k matrix of ground-truth nearest-neighbors
|
||||
|
||||
{
|
||||
printf("[%.3f s] Loading ground truth for %ld queries\n",
|
||||
elapsed() - t0,
|
||||
nq);
|
||||
|
||||
// load ground-truth and convert int to long
|
||||
size_t nq2;
|
||||
int* gt_int = ivecs_read("sift1M/sift_groundtruth.ivecs", &k, &nq2);
|
||||
assert(nq2 == nq || !"incorrect nb of ground truth entries");
|
||||
|
||||
gt = new faiss::idx_t[k * nq];
|
||||
for (int i = 0; i < k * nq; i++) {
|
||||
gt[i] = gt_int[i];
|
||||
}
|
||||
delete[] gt_int;
|
||||
}
|
||||
|
||||
// Result of the auto-tuning
|
||||
std::string selected_params;
|
||||
|
||||
{ // run auto-tuning
|
||||
|
||||
printf("[%.3f s] Preparing auto-tune criterion 1-recall at 1 "
|
||||
"criterion, with k=%ld nq=%ld\n",
|
||||
elapsed() - t0,
|
||||
k,
|
||||
nq);
|
||||
|
||||
faiss::OneRecallAtRCriterion crit(nq, 1);
|
||||
crit.set_groundtruth(k, nullptr, gt);
|
||||
crit.nnn = k; // by default, the criterion will request only 1 NN
|
||||
|
||||
printf("[%.3f s] Preparing auto-tune parameters\n", elapsed() - t0);
|
||||
|
||||
faiss::ParameterSpace params;
|
||||
params.initialize(index);
|
||||
|
||||
printf("[%.3f s] Auto-tuning over %ld parameters (%ld combinations)\n",
|
||||
elapsed() - t0,
|
||||
params.parameter_ranges.size(),
|
||||
params.n_combinations());
|
||||
|
||||
faiss::OperatingPoints ops;
|
||||
params.explore(index, nq, xq, crit, &ops);
|
||||
|
||||
printf("[%.3f s] Found the following operating points: \n",
|
||||
elapsed() - t0);
|
||||
|
||||
ops.display();
|
||||
|
||||
// keep the first parameter that obtains > 0.5 1-recall@1
|
||||
for (int i = 0; i < ops.optimal_pts.size(); i++) {
|
||||
if (ops.optimal_pts[i].perf > 0.5) {
|
||||
selected_params = ops.optimal_pts[i].key;
|
||||
break;
|
||||
}
|
||||
}
|
||||
assert(!selected_params.empty() ||
|
||||
!"could not find good enough op point");
|
||||
}
|
||||
|
||||
{ // Use the found configuration to perform a search
|
||||
|
||||
faiss::ParameterSpace params;
|
||||
|
||||
printf("[%.3f s] Setting parameter configuration \"%s\" on index\n",
|
||||
elapsed() - t0,
|
||||
selected_params.c_str());
|
||||
|
||||
params.set_index_parameters(index, selected_params.c_str());
|
||||
|
||||
printf("[%.3f s] Perform a search on %ld queries\n",
|
||||
elapsed() - t0,
|
||||
nq);
|
||||
|
||||
// output buffers
|
||||
faiss::idx_t* I = new faiss::idx_t[nq * k];
|
||||
float* D = new float[nq * k];
|
||||
|
||||
index->search(nq, xq, k, D, I);
|
||||
|
||||
printf("[%.3f s] Compute recalls\n", elapsed() - t0);
|
||||
|
||||
// evaluate result by hand.
|
||||
int n_1 = 0, n_10 = 0, n_100 = 0;
|
||||
for (int i = 0; i < nq; i++) {
|
||||
int gt_nn = gt[i * k];
|
||||
for (int j = 0; j < k; j++) {
|
||||
if (I[i * k + j] == gt_nn) {
|
||||
if (j < 1)
|
||||
n_1++;
|
||||
if (j < 10)
|
||||
n_10++;
|
||||
if (j < 100)
|
||||
n_100++;
|
||||
}
|
||||
}
|
||||
}
|
||||
printf("R@1 = %.4f\n", n_1 / float(nq));
|
||||
printf("R@10 = %.4f\n", n_10 / float(nq));
|
||||
printf("R@100 = %.4f\n", n_100 / float(nq));
|
||||
|
||||
delete[] I;
|
||||
delete[] D;
|
||||
}
|
||||
|
||||
delete[] xq;
|
||||
delete[] gt;
|
||||
delete index;
|
||||
return 0;
|
||||
}
|
||||
181
packages/leann-backend-hnsw/third_party/faiss/demos/demo_weighted_kmeans.cpp
vendored
Normal file
@@ -0,0 +1,181 @@
|
||||
/*
|
||||
* Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
*
|
||||
* This source code is licensed under the MIT license found in the
|
||||
* LICENSE file in the root directory of this source tree.
|
||||
*/
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <memory>
|
||||
|
||||
#include <faiss/Clustering.h>
|
||||
#include <faiss/IndexFlat.h>
|
||||
#include <faiss/IndexHNSW.h>
|
||||
#include <faiss/utils/distances.h>
|
||||
#include <faiss/utils/random.h>
|
||||
|
||||
namespace {
|
||||
|
||||
enum WeightedKMeansType {
|
||||
WKMT_FlatL2,
|
||||
WKMT_FlatIP,
|
||||
WKMT_FlatIP_spherical,
|
||||
WKMT_HNSW,
|
||||
};
|
||||
|
||||
float weighted_kmeans_clustering(
|
||||
size_t d,
|
||||
size_t n,
|
||||
size_t k,
|
||||
const float* input,
|
||||
const float* weights,
|
||||
float* centroids,
|
||||
WeightedKMeansType index_num) {
|
||||
using namespace faiss;
|
||||
Clustering clus(d, k);
|
||||
clus.verbose = true;
|
||||
|
||||
std::unique_ptr<Index> index;
|
||||
|
||||
switch (index_num) {
|
||||
case WKMT_FlatL2:
|
||||
index = std::make_unique<IndexFlatL2>(d);
|
||||
break;
|
||||
case WKMT_FlatIP:
|
||||
index = std::make_unique<IndexFlatIP>(d);
|
||||
break;
|
||||
case WKMT_FlatIP_spherical:
|
||||
index = std::make_unique<IndexFlatIP>(d);
|
||||
clus.spherical = true;
|
||||
break;
|
||||
case WKMT_HNSW:
|
||||
IndexHNSWFlat* ihnsw = new IndexHNSWFlat(d, 32);
|
||||
ihnsw->hnsw.efSearch = 128;
|
||||
index.reset(ihnsw);
|
||||
break;
|
||||
}
|
||||
|
||||
clus.train(n, input, *index.get(), weights);
|
||||
// on output the index contains the centroids.
|
||||
memcpy(centroids, clus.centroids.data(), sizeof(*centroids) * d * k);
|
||||
return clus.iteration_stats.back().obj;
|
||||
}
|
||||
|
||||
int d = 32;
|
||||
float sigma = 0.1;
|
||||
|
||||
#define BIGTEST
|
||||
|
||||
#ifdef BIGTEST
|
||||
// the production setup = setting of https://fb.quip.com/CWgnAAYbwtgs
|
||||
int nc = 200000;
|
||||
int n_big = 4;
|
||||
int n_small = 2;
|
||||
#else
|
||||
int nc = 5;
|
||||
int n_big = 100;
|
||||
int n_small = 10;
|
||||
#endif
|
||||
|
||||
int n; // number of training points
|
||||
|
||||
void generate_trainset(
|
||||
std::vector<float>& ccent,
|
||||
std::vector<float>& x,
|
||||
std::vector<float>& weights) {
|
||||
// same sampling as test_build_blocks.py test_weighted
|
||||
|
||||
ccent.resize(d * 2 * nc);
|
||||
faiss::float_randn(ccent.data(), d * 2 * nc, 123);
|
||||
faiss::fvec_renorm_L2(d, 2 * nc, ccent.data());
|
||||
n = nc * n_big + nc * n_small;
|
||||
x.resize(d * n);
|
||||
weights.resize(n);
|
||||
faiss::float_randn(x.data(), x.size(), 1234);
|
||||
|
||||
float* xi = x.data();
|
||||
float* w = weights.data();
|
||||
for (int ci = 0; ci < nc * 2; ci++) { // loop over centroids
|
||||
int np = ci < nc ? n_big : n_small; // nb of points around this centroid
|
||||
for (int i = 0; i < np; i++) {
|
||||
for (int j = 0; j < d; j++) {
|
||||
xi[j] = xi[j] * sigma + ccent[ci * d + j];
|
||||
}
|
||||
*w++ = ci < nc ? 0.1 : 10;
|
||||
xi += d;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
std::vector<float> ccent;
|
||||
std::vector<float> x;
|
||||
std::vector<float> weights;
|
||||
|
||||
printf("generate training set\n");
|
||||
generate_trainset(ccent, x, weights);
|
||||
|
||||
std::vector<float> centroids;
|
||||
centroids.resize(nc * d);
|
||||
|
||||
int the_index_num = -1;
|
||||
int the_with_weights = -1;
|
||||
|
||||
if (argc == 3) {
|
||||
the_index_num = atoi(argv[1]);
|
||||
the_with_weights = atoi(argv[2]);
|
||||
}
|
||||
|
||||
for (int index_num = WKMT_FlatL2; index_num <= WKMT_HNSW; index_num++) {
|
||||
if (the_index_num >= 0 && index_num != the_index_num) {
|
||||
continue;
|
||||
}
|
||||
|
||||
for (int with_weights = 0; with_weights <= 1; with_weights++) {
|
||||
if (the_with_weights >= 0 && with_weights != the_with_weights) {
|
||||
continue;
|
||||
}
|
||||
|
||||
printf("=================== index_num=%d Run %s weights\n",
|
||||
index_num,
|
||||
with_weights ? "with" : "without");
|
||||
|
||||
weighted_kmeans_clustering(
|
||||
d,
|
||||
n,
|
||||
nc,
|
||||
x.data(),
|
||||
with_weights ? weights.data() : nullptr,
|
||||
centroids.data(),
|
||||
(WeightedKMeansType)index_num);
|
||||
|
||||
{ // compute distance of points to centroids
|
||||
faiss::IndexFlatL2 cent_index(d);
|
||||
cent_index.add(nc, centroids.data());
|
||||
std::vector<float> dis(n);
|
||||
std::vector<faiss::idx_t> idx(n);
|
||||
|
||||
cent_index.search(
|
||||
nc * 2, ccent.data(), 1, dis.data(), idx.data());
|
||||
|
||||
float dis1 = 0, dis2 = 0;
|
||||
for (int i = 0; i < nc; i++) {
|
||||
dis1 += dis[i];
|
||||
}
|
||||
printf("average distance of points from big clusters: %g\n",
|
||||
dis1 / nc);
|
||||
|
||||
for (int i = 0; i < nc; i++) {
|
||||
dis2 += dis[i + nc];
|
||||
}
|
||||
|
||||
printf("average distance of points from small clusters: %g\n",
|
||||
dis2 / nc);
|
||||
}
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
303
packages/leann-backend-hnsw/third_party/faiss/demos/index_pq_flat_separate_codes_from_codebook.py
vendored
Normal file
@@ -0,0 +1,303 @@
|
||||
#!/usr/bin/env -S grimaldi --kernel bento_kernel_faiss
|
||||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
#
|
||||
# This source code is licensed under the MIT license found in the
|
||||
# LICENSE file in the root directory of this source tree.
|
||||
# fmt: off
|
||||
# flake8: noqa
|
||||
|
||||
|
||||
""":md
|
||||
# Serializing codes separately, with IndexLSH and IndexPQ
|
||||
|
||||
Let's say, for example, you have a few vector embeddings per user
|
||||
and want to shard a flat index by user so you can re-use the same LSH or PQ method
|
||||
for all users but store each user's codes independently.
|
||||
|
||||
|
||||
"""
|
||||
|
||||
""":py"""
|
||||
import faiss
|
||||
import numpy as np
|
||||
|
||||
""":py"""
|
||||
d = 768
|
||||
n = 1_000
|
||||
ids = np.arange(n).astype('int64')
|
||||
training_data = np.random.rand(n, d).astype('float32')
|
||||
|
||||
""":py"""
|
||||
def read_ids_codes():
|
||||
try:
|
||||
return np.load("/tmp/ids.npy"), np.load("/tmp/codes.npy")
|
||||
except FileNotFoundError:
|
||||
return None, None
|
||||
|
||||
|
||||
def write_ids_codes(ids, codes):
|
||||
np.save("/tmp/ids.npy", ids)
|
||||
np.save("/tmp/codes.npy", codes.reshape(len(ids), -1))
|
||||
|
||||
|
||||
def write_template_index(template_index):
|
||||
faiss.write_index(template_index, "/tmp/template.index")
|
||||
|
||||
|
||||
def read_template_index_instance():
|
||||
return faiss.read_index("/tmp/template.index")
|
||||
|
||||
""":md
|
||||
## IndexLSH: separate codes
|
||||
|
||||
The first half of this notebook demonstrates how to store LSH codes. Unlike PQ, LSH does not require training. In fact, its compression method, a random projection matrix, is deterministic on construction based on a random seed value that's [hardcoded](https://github.com/facebookresearch/faiss/blob/2c961cc308ade8a85b3aa10a550728ce3387f625/faiss/IndexLSH.cpp#L35).
|
||||
"""
|
||||
|
||||
""":py"""
|
||||
nbits = 1536
|
||||
|
||||
""":py"""
|
||||
# demonstrating encoding is deterministic
|
||||
|
||||
codes = []
|
||||
database_vector_float32 = np.random.rand(1, d).astype(np.float32)
|
||||
for i in range(10):
|
||||
index = faiss.IndexIDMap2(faiss.IndexLSH(d, nbits))
|
||||
code = index.index.sa_encode(database_vector_float32)
|
||||
codes.append(code)
|
||||
|
||||
for i in range(1, 10):
|
||||
assert np.array_equal(codes[0], codes[i])
|
||||
|
||||
""":py"""
|
||||
# new database vector
|
||||
|
||||
ids, codes = read_ids_codes()
|
||||
database_vector_id, database_vector_float32 = max(ids) + 1 if ids is not None else 1, np.random.rand(1, d).astype(np.float32)
|
||||
index = faiss.IndexIDMap2(faiss.IndexLSH(d, nbits))
|
||||
|
||||
code = index.index.sa_encode(database_vector_float32)
|
||||
|
||||
if ids is not None and codes is not None:
|
||||
ids = np.concatenate((ids, [database_vector_id]))
|
||||
codes = np.vstack((codes, code))
|
||||
else:
|
||||
ids = np.array([database_vector_id])
|
||||
codes = np.array([code])
|
||||
|
||||
write_ids_codes(ids, codes)
|
||||
|
||||
""":py '2840581589434841'"""
|
||||
# then at query time
|
||||
|
||||
query_vector_float32 = np.random.rand(1, d).astype(np.float32)
|
||||
index = faiss.IndexIDMap2(faiss.IndexLSH(d, nbits))
|
||||
ids, codes = read_ids_codes()
|
||||
|
||||
index.add_sa_codes(codes, ids)
|
||||
|
||||
index.search(query_vector_float32, k=5)
|
||||
|
||||
""":py"""
|
||||
!rm /tmp/ids.npy /tmp/codes.npy
|
||||
|
||||
""":md
|
||||
## IndexPQ: separate codes from codebook
|
||||
|
||||
The second half of this notebook demonstrates how to separate serializing and deserializing the PQ codebook
|
||||
(via faiss.write_index for IndexPQ) independently of the vector codes. For example, in the case
|
||||
where you have a few vector embeddings per user and want to shard the flat index by user, you
|
||||
can re-use the same PQ method for all users but store each user's codes independently.
|
||||
|
||||
"""
|
||||
|
||||
""":py"""
|
||||
M = d//8
|
||||
nbits = 8
|
||||
|
||||
""":py"""
|
||||
# at train time
|
||||
template_index = faiss.index_factory(d, f"IDMap2,PQ{M}x{nbits}")
|
||||
template_index.train(training_data)
|
||||
write_template_index(template_index)
|
||||
|
||||
""":py"""
|
||||
# New database vector
|
||||
|
||||
index = read_template_index_instance()
|
||||
ids, codes = read_ids_codes()
|
||||
database_vector_id, database_vector_float32 = max(ids) + 1 if ids is not None else 1, np.random.rand(1, d).astype(np.float32)
|
||||
|
||||
code = index.index.sa_encode(database_vector_float32)
|
||||
|
||||
if ids is not None and codes is not None:
|
||||
ids = np.concatenate((ids, [database_vector_id]))
|
||||
codes = np.vstack((codes, code))
|
||||
else:
|
||||
ids = np.array([database_vector_id])
|
||||
codes = np.array([code])
|
||||
|
||||
write_ids_codes(ids, codes)
|
||||
|
||||
""":py '1858280061369209'"""
|
||||
# then at query time
|
||||
query_vector_float32 = np.random.rand(1, d).astype(np.float32)
|
||||
id_wrapper_index = read_template_index_instance()
|
||||
ids, codes = read_ids_codes()
|
||||
|
||||
id_wrapper_index.add_sa_codes(codes, ids)
|
||||
|
||||
id_wrapper_index.search(query_vector_float32, k=5)
|
||||
|
||||
""":py"""
|
||||
!rm /tmp/ids.npy /tmp/codes.npy /tmp/template.index
|
||||
|
||||
""":md
|
||||
## Comparing these methods
|
||||
|
||||
- methods: Flat, LSH, PQ
|
||||
- vary cost: nbits, M for 1x, 2x, 4x, 8x, 16x, 32x compression
|
||||
- measure: recall@1
|
||||
|
||||
We don't measure latency as the number of vectors per user shard is insignificant.
|
||||
|
||||
"""
|
||||
|
||||
""":py '2898032417027201'"""
|
||||
n, d
|
||||
|
||||
""":py"""
|
||||
database_vector_ids, database_vector_float32s = np.arange(n), np.random.rand(n, d).astype(np.float32)
|
||||
query_vector_float32s = np.random.rand(n, d).astype(np.float32)
|
||||
|
||||
""":py"""
|
||||
index = faiss.index_factory(d, "IDMap2,Flat")
|
||||
index.add_with_ids(database_vector_float32s, database_vector_ids)
|
||||
_, ground_truth_result_ids = index.search(query_vector_float32s, k=1)
|
||||
|
||||
""":py '857475336204238'"""
|
||||
from dataclasses import dataclass
|
||||
|
||||
pq_m_nbits = (
|
||||
# 96 bytes
|
||||
(96, 8),
|
||||
(192, 4),
|
||||
# 192 bytes
|
||||
(192, 8),
|
||||
(384, 4),
|
||||
# 384 bytes
|
||||
(384, 8),
|
||||
(768, 4),
|
||||
)
|
||||
lsh_nbits = (768, 1536, 3072, 6144, 12288, 24576)
|
||||
|
||||
|
||||
@dataclass
|
||||
class Record:
|
||||
type_: str
|
||||
index: faiss.Index
|
||||
args: tuple
|
||||
recall: float
|
||||
|
||||
|
||||
results = []
|
||||
|
||||
for m, nbits in pq_m_nbits:
|
||||
print("pq", m, nbits)
|
||||
index = faiss.index_factory(d, f"IDMap2,PQ{m}x{nbits}")
|
||||
index.train(training_data)
|
||||
index.add_with_ids(database_vector_float32s, database_vector_ids)
|
||||
_, result_ids = index.search(query_vector_float32s, k=1)
|
||||
recall = sum(result_ids == ground_truth_result_ids)
|
||||
results.append(Record("pq", index, (m, nbits), recall))
|
||||
|
||||
for nbits in lsh_nbits:
|
||||
print("lsh", nbits)
|
||||
index = faiss.IndexIDMap2(faiss.IndexLSH(d, nbits))
|
||||
index.add_with_ids(database_vector_float32s, database_vector_ids)
|
||||
_, result_ids = index.search(query_vector_float32s, k=1)
|
||||
recall = sum(result_ids == ground_truth_result_ids)
|
||||
results.append(Record("lsh", index, (nbits,), recall))
|
||||
|
||||
""":py '556918346720794'"""
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
|
||||
def create_grouped_bar_chart(x_values, y_values_list, labels_list, xlabel, ylabel, title):
|
||||
num_bars_per_group = len(x_values)
|
||||
|
||||
plt.figure(figsize=(12, 6))
|
||||
|
||||
for x, y_values, labels in zip(x_values, y_values_list, labels_list):
|
||||
num_bars = len(y_values)
|
||||
bar_width = 0.08 * x
|
||||
bar_positions = np.arange(num_bars) * bar_width - (num_bars - 1) * bar_width / 2 + x
|
||||
|
||||
bars = plt.bar(bar_positions, y_values, width=bar_width)
|
||||
|
||||
for bar, label in zip(bars, labels):
|
||||
height = bar.get_height()
|
||||
plt.annotate(
|
||||
label,
|
||||
xy=(bar.get_x() + bar.get_width() / 2, height),
|
||||
xytext=(0, 3),
|
||||
textcoords="offset points",
|
||||
ha='center', va='bottom'
|
||||
)
|
||||
|
||||
plt.xscale('log')
|
||||
plt.xlabel(xlabel)
|
||||
plt.ylabel(ylabel)
|
||||
plt.title(title)
|
||||
plt.xticks(x_values, labels=[str(x) for x in x_values])
|
||||
plt.tight_layout()
|
||||
plt.show()
|
||||
|
||||
# # Example usage:
|
||||
# x_values = [1, 2, 4, 8, 16, 32]
|
||||
# y_values_list = [
|
||||
# [2.5, 3.6, 1.8],
|
||||
# [3.0, 2.8],
|
||||
# [2.5, 3.5, 4.0, 1.0],
|
||||
# [4.2],
|
||||
# [3.0, 5.5, 2.2],
|
||||
# [6.0, 4.5]
|
||||
# ]
|
||||
# labels_list = [
|
||||
# ['A1', 'B1', 'C1'],
|
||||
# ['A2', 'B2'],
|
||||
# ['A3', 'B3', 'C3', 'D3'],
|
||||
# ['A4'],
|
||||
# ['A5', 'B5', 'C5'],
|
||||
# ['A6', 'B6']
|
||||
# ]
|
||||
|
||||
# create_grouped_bar_chart(x_values, y_values_list, labels_list, "x axis", "y axis", "title")
|
||||
|
||||
""":py '1630106834206134'"""
|
||||
# x-axis: compression ratio
|
||||
# y-axis: recall@1
|
||||
|
||||
from collections import defaultdict
|
||||
|
||||
x = defaultdict(list)
|
||||
x[1].append(("flat", 1.00))
|
||||
for r in results:
|
||||
y_value = r.recall[0] / n
|
||||
x_value = int(d * 4 / r.index.sa_code_size())
|
||||
label = None
|
||||
if r.type_ == "pq":
|
||||
label = f"PQ{r.args[0]}x{r.args[1]}"
|
||||
if r.type_ == "lsh":
|
||||
label = f"LSH{r.args[0]}"
|
||||
x[x_value].append((label, y_value))
|
||||
|
||||
x_values = sorted(list(x.keys()))
|
||||
create_grouped_bar_chart(
|
||||
x_values,
|
||||
[[e[1] for e in x[x_value]] for x_value in x_values],
|
||||
[[e[0] for e in x[x_value]] for x_value in x_values],
|
||||
"compression ratio",
|
||||
"recall@1 q=1,000 queries",
|
||||
"recall@1 for a database of n=1,000 d=768 vectors",
|
||||
)
|
||||
52
packages/leann-backend-hnsw/third_party/faiss/demos/offline_ivf/README.md
vendored
Normal file
@@ -0,0 +1,52 @@
|
||||
|
||||
# Offline IVF
|
||||
|
||||
This folder contains the code for the offline IVF algorithm, powered by Faiss big batch search.
|
||||
|
||||
Create a conda env:
|
||||
|
||||
`conda create --name oivf python=3.10`
|
||||
|
||||
`conda activate oivf`
|
||||
|
||||
`conda install -c pytorch/label/nightly -c nvidia faiss-gpu=1.7.4`
|
||||
|
||||
`conda install tqdm`
|
||||
|
||||
`conda install pyyaml`
|
||||
|
||||
`conda install -c conda-forge submitit`
|
||||
|
||||
|
||||
## Run book
|
||||
|
||||
1. Optionally shard your dataset (see `create_sharded_ssnpp_files.py`) and create the corresponding yaml file `config_ssnpp.yaml`. You can use `generate_config.py` by specifying the root directory of your dataset and the files with the data shards:
|
||||
|
||||
`python generate_config.py`
|
||||
|
||||
2. Run the train index command
|
||||
|
||||
`python run.py --command train_index --config config_ssnpp.yaml --xb ssnpp_1B`
|
||||
|
||||
|
||||
3. Run the index-shard command so it produces sharded indexes, required for the search step
|
||||
|
||||
`python run.py --command index_shard --config config_ssnpp.yaml --xb ssnpp_1B`
|
||||
|
||||
|
||||
4. Send jobs to the cluster to run the search
|
||||
|
||||
`python run.py --command search --config config_ssnpp.yaml --xb ssnpp_1B --cluster_run --partition <PARTITION-NAME>`
|
||||
|
||||
|
||||
Remarks about the `search` command: it is assumed that the database vectors are the query vectors when performing the search step.
|
||||
a. If the query vectors are different from the database vectors, they should be passed via the `--xq` argument
|
||||
b. A new dataset needs to be prepared (step 1) before passing it to the query vectors argument `--xq`
|
||||
|
||||
`python run.py --command search --config config_ssnpp.yaml --xb ssnpp_1B --xq <QUERIES_DATASET_NAME>`
|
||||
|
||||
|
||||
5. We can always run the consistency-check for sanity checks!
|
||||
|
||||
`python run.py --command consistency_check --config config_ssnpp.yaml --xb ssnpp_1B`
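
For orientation, here is a minimal sketch (not part of the toolkit's CLI) of how a config such as `config_ssnpp.yaml` maps onto the helpers in `dataset.py`. Run it from this folder; the paths are placeholders and the listed data files must exist on disk:

```python
import numpy as np
import yaml

from dataset import create_dataset_from_oivf_config

with open("config_ssnpp.yaml", "r") as f:
    cfg = yaml.safe_load(f)

# builds a MultiFileVectorDataset over the shards listed under datasets -> ssnpp_1B
ds = create_dataset_from_oivf_config(cfg, "ssnpp_1B")

# stream the database vectors in batches of 10k rows of dimension cfg["d"]
first_batch = next(ds.iterate(0, 10_000, np.float32))
print(first_batch.shape)
```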
|
||||
|
||||
0
packages/leann-backend-hnsw/third_party/faiss/demos/offline_ivf/__init__.py
vendored
Normal file
110
packages/leann-backend-hnsw/third_party/faiss/demos/offline_ivf/config_ssnpp.yaml
vendored
Normal file
@@ -0,0 +1,110 @@
|
||||
d: 256
|
||||
output: /checkpoint/marialomeli/offline_faiss/ssnpp
|
||||
index:
|
||||
prod:
|
||||
- 'IVF8192,PQ128'
|
||||
non-prod:
|
||||
- 'IVF16384,PQ128'
|
||||
- 'IVF32768,PQ128'
|
||||
- 'OPQ64_128,IVF4096,PQ64'
|
||||
nprobe:
|
||||
prod:
|
||||
- 512
|
||||
non-prod:
|
||||
- 256
|
||||
- 128
|
||||
- 1024
|
||||
- 2048
|
||||
- 4096
|
||||
- 8192
|
||||
|
||||
k: 50
|
||||
index_shard_size: 50000000
|
||||
query_batch_size: 50000000
|
||||
evaluation_sample: 10000
|
||||
training_sample: 1572864
|
||||
datasets:
|
||||
ssnpp_1B:
|
||||
root: /checkpoint/marialomeli/ssnpp_data
|
||||
size: 1000000000
|
||||
files:
|
||||
- dtype: uint8
|
||||
format: npy
|
||||
name: ssnpp_0000000000.npy
|
||||
size: 50000000
|
||||
- dtype: uint8
|
||||
format: npy
|
||||
name: ssnpp_0000000001.npy
|
||||
size: 50000000
|
||||
- dtype: uint8
|
||||
format: npy
|
||||
name: ssnpp_0000000002.npy
|
||||
size: 50000000
|
||||
- dtype: uint8
|
||||
format: npy
|
||||
name: ssnpp_0000000003.npy
|
||||
size: 50000000
|
||||
- dtype: uint8
|
||||
format: npy
|
||||
name: ssnpp_0000000004.npy
|
||||
size: 50000000
|
||||
- dtype: uint8
|
||||
format: npy
|
||||
name: ssnpp_0000000005.npy
|
||||
size: 50000000
|
||||
- dtype: uint8
|
||||
format: npy
|
||||
name: ssnpp_0000000006.npy
|
||||
size: 50000000
|
||||
- dtype: uint8
|
||||
format: npy
|
||||
name: ssnpp_0000000007.npy
|
||||
size: 50000000
|
||||
- dtype: uint8
|
||||
format: npy
|
||||
name: ssnpp_0000000008.npy
|
||||
size: 50000000
|
||||
- dtype: uint8
|
||||
format: npy
|
||||
name: ssnpp_0000000009.npy
|
||||
size: 50000000
|
||||
- dtype: uint8
|
||||
format: npy
|
||||
name: ssnpp_0000000010.npy
|
||||
size: 50000000
|
||||
- dtype: uint8
|
||||
format: npy
|
||||
name: ssnpp_0000000011.npy
|
||||
size: 50000000
|
||||
- dtype: uint8
|
||||
format: npy
|
||||
name: ssnpp_0000000012.npy
|
||||
size: 50000000
|
||||
- dtype: uint8
|
||||
format: npy
|
||||
name: ssnpp_0000000013.npy
|
||||
size: 50000000
|
||||
- dtype: uint8
|
||||
format: npy
|
||||
name: ssnpp_0000000014.npy
|
||||
size: 50000000
|
||||
- dtype: uint8
|
||||
format: npy
|
||||
name: ssnpp_0000000015.npy
|
||||
size: 50000000
|
||||
- dtype: uint8
|
||||
format: npy
|
||||
name: ssnpp_0000000016.npy
|
||||
size: 50000000
|
||||
- dtype: uint8
|
||||
format: npy
|
||||
name: ssnpp_0000000017.npy
|
||||
size: 50000000
|
||||
- dtype: uint8
|
||||
format: npy
|
||||
name: ssnpp_0000000018.npy
|
||||
size: 50000000
|
||||
- dtype: uint8
|
||||
format: npy
|
||||
name: ssnpp_0000000019.npy
|
||||
size: 50000000
|
||||
64
packages/leann-backend-hnsw/third_party/faiss/demos/offline_ivf/create_sharded_ssnpp_files.py
vendored
Normal file
@@ -0,0 +1,64 @@
|
||||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
#
|
||||
# This source code is licensed under the MIT license found in the
|
||||
# LICENSE file in the root directory of this source tree.
|
||||
|
||||
import numpy as np
|
||||
import argparse
|
||||
import os
|
||||
|
||||
|
||||
def xbin_mmap(fname, dtype, maxn=-1):
|
||||
"""
|
||||
Code from
|
||||
https://github.com/harsha-simhadri/big-ann-benchmarks/blob/main/benchmark/dataset_io.py#L94
|
||||
mmap the competition file format for a given type of items
|
||||
"""
|
||||
n, d = map(int, np.fromfile(fname, dtype="uint32", count=2))
|
||||
assert os.stat(fname).st_size == 8 + n * d * np.dtype(dtype).itemsize
|
||||
if maxn > 0:
|
||||
n = min(n, maxn)
|
||||
return np.memmap(fname, dtype=dtype, mode="r", offset=8, shape=(n, d))
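

# For reference, the *.u8bin layout handled above is an 8-byte header of two
# uint32 values (number of vectors, dimension) followed by the raw uint8 data.
# A tiny illustrative example (not executed by this script; the path is a
# placeholder):
#
#   n, d = 3, 4
#   with open("/tmp/tiny.u8bin", "wb") as f:
#       np.array([n, d], dtype="uint32").tofile(f)
#       np.arange(n * d, dtype="uint8").tofile(f)
#   xbin_mmap("/tmp/tiny.u8bin", dtype="uint8").shape  # -> (3, 4)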
|
||||
|
||||
|
||||
def main(args: argparse.Namespace):
|
||||
ssnpp_data = xbin_mmap(fname=args.filepath, dtype="uint8")
|
||||
num_batches = ssnpp_data.shape[0] // args.data_batch
|
||||
assert (
|
||||
ssnpp_data.shape[0] % args.data_batch == 0
|
||||
), "num of embeddings per file should divide total num of embeddings"
|
||||
for i in range(num_batches):
|
||||
xb_batch = ssnpp_data[
|
||||
i * args.data_batch:(i + 1) * args.data_batch, :
|
||||
]
|
||||
filename = args.output_dir + f"/ssnpp_{(i):010}.npy"
|
||||
np.save(filename, xb_batch)
|
||||
print(f"File {filename} is saved!")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
"--data_batch",
|
||||
dest="data_batch",
|
||||
type=int,
|
||||
default=50000000,
|
||||
help="Number of embeddings per file, should be a divisor of 1B",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--filepath",
|
||||
dest="filepath",
|
||||
type=str,
|
||||
default="/datasets01/big-ann-challenge-data/FB_ssnpp/FB_ssnpp_database.u8bin",
|
||||
help="path of 1B ssnpp database vectors' original file",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--filepath",
|
||||
dest="output_dir",
|
||||
type=str,
|
||||
default="/checkpoint/marialomeli/ssnpp_data",
|
||||
help="path to put sharded files",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
main(args)
|
||||
174
packages/leann-backend-hnsw/third_party/faiss/demos/offline_ivf/dataset.py
vendored
Normal file
@@ -0,0 +1,174 @@
|
||||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
#
|
||||
# This source code is licensed under the MIT license found in the
|
||||
# LICENSE file in the root directory of this source tree.
|
||||
|
||||
import os
|
||||
import numpy as np
|
||||
import faiss
|
||||
from typing import List
|
||||
import random
|
||||
import logging
|
||||
from functools import lru_cache
|
||||
|
||||
|
||||
def create_dataset_from_oivf_config(cfg, ds_name):
|
||||
normalise = cfg["normalise"] if "normalise" in cfg else False
|
||||
return MultiFileVectorDataset(
|
||||
cfg["datasets"][ds_name]["root"],
|
||||
[
|
||||
FileDescriptor(
|
||||
f["name"], f["format"], np.dtype(f["dtype"]), f["size"]
|
||||
)
|
||||
for f in cfg["datasets"][ds_name]["files"]
|
||||
],
|
||||
cfg["d"],
|
||||
normalise,
|
||||
cfg["datasets"][ds_name]["size"],
|
||||
)
|
||||
|
||||
|
||||
@lru_cache(maxsize=100)
|
||||
def _memmap_vecs(
|
||||
file_name: str, format: str, dtype: np.dtype, size: int, d: int
|
||||
) -> np.array:
|
||||
"""
|
||||
If the file is in raw format, the file size will
|
||||
be divisible by the dimensionality and by the size
|
||||
of the data type.
|
||||
Otherwise, the file contains a header and we assume
|
||||
it is of .npy type. It then returns the memmapped file.
|
||||
"""
|
||||
|
||||
assert os.path.exists(file_name), f"file does not exist {file_name}"
|
||||
if format == "raw":
|
||||
fl = os.path.getsize(file_name)
|
||||
nb = fl // d // dtype.itemsize
|
||||
assert nb == size, f"{nb} is different than config's {size}"
|
||||
assert fl == d * dtype.itemsize * nb # no header
|
||||
return np.memmap(file_name, shape=(nb, d), dtype=dtype, mode="r")
|
||||
elif format == "npy":
|
||||
vecs = np.load(file_name, mmap_mode="r")
|
||||
assert vecs.shape[0] == size, f"size:{size},shape {vecs.shape[0]}"
|
||||
assert vecs.shape[1] == d
|
||||
assert vecs.dtype == dtype
|
||||
return vecs
|
||||
else:
|
||||
ValueError("The file cannot be loaded in the current format.")
|
||||
|
||||
|
||||
class FileDescriptor:
|
||||
def __init__(self, name: str, format: str, dtype: np.dtype, size: int):
|
||||
self.name = name
|
||||
self.format = format
|
||||
self.dtype = dtype
|
||||
self.size = size
|
||||
|
||||
|
||||
class MultiFileVectorDataset:
|
||||
def __init__(
|
||||
self,
|
||||
root: str,
|
||||
file_descriptors: List[FileDescriptor],
|
||||
d: int,
|
||||
normalize: bool,
|
||||
size: int,
|
||||
):
|
||||
assert os.path.exists(root)
|
||||
self.root = root
|
||||
self.file_descriptors = file_descriptors
|
||||
self.d = d
|
||||
self.normalize = normalize
|
||||
self.size = size
|
||||
self.file_offsets = [0]
|
||||
t = 0
|
||||
for f in self.file_descriptors:
|
||||
xb = _memmap_vecs(
|
||||
f"{self.root}/{f.name}", f.format, f.dtype, f.size, self.d
|
||||
)
|
||||
t += xb.shape[0]
|
||||
self.file_offsets.append(t)
|
||||
assert (
|
||||
t == self.size
|
||||
), "the sum of num of embeddings per file!=total num of embeddings"
|
||||
|
||||
def iterate(self, start: int, batch_size: int, dt: np.dtype):
|
||||
buffer = np.empty(shape=(batch_size, self.d), dtype=dt)
|
||||
rem = 0
|
||||
for f in self.file_descriptors:
|
||||
if start >= f.size:
|
||||
start -= f.size
|
||||
continue
|
||||
logging.info(f"processing: {f.name}...")
|
||||
xb = _memmap_vecs(
|
||||
f"{self.root}/{f.name}",
|
||||
f.format,
|
||||
f.dtype,
|
||||
f.size,
|
||||
self.d,
|
||||
)
|
||||
if start > 0:
|
||||
xb = xb[start:]
|
||||
start = 0
|
||||
req = min(batch_size - rem, xb.shape[0])
|
||||
buffer[rem:rem + req] = xb[:req]
|
||||
rem += req
|
||||
if rem == batch_size:
|
||||
if self.normalize:
|
||||
faiss.normalize_L2(buffer)
|
||||
yield buffer.copy()
|
||||
rem = 0
|
||||
for i in range(req, xb.shape[0], batch_size):
|
||||
j = i + batch_size
|
||||
if j <= xb.shape[0]:
|
||||
tmp = xb[i:j].astype(dt)
|
||||
if self.normalize:
|
||||
faiss.normalize_L2(tmp)
|
||||
yield tmp
|
||||
else:
|
||||
rem = xb.shape[0] - i
|
||||
buffer[:rem] = xb[i:j]
|
||||
if rem > 0:
|
||||
tmp = buffer[:rem]
|
||||
if self.normalize:
|
||||
faiss.normalize_L2(tmp)
|
||||
yield tmp
|
||||
|
||||
def get(self, idx: List[int]):
|
||||
n = len(idx)
|
||||
fidx = np.searchsorted(self.file_offsets, idx, "right")
|
||||
res = np.empty(shape=(len(idx), self.d), dtype=np.float32)
|
||||
for r, id, fid in zip(range(n), idx, fidx):
|
||||
assert fid > 0 and fid <= len(self.file_descriptors), f"{fid}"
|
||||
f = self.file_descriptors[fid - 1]
|
||||
# deferring normalization until after reading the vec
|
||||
vecs = _memmap_vecs(
|
||||
f"{self.root}/{f.name}", f.format, f.dtype, f.size, self.d
|
||||
)
|
||||
i = id - self.file_offsets[fid - 1]
|
||||
assert i >= 0 and i < vecs.shape[0]
|
||||
res[r, :] = vecs[i] # TODO: find a faster way
|
||||
if self.normalize:
|
||||
faiss.normalize_L2(res)
|
||||
return res
|
||||
|
||||
def sample(self, n, idx_fn, vecs_fn):
|
||||
if vecs_fn and os.path.exists(vecs_fn):
|
||||
vecs = np.load(vecs_fn)
|
||||
assert vecs.shape == (n, self.d)
|
||||
return vecs
|
||||
if idx_fn and os.path.exists(idx_fn):
|
||||
idx = np.load(idx_fn)
|
||||
assert idx.size == n
|
||||
else:
|
||||
idx = np.array(sorted(random.sample(range(self.size), n)))
|
||||
if idx_fn:
|
||||
np.save(idx_fn, idx)
|
||||
vecs = self.get(idx)
|
||||
if vecs_fn:
|
||||
np.save(vecs_fn, vecs)
|
||||
return vecs
|
||||
|
||||
def get_first_n(self, n, dt):
|
||||
assert n <= self.size
|
||||
return next(self.iterate(0, n, dt))
|
||||
46
packages/leann-backend-hnsw/third_party/faiss/demos/offline_ivf/generate_config.py
vendored
Normal file
@@ -0,0 +1,46 @@
|
||||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
#
|
||||
# This source code is licensed under the MIT license found in the
|
||||
# LICENSE file in the root directory of this source tree.
|
||||
|
||||
import numpy as np
|
||||
import os
|
||||
import yaml
|
||||
|
||||
# with ssnpp sharded data
|
||||
root = "/checkpoint/marialomeli/ssnpp_data"
|
||||
file_names = [f"ssnpp_{i:010}.npy" for i in range(20)]
|
||||
d = 256
|
||||
dt = np.dtype(np.uint8)
|
||||
|
||||
|
||||
def read_embeddings(fp):
|
||||
fl = os.path.getsize(fp)
|
||||
nb = fl // d // dt.itemsize
|
||||
print(nb)
|
||||
if fl == d * dt.itemsize * nb: # no header
|
||||
return ("raw", np.memmap(fp, shape=(nb, d), dtype=dt, mode="r"))
|
||||
else: # assume npy
|
||||
vecs = np.load(fp, mmap_mode="r")
|
||||
assert vecs.shape[1] == d
|
||||
assert vecs.dtype == dt
|
||||
return ("npy", vecs)
|
||||
|
||||
|
||||
cfg = {}
|
||||
files = []
|
||||
size = 0
|
||||
for fn in file_names:
|
||||
fp = f"{root}/{fn}"
|
||||
assert os.path.exists(fp), f"{fp} is missing"
|
||||
ft, xb = read_embeddings(fp)
|
||||
files.append(
|
||||
{"name": fn, "size": xb.shape[0], "dtype": dt.name, "format": ft}
|
||||
)
|
||||
size += xb.shape[0]
|
||||
|
||||
cfg["size"] = size
|
||||
cfg["root"] = root
|
||||
cfg["d"] = d
|
||||
cfg["files"] = files
|
||||
print(yaml.dump(cfg))
|
||||
891
packages/leann-backend-hnsw/third_party/faiss/demos/offline_ivf/offline_ivf.py
vendored
Normal file
@@ -0,0 +1,891 @@
|
||||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
#
|
||||
# This source code is licensed under the MIT license found in the
|
||||
# LICENSE file in the root directory of this source tree.
|
||||
|
||||
import faiss
|
||||
import numpy as np
|
||||
import os
|
||||
from tqdm import tqdm, trange
|
||||
import sys
|
||||
import logging
|
||||
from faiss.contrib.ondisk import merge_ondisk
|
||||
from faiss.contrib.big_batch_search import big_batch_search
|
||||
from faiss.contrib.exhaustive_search import knn_ground_truth
|
||||
from faiss.contrib.evaluation import knn_intersection_measure
|
||||
from utils import (
|
||||
get_intersection_cardinality_frequencies,
|
||||
margin,
|
||||
is_pretransform_index,
|
||||
)
|
||||
from dataset import create_dataset_from_oivf_config
|
||||
|
||||
logging.basicConfig(
|
||||
format=(
|
||||
"%(asctime)s.%(msecs)03d %(levelname)-8s %(threadName)-12s %(message)s"
|
||||
),
|
||||
level=logging.INFO,
|
||||
datefmt="%Y-%m-%d %H:%M:%S",
|
||||
force=True,
|
||||
)
|
||||
|
||||
EMBEDDINGS_BATCH_SIZE: int = 100_000
|
||||
NUM_SUBSAMPLES: int = 100
|
||||
SMALL_DATA_SAMPLE: int = 10000
|
||||
|
||||
|
||||
class OfflineIVF:
|
||||
def __init__(self, cfg, args, nprobe, index_factory_str):
|
||||
self.input_d = cfg["d"]
|
||||
self.dt = cfg["datasets"][args.xb]["files"][0]["dtype"]
|
||||
assert self.input_d > 0
|
||||
output_dir = cfg["output"]
|
||||
assert os.path.exists(output_dir)
|
||||
self.index_factory = index_factory_str
|
||||
assert self.index_factory is not None
|
||||
self.index_factory_fn = self.index_factory.replace(",", "_")
|
||||
self.index_template_file = (
|
||||
f"{output_dir}/{args.xb}/{self.index_factory_fn}.empty.faissindex"
|
||||
)
|
||||
logging.info(f"index template: {self.index_template_file}")
|
||||
|
||||
if not args.xq:
|
||||
args.xq = args.xb
|
||||
|
||||
self.by_residual = True
|
||||
if args.no_residuals:
|
||||
self.by_residual = False
|
||||
|
||||
xb_output_dir = f"{output_dir}/{args.xb}"
|
||||
if not os.path.exists(xb_output_dir):
|
||||
os.makedirs(xb_output_dir)
|
||||
xq_output_dir = f"{output_dir}/{args.xq}"
|
||||
if not os.path.exists(xq_output_dir):
|
||||
os.makedirs(xq_output_dir)
|
||||
search_output_dir = f"{output_dir}/{args.xq}_in_{args.xb}"
|
||||
if not os.path.exists(search_output_dir):
|
||||
os.makedirs(search_output_dir)
|
||||
self.knn_dir = f"{search_output_dir}/knn"
|
||||
if not os.path.exists(self.knn_dir):
|
||||
os.makedirs(self.knn_dir)
|
||||
self.eval_dir = f"{search_output_dir}/eval"
|
||||
if not os.path.exists(self.eval_dir):
|
||||
os.makedirs(self.eval_dir)
|
||||
self.index = {} # to keep a reference to opened indices,
|
||||
self.ivls = {} # hstack inverted lists,
|
||||
self.index_shards = {} # and index shards
|
||||
self.index_shard_prefix = (
|
||||
f"{xb_output_dir}/{self.index_factory_fn}.shard_"
|
||||
)
|
||||
self.xq_index_shard_prefix = (
|
||||
f"{xq_output_dir}/{self.index_factory_fn}.shard_"
|
||||
)
|
||||
self.index_file = ( # TODO: added back temporarily for evaluate, handle name of non-sharded index file and remove.
|
||||
f"{xb_output_dir}/{self.index_factory_fn}.faissindex"
|
||||
)
|
||||
self.xq_index_file = (
|
||||
f"{xq_output_dir}/{self.index_factory_fn}.faissindex"
|
||||
)
|
||||
self.training_sample = cfg["training_sample"]
|
||||
self.evaluation_sample = cfg["evaluation_sample"]
|
||||
self.xq_ds = create_dataset_from_oivf_config(cfg, args.xq)
|
||||
self.xb_ds = create_dataset_from_oivf_config(cfg, args.xb)
|
||||
file_descriptors = self.xq_ds.file_descriptors
|
||||
self.file_sizes = [fd.size for fd in file_descriptors]
|
||||
self.shard_size = cfg["index_shard_size"] # ~100GB
|
||||
self.nshards = self.xb_ds.size // self.shard_size
|
||||
if self.xb_ds.size % self.shard_size != 0:
|
||||
self.nshards += 1
|
||||
self.xq_nshards = self.xq_ds.size // self.shard_size
|
||||
if self.xq_ds.size % self.shard_size != 0:
|
||||
self.xq_nshards += 1
|
||||
self.nprobe = nprobe
|
||||
assert self.nprobe > 0, "Invalid nprobe parameter."
|
||||
if "deduper" in cfg:
|
||||
self.deduper = cfg["deduper"]
|
||||
self.deduper_codec_fn = [
|
||||
f"{xb_output_dir}/deduper_codec_{codec.replace(',', '_')}"
|
||||
for codec in self.deduper
|
||||
]
|
||||
self.deduper_idx_fn = [
|
||||
f"{xb_output_dir}/deduper_idx_{codec.replace(',', '_')}"
|
||||
for codec in self.deduper
|
||||
]
|
||||
else:
|
||||
self.deduper = None
|
||||
self.k = cfg["k"]
|
||||
assert self.k > 0, "Invalid number of neighbours parameter."
|
||||
self.knn_output_file_suffix = (
|
||||
f"{self.index_factory_fn}_np{self.nprobe}.npy"
|
||||
)
|
||||
|
||||
fp = 32
|
||||
if self.dt == "float16":
|
||||
fp = 16
|
||||
|
||||
self.xq_bs = cfg["query_batch_size"]
|
||||
if "metric" in cfg:
|
||||
self.metric = eval(f'faiss.{cfg["metric"]}')
|
||||
else:
|
||||
self.metric = faiss.METRIC_L2
|
||||
|
||||
if "evaluate_by_margin" in cfg:
|
||||
self.evaluate_by_margin = cfg["evaluate_by_margin"]
|
||||
else:
|
||||
self.evaluate_by_margin = False
|
||||
|
||||
os.system("grep -m1 'model name' < /proc/cpuinfo")
|
||||
os.system("grep -E 'MemTotal|MemFree' /proc/meminfo")
|
||||
os.system("nvidia-smi")
|
||||
os.system("nvcc --version")
|
||||
|
||||
self.knn_queries_memory_limit = 4 * 1024 * 1024 * 1024 # 4 GB
|
||||
self.knn_vectors_memory_limit = 8 * 1024 * 1024 * 1024 # 8 GB
|
||||
|
||||
def input_stats(self):
|
||||
"""
|
||||
Computes and logs matrix statistics (faiss.MatrixStats) on a training-sample-sized subset of the first chunk of data in the database.
|
||||
"""
|
||||
xb_sample = self.xb_ds.get_first_n(self.training_sample, np.float32)
|
||||
logging.info(f"input shape: {xb_sample.shape}")
|
||||
logging.info("running MatrixStats on training sample...")
|
||||
logging.info(faiss.MatrixStats(xb_sample).comments)
|
||||
logging.info("done")
|
||||
|
||||
def dedupe(self):
|
||||
logging.info(self.deduper)
|
||||
if self.deduper is None:
|
||||
logging.info("No deduper configured")
|
||||
return
|
||||
codecs = []
|
||||
codesets = []
|
||||
idxs = []
|
||||
for factory, filename in zip(self.deduper, self.deduper_codec_fn):
|
||||
if os.path.exists(filename):
|
||||
logging.info(f"loading trained dedupe codec: {filename}")
|
||||
codec = faiss.read_index(filename)
|
||||
else:
|
||||
logging.info(f"training dedupe codec: {factory}")
|
||||
codec = faiss.index_factory(self.input_d, factory)
|
||||
xb_sample = np.unique(
|
||||
self.xb_ds.get_first_n(100_000, np.float32), axis=0
|
||||
)
|
||||
faiss.ParameterSpace().set_index_parameter(codec, "verbose", 1)
|
||||
codec.train(xb_sample)
|
||||
logging.info(f"writing trained dedupe codec: {filename}")
|
||||
faiss.write_index(codec, filename)
|
||||
codecs.append(codec)
|
||||
codesets.append(faiss.CodeSet(codec.sa_code_size()))
|
||||
idxs.append(np.empty((0,), dtype=np.uint32))
|
||||
bs = 1_000_000
|
||||
i = 0
|
||||
for buffer in tqdm(self._iterate_transformed(self.xb_ds, 0, bs, np.float32)):
|
||||
for j in range(len(codecs)):
|
||||
codec, codeset, idx = codecs[j], codesets[j], idxs[j]
|
||||
uniq = codeset.insert(codec.sa_encode(buffer))
|
||||
idxs[j] = np.append(
|
||||
idx,
|
||||
np.arange(i, i + buffer.shape[0], dtype=np.uint32)[uniq],
|
||||
)
|
||||
i += buffer.shape[0]
|
||||
for idx, filename in zip(idxs, self.deduper_idx_fn):
|
||||
logging.info(f"writing {filename}, shape: {idx.shape}")
|
||||
np.save(filename, idx)
|
||||
logging.info("done")
|
||||
|
||||
def train_index(self):
|
||||
"""
|
||||
Trains the index using a subsample of the first chunk of data in the database and saves it in the template file (with no vectors added).
|
||||
"""
|
||||
assert not os.path.exists(self.index_template_file), (
|
||||
"The train command has been ran, the index template file already"
|
||||
" exists."
|
||||
)
|
||||
xb_sample = np.unique(
|
||||
self.xb_ds.get_first_n(self.training_sample, np.float32), axis=0
|
||||
)
|
||||
logging.info(f"input shape: {xb_sample.shape}")
|
||||
index = faiss.index_factory(
|
||||
self.input_d, self.index_factory, self.metric
|
||||
)
|
||||
index_ivf = faiss.downcast_index(faiss.extract_index_ivf(index))
|
||||
index_ivf.by_residual = True
|
||||
faiss.ParameterSpace().set_index_parameter(index, "verbose", 1)
|
||||
logging.info("running training...")
|
||||
index.train(xb_sample)
|
||||
logging.info(f"writing trained index {self.index_template_file}...")
|
||||
faiss.write_index(index, self.index_template_file)
|
||||
logging.info("done")
|
||||
|
||||
def _iterate_transformed(self, ds, start, batch_size, dt):
|
||||
assert os.path.exists(self.index_template_file)
|
||||
index = faiss.read_index(self.index_template_file)
|
||||
if is_pretransform_index(index):
|
||||
vt = index.chain.at(0) # fetch pretransform
|
||||
for buffer in ds.iterate(start, batch_size, dt):
|
||||
yield vt.apply(buffer)
|
||||
else:
|
||||
for buffer in ds.iterate(start, batch_size, dt):
|
||||
yield buffer
|
||||
|
||||
def index_shard(self):
|
||||
assert os.path.exists(self.index_template_file)
|
||||
index = faiss.read_index(self.index_template_file)
|
||||
index_ivf = faiss.downcast_index(faiss.extract_index_ivf(index))
|
||||
assert self.nprobe <= index_ivf.quantizer.ntotal, (
|
||||
f"the number of vectors {index_ivf.quantizer.ntotal} is not enough"
|
||||
f" to retrieve {self.nprobe} neighbours, check."
|
||||
)
|
||||
cpu_quantizer = index_ivf.quantizer
|
||||
gpu_quantizer = faiss.index_cpu_to_all_gpus(cpu_quantizer)
|
||||
|
||||
for i in range(0, self.nshards):
|
||||
sfn = f"{self.index_shard_prefix}{i}"
|
||||
try:
|
||||
index.reset()
|
||||
index_ivf.quantizer = gpu_quantizer
|
||||
with open(sfn, "xb"):
|
||||
start = i * self.shard_size
|
||||
jj = 0
|
||||
embeddings_batch_size = min(
|
||||
EMBEDDINGS_BATCH_SIZE, self.shard_size
|
||||
)
|
||||
assert (
|
||||
self.shard_size % embeddings_batch_size == 0
|
||||
or EMBEDDINGS_BATCH_SIZE % embeddings_batch_size == 0
|
||||
), (
|
||||
f"the shard size {self.shard_size} and embeddings"
|
||||
f" shard size {EMBEDDINGS_BATCH_SIZE} are not"
|
||||
" divisible"
|
||||
)
|
||||
|
||||
for xb_j in tqdm(
|
||||
self._iterate_transformed(
|
||||
self.xb_ds,
|
||||
start,
|
||||
embeddings_batch_size,
|
||||
np.float32,
|
||||
),
|
||||
file=sys.stdout,
|
||||
):
|
||||
if is_pretransform_index(index):
|
||||
assert xb_j.shape[1] == index.chain.at(0).d_out
|
||||
index_ivf.add_with_ids(
|
||||
xb_j,
|
||||
np.arange(start + jj, start + jj + xb_j.shape[0]),
|
||||
)
|
||||
else:
|
||||
assert xb_j.shape[1] == index.d
|
||||
index.add_with_ids(
|
||||
xb_j,
|
||||
np.arange(start + jj, start + jj + xb_j.shape[0]),
|
||||
)
|
||||
jj += xb_j.shape[0]
|
||||
logging.info(jj)
|
||||
assert (
|
||||
jj <= self.shard_size
|
||||
), f"jj {jj} and shard_zide {self.shard_size}"
|
||||
if jj == self.shard_size:
|
||||
break
|
||||
logging.info(f"writing {sfn}...")
|
||||
index_ivf.quantizer = cpu_quantizer
|
||||
faiss.write_index(index, sfn)
|
||||
except FileExistsError:
|
||||
logging.info(f"skipping shard: {i}")
|
||||
continue
|
||||
logging.info("done")
|
||||
|
||||
def merge_index(self):
|
||||
ivf_file = f"{self.index_file}.ivfdata"
|
||||
|
||||
assert os.path.exists(self.index_template_file)
|
||||
assert not os.path.exists(
|
||||
ivf_file
|
||||
), f"file with embeddings data {ivf_file} not found, check."
|
||||
assert not os.path.exists(self.index_file)
|
||||
index = faiss.read_index(self.index_template_file)
|
||||
block_fnames = [
|
||||
f"{self.index_shard_prefix}{i}" for i in range(self.nshards)
|
||||
]
|
||||
for fn in block_fnames:
|
||||
assert os.path.exists(fn)
|
||||
logging.info(block_fnames)
|
||||
logging.info("merging...")
|
||||
merge_ondisk(index, block_fnames, ivf_file)
|
||||
logging.info("writing index...")
|
||||
faiss.write_index(index, self.index_file)
|
||||
logging.info("done")
|
||||
|
||||
def _cached_search(
|
||||
self,
|
||||
sample,
|
||||
xq_ds,
|
||||
xb_ds,
|
||||
idx_file,
|
||||
vecs_file,
|
||||
I_file,
|
||||
D_file,
|
||||
index_file=None,
|
||||
nprobe=None,
|
||||
):
|
||||
if not os.path.exists(I_file):
|
||||
assert not os.path.exists(I_file), f"file {I_file} already exists"
|
||||
assert not os.path.exists(D_file), f"file {D_file} already exists"
|
||||
xq = xq_ds.sample(sample, idx_file, vecs_file)
|
||||
|
||||
if index_file:
|
||||
D, I = self._index_nonsharded_search(index_file, xq, nprobe)
|
||||
else:
|
||||
logging.info("ground truth computations")
|
||||
db_iterator = xb_ds.iterate(0, 100_000, np.float32)
|
||||
D, I = knn_ground_truth(
|
||||
xq, db_iterator, self.k, metric_type=self.metric
|
||||
)
|
||||
assert np.amin(I) >= 0
|
||||
|
||||
np.save(I_file, I)
|
||||
np.save(D_file, D)
|
||||
else:
|
||||
assert os.path.exists(idx_file), f"file {idx_file} does not exist "
|
||||
assert os.path.exists(
|
||||
vecs_file
|
||||
), f"file {vecs_file} does not exist "
|
||||
assert os.path.exists(I_file), f"file {I_file} does not exist "
|
||||
assert os.path.exists(D_file), f"file {D_file} does not exist "
|
||||
I = np.load(I_file)
|
||||
D = np.load(D_file)
|
||||
assert I.shape == (sample, self.k), f"{I_file} shape mismatch"
|
||||
assert D.shape == (sample, self.k), f"{D_file} shape mismatch"
|
||||
return (D, I)
|
||||
|
||||
def _index_search(self, index_shard_prefix, xq, nprobe):
|
||||
assert nprobe is not None
|
||||
logging.info(
|
||||
f"open sharded index: {index_shard_prefix}, {self.nshards}"
|
||||
)
|
||||
index = self._open_sharded_index(index_shard_prefix)
|
||||
index_ivf = faiss.downcast_index(faiss.extract_index_ivf(index))
|
||||
logging.info(f"setting nprobe to {nprobe}")
|
||||
index_ivf.nprobe = nprobe
|
||||
return index.search(xq, self.k)
|
||||
|
||||
def _index_nonsharded_search(self, index_file, xq, nprobe):
|
||||
assert nprobe is not None
|
||||
logging.info(f"index {index_file}")
|
||||
assert os.path.exists(index_file), f"file {index_file} does not exist "
|
||||
index = faiss.read_index(index_file, faiss.IO_FLAG_ONDISK_SAME_DIR)
|
||||
logging.info(f"index size {index.ntotal} ")
|
||||
index_ivf = faiss.downcast_index(faiss.extract_index_ivf(index))
|
||||
logging.info(f"setting nprobe to {nprobe}")
|
||||
index_ivf.nprobe = nprobe
|
||||
return index.search(xq, self.k)
|
||||
|
||||
def _refine_distances(self, xq_ds, idx, xb_ds, I):
|
||||
xq = xq_ds.get(idx).repeat(self.k, axis=0)
|
||||
xb = xb_ds.get(I.reshape(-1))
|
||||
if self.metric == faiss.METRIC_INNER_PRODUCT:
|
||||
return (xq * xb).sum(axis=1).reshape(I.shape)
|
||||
elif self.metric == faiss.METRIC_L2:
|
||||
return ((xq - xb) ** 2).sum(axis=1).reshape(I.shape)
|
||||
else:
|
||||
raise ValueError(f"metric not supported {self.metric}")
|
||||
|
||||
def evaluate(self):
|
||||
self._evaluate(
|
||||
self.index_factory_fn,
|
||||
self.index_file,
|
||||
self.xq_index_file,
|
||||
self.nprobe,
|
||||
)
|
||||
|
||||
def _evaluate(self, index_factory_fn, index_file, xq_index_file, nprobe):
|
||||
idx_a_file = f"{self.eval_dir}/idx_a.npy"
|
||||
idx_b_gt_file = f"{self.eval_dir}/idx_b_gt.npy"
|
||||
idx_b_ann_file = (
|
||||
f"{self.eval_dir}/idx_b_ann_{index_factory_fn}_np{nprobe}.npy"
|
||||
)
|
||||
vecs_a_file = f"{self.eval_dir}/vecs_a.npy"
|
||||
vecs_b_gt_file = f"{self.eval_dir}/vecs_b_gt.npy"
|
||||
vecs_b_ann_file = (
|
||||
f"{self.eval_dir}/vecs_b_ann_{index_factory_fn}_np{nprobe}.npy"
|
||||
)
|
||||
D_a_gt_file = f"{self.eval_dir}/D_a_gt.npy"
|
||||
D_a_ann_file = (
|
||||
f"{self.eval_dir}/D_a_ann_{index_factory_fn}_np{nprobe}.npy"
|
||||
)
|
||||
D_a_ann_refined_file = f"{self.eval_dir}/D_a_ann_refined_{index_factory_fn}_np{nprobe}.npy"
|
||||
D_b_gt_file = f"{self.eval_dir}/D_b_gt.npy"
|
||||
D_b_ann_file = (
|
||||
f"{self.eval_dir}/D_b_ann_{index_factory_fn}_np{nprobe}.npy"
|
||||
)
|
||||
D_b_ann_gt_file = (
|
||||
f"{self.eval_dir}/D_b_ann_gt_{index_factory_fn}_np{nprobe}.npy"
|
||||
)
|
||||
I_a_gt_file = f"{self.eval_dir}/I_a_gt.npy"
|
||||
I_a_ann_file = (
|
||||
f"{self.eval_dir}/I_a_ann_{index_factory_fn}_np{nprobe}.npy"
|
||||
)
|
||||
I_b_gt_file = f"{self.eval_dir}/I_b_gt.npy"
|
||||
I_b_ann_file = (
|
||||
f"{self.eval_dir}/I_b_ann_{index_factory_fn}_np{nprobe}.npy"
|
||||
)
|
||||
I_b_ann_gt_file = (
|
||||
f"{self.eval_dir}/I_b_ann_gt_{index_factory_fn}_np{nprobe}.npy"
|
||||
)
|
||||
margin_gt_file = f"{self.eval_dir}/margin_gt.npy"
|
||||
margin_refined_file = (
|
||||
f"{self.eval_dir}/margin_refined_{index_factory_fn}_np{nprobe}.npy"
|
||||
)
|
||||
margin_ann_file = (
|
||||
f"{self.eval_dir}/margin_ann_{index_factory_fn}_np{nprobe}.npy"
|
||||
)
|
||||
|
||||
logging.info("exact search forward")
|
||||
# xq -> xb AKA a -> b
|
||||
D_a_gt, I_a_gt = self._cached_search(
|
||||
self.evaluation_sample,
|
||||
self.xq_ds,
|
||||
self.xb_ds,
|
||||
idx_a_file,
|
||||
vecs_a_file,
|
||||
I_a_gt_file,
|
||||
D_a_gt_file,
|
||||
)
|
||||
idx_a = np.load(idx_a_file)
|
||||
|
||||
logging.info("approximate search forward")
|
||||
D_a_ann, I_a_ann = self._cached_search(
|
||||
self.evaluation_sample,
|
||||
self.xq_ds,
|
||||
self.xb_ds,
|
||||
idx_a_file,
|
||||
vecs_a_file,
|
||||
I_a_ann_file,
|
||||
D_a_ann_file,
|
||||
index_file,
|
||||
nprobe,
|
||||
)
|
||||
|
||||
logging.info(
|
||||
"calculate refined distances on approximate search forward"
|
||||
)
|
||||
if os.path.exists(D_a_ann_refined_file):
|
||||
D_a_ann_refined = np.load(D_a_ann_refined_file)
|
||||
assert D_a_ann.shape == D_a_ann_refined.shape
|
||||
else:
|
||||
D_a_ann_refined = self._refine_distances(
|
||||
self.xq_ds, idx_a, self.xb_ds, I_a_ann
|
||||
)
|
||||
np.save(D_a_ann_refined_file, D_a_ann_refined)
|
||||
|
||||
if self.evaluate_by_margin:
|
||||
k_extract = self.k
|
||||
margin_threshold = 1.05
|
||||
logging.info(
|
||||
"exact search backward from the k_extract NN results of"
|
||||
" forward search"
|
||||
)
|
||||
# xb -> xq AKA b -> a
|
||||
D_a_b_gt = D_a_gt[:, :k_extract].ravel()
|
||||
idx_b_gt = I_a_gt[:, :k_extract].ravel()
|
||||
assert len(idx_b_gt) == self.evaluation_sample * k_extract
|
||||
np.save(idx_b_gt_file, idx_b_gt)
|
||||
# exact search
|
||||
D_b_gt, _ = self._cached_search(
|
||||
len(idx_b_gt),
|
||||
self.xb_ds,
|
||||
self.xq_ds,
|
||||
idx_b_gt_file,
|
||||
vecs_b_gt_file,
|
||||
I_b_gt_file,
|
||||
D_b_gt_file,
|
||||
) # xb and xq ^^^ are inverted
|
||||
|
||||
logging.info("margin on exact search")
|
||||
margin_gt = margin(
|
||||
self.evaluation_sample,
|
||||
idx_a,
|
||||
idx_b_gt,
|
||||
D_a_b_gt,
|
||||
D_a_gt,
|
||||
D_b_gt,
|
||||
self.k,
|
||||
k_extract,
|
||||
margin_threshold,
|
||||
)
|
||||
np.save(margin_gt_file, margin_gt)
|
||||
|
||||
logging.info(
|
||||
"exact search backward from the k_extract NN results of"
|
||||
" approximate forward search"
|
||||
)
|
||||
D_a_b_refined = D_a_ann_refined[:, :k_extract].ravel()
|
||||
idx_b_ann = I_a_ann[:, :k_extract].ravel()
|
||||
        assert len(idx_b_ann) == self.evaluation_sample * k_extract
        np.save(idx_b_ann_file, idx_b_ann)

        # exact search
        D_b_ann_gt, _ = self._cached_search(
            len(idx_b_ann),
            self.xb_ds,
            self.xq_ds,
            idx_b_ann_file,
            vecs_b_ann_file,
            I_b_ann_gt_file,
            D_b_ann_gt_file,
        )  # xb and xq ^^^ are inverted

        logging.info("refined margin on approximate search")
        margin_refined = margin(
            self.evaluation_sample,
            idx_a,
            idx_b_ann,
            D_a_b_refined,
            D_a_gt,  # not D_a_ann_refined(!)
            D_b_ann_gt,
            self.k,
            k_extract,
            margin_threshold,
        )
        np.save(margin_refined_file, margin_refined)

        D_b_ann, I_b_ann = self._cached_search(
            len(idx_b_ann),
            self.xb_ds,
            self.xq_ds,
            idx_b_ann_file,
            vecs_b_ann_file,
            I_b_ann_file,
            D_b_ann_file,
            xq_index_file,
            nprobe,
        )

        D_a_b_ann = D_a_ann[:, :k_extract].ravel()

        logging.info("approximate search margin")

        margin_ann = margin(
            self.evaluation_sample,
            idx_a,
            idx_b_ann,
            D_a_b_ann,
            D_a_ann,
            D_b_ann,
            self.k,
            k_extract,
            margin_threshold,
        )
        np.save(margin_ann_file, margin_ann)

        logging.info("intersection")
        logging.info(I_a_gt)
        logging.info(I_a_ann)

        for i in range(1, self.k + 1):
            logging.info(
                f"{i}: {knn_intersection_measure(I_a_gt[:,:i], I_a_ann[:,:i])}"
            )

        logging.info(f"mean of gt distances: {D_a_gt.mean()}")
        logging.info(f"mean of approx distances: {D_a_ann.mean()}")
        logging.info(f"mean of refined distances: {D_a_ann_refined.mean()}")

        logging.info("intersection cardinality frequencies")
        logging.info(get_intersection_cardinality_frequencies(I_a_ann, I_a_gt))

        logging.info("done")

    def _knn_function(self, xq, xb, k, metric, thread_id=None):
        try:
            return faiss.knn_gpu(
                self.all_gpu_resources[thread_id],
                xq,
                xb,
                k,
                metric=metric,
                device=thread_id,
                vectorsMemoryLimit=self.knn_vectors_memory_limit,
                queriesMemoryLimit=self.knn_queries_memory_limit,
            )
        except Exception:
            logging.info(f"knn_function failed: {xq.shape}, {xb.shape}")
            raise

    def _coarse_quantize(self, index_ivf, xq, nprobe):
        assert nprobe <= index_ivf.quantizer.ntotal
        quantizer = faiss.index_cpu_to_all_gpus(index_ivf.quantizer)
        bs = 100_000
        nq = len(xq)
        q_assign = np.empty((nq, nprobe), dtype="int32")
        for i0 in trange(0, nq, bs):
            i1 = min(nq, i0 + bs)
            _, q_assign_i = quantizer.search(xq[i0:i1], nprobe)
            q_assign[i0:i1] = q_assign_i
        return q_assign

    def search(self):
        logging.info(f"search: {self.knn_dir}")
        slurm_job_id = os.environ.get("SLURM_JOB_ID")

        ngpu = faiss.get_num_gpus()
        logging.info(f"number of gpus: {ngpu}")
        self.all_gpu_resources = [
            faiss.StandardGpuResources() for _ in range(ngpu)
        ]
        self._knn_function(
            np.zeros((10, 10), dtype=np.float16),
            np.zeros((10, 10), dtype=np.float16),
            self.k,
            metric=self.metric,
            thread_id=0,
        )

        index = self._open_sharded_index()
        index_ivf = faiss.downcast_index(faiss.extract_index_ivf(index))
        logging.info(f"setting nprobe to {self.nprobe}")
        index_ivf.nprobe = self.nprobe
        # quantizer = faiss.index_cpu_to_all_gpus(index_ivf.quantizer)
        for i in range(0, self.xq_ds.size, self.xq_bs):
            Ifn = f"{self.knn_dir}/I{(i):010}_{self.knn_output_file_suffix}"
            Dfn = f"{self.knn_dir}/D_approx{(i):010}_{self.knn_output_file_suffix}"
            CPfn = f"{self.knn_dir}/CP{(i):010}_{self.knn_output_file_suffix}"

            if slurm_job_id:
                worker_record = (
                    self.knn_dir
                    + f"/record_{(i):010}_{self.knn_output_file_suffix}.txt"
                )
                if not os.path.exists(worker_record):
                    logging.info(
                        f"creating record file {worker_record} and saving job"
                        f" id: {slurm_job_id}"
                    )
                    with open(worker_record, "w") as h:
                        h.write(slurm_job_id)
                else:
                    old_slurm_id = open(worker_record, "r").read()
                    logging.info(
                        f"old job slurm id {old_slurm_id} and current job id:"
                        f" {slurm_job_id}"
                    )
                    if old_slurm_id == slurm_job_id:
                        if os.path.getsize(Ifn) == 0:
                            logging.info(
                                f"cleaning up zero length files {Ifn} and"
                                f" {Dfn}"
                            )
                            os.remove(Ifn)
                            os.remove(Dfn)

            try:
                if is_pretransform_index(index):
                    d = index.chain.at(0).d_out
                else:
                    d = self.input_d
                with open(Ifn, "xb") as f, open(Dfn, "xb") as g:
                    xq_i = np.empty(
                        shape=(self.xq_bs, d), dtype=np.float16
                    )
                    q_assign = np.empty(
                        (self.xq_bs, self.nprobe), dtype=np.int32
                    )
                    j = 0
                    quantizer = faiss.index_cpu_to_all_gpus(
                        index_ivf.quantizer
                    )
                    for xq_i_j in tqdm(
                        self._iterate_transformed(
                            self.xq_ds, i, min(100_000, self.xq_bs), np.float16
                        ),
                        file=sys.stdout,
                    ):
                        xq_i[j:j + xq_i_j.shape[0]] = xq_i_j
                        (
                            _,
                            q_assign[j:j + xq_i_j.shape[0]],
                        ) = quantizer.search(xq_i_j, self.nprobe)
                        j += xq_i_j.shape[0]
                        assert j <= xq_i.shape[0]
                        if j == xq_i.shape[0]:
                            break
                    xq_i = xq_i[:j]
                    q_assign = q_assign[:j]

                    assert q_assign.shape == (xq_i.shape[0], index_ivf.nprobe)
                    del quantizer
                    logging.info(f"computing: {Ifn}")
                    logging.info(f"computing: {Dfn}")
                    prefetch_threads = faiss.get_num_gpus()
                    D_ann, I = big_batch_search(
                        index_ivf,
                        xq_i,
                        self.k,
                        verbose=10,
                        method="knn_function",
                        knn=self._knn_function,
                        threaded=faiss.get_num_gpus() * 8,
                        use_float16=True,
                        prefetch_threads=prefetch_threads,
                        computation_threads=faiss.get_num_gpus(),
                        q_assign=q_assign,
                        checkpoint=CPfn,
                        checkpoint_freq=7200,  # in seconds
                    )
                    assert (
                        np.amin(I) >= 0
                    ), f"{I}, there exists negative indices, check"
                    logging.info(f"saving: {Ifn}")
                    np.save(f, I)
                    logging.info(f"saving: {Dfn}")
                    np.save(g, D_ann)

                    if os.path.exists(CPfn):
                        logging.info(f"removing: {CPfn}")
                        os.remove(CPfn)

            except FileExistsError:
                logging.info(f"skipping {Ifn}, already exists")
                logging.info(f"skipping {Dfn}, already exists")
                continue

    def _open_index_shard(self, fn):
        if fn in self.index_shards:
            index_shard = self.index_shards[fn]
        else:
            logging.info(f"open index shard: {fn}")
            index_shard = faiss.read_index(
                fn, faiss.IO_FLAG_MMAP | faiss.IO_FLAG_READ_ONLY
            )
            self.index_shards[fn] = index_shard
        return index_shard

    def _open_sharded_index(self, index_shard_prefix=None):
        if index_shard_prefix is None:
            index_shard_prefix = self.index_shard_prefix
        if index_shard_prefix in self.index:
            return self.index[index_shard_prefix]
        assert os.path.exists(
            self.index_template_file
        ), f"file {self.index_template_file} does not exist "
        logging.info(f"open index template: {self.index_template_file}")
        index = faiss.read_index(self.index_template_file)
        index_ivf = faiss.downcast_index(faiss.extract_index_ivf(index))
        ilv = faiss.InvertedListsPtrVector()
        for i in range(self.nshards):
            fn = f"{index_shard_prefix}{i}"
            assert os.path.exists(fn), f"file {fn} does not exist "
            logging.info(fn)
            index_shard = self._open_index_shard(fn)
            il = faiss.downcast_index(
                faiss.extract_index_ivf(index_shard)
            ).invlists
            ilv.push_back(il)
        hsil = faiss.HStackInvertedLists(ilv.size(), ilv.data())
        index_ivf.replace_invlists(hsil, False)
        self.ivls[index_shard_prefix] = hsil
        self.index[index_shard_prefix] = index
        return index

    def index_shard_stats(self):
        for i in range(self.nshards):
            fn = f"{self.index_shard_prefix}{i}"
            assert os.path.exists(fn)
            index = faiss.read_index(
                fn, faiss.IO_FLAG_MMAP | faiss.IO_FLAG_READ_ONLY
            )
            index_ivf = faiss.downcast_index(faiss.extract_index_ivf(index))
            il = index_ivf.invlists
            il.print_stats()

    def index_stats(self):
        index = self._open_sharded_index()
        index_ivf = faiss.downcast_index(faiss.extract_index_ivf(index))
        il = index_ivf.invlists
        list_sizes = [il.list_size(i) for i in range(il.nlist)]
        logging.info(np.max(list_sizes))
        logging.info(np.mean(list_sizes))
        logging.info(np.argmax(list_sizes))
        logging.info("index_stats:")
        il.print_stats()

    def consistency_check(self):
        logging.info("consistency-check")

        logging.info("index template...")

        assert os.path.exists(self.index_template_file)
        index = faiss.read_index(self.index_template_file)

        offset = 0  # 2**24
        assert self.shard_size > offset + SMALL_DATA_SAMPLE

        logging.info("index shards...")
        for i in range(self.nshards):
            r = i * self.shard_size + offset
            xb = next(self.xb_ds.iterate(r, SMALL_DATA_SAMPLE, np.float32))
            fn = f"{self.index_shard_prefix}{i}"
            assert os.path.exists(fn), f"There is no index shard file {fn}"
            index = self._open_index_shard(fn)
            index_ivf = faiss.downcast_index(faiss.extract_index_ivf(index))
            index_ivf.nprobe = 1
            _, I = index.search(xb, 100)
            for j in range(SMALL_DATA_SAMPLE):
                assert np.where(I[j] == j + r)[0].size > 0, (
                    f"I[j]: {I[j]}, j: {j}, i: {i}, shard_size:"
                    f" {self.shard_size}"
                )

        logging.info("merged index...")
        index = self._open_sharded_index()
        index_ivf = faiss.downcast_index(faiss.extract_index_ivf(index))
        index_ivf.nprobe = 1
        for i in range(self.nshards):
            r = i * self.shard_size + offset
            xb = next(self.xb_ds.iterate(r, SMALL_DATA_SAMPLE, np.float32))
            _, I = index.search(xb, 100)
            for j in range(SMALL_DATA_SAMPLE):
                assert np.where(I[j] == j + r)[0].size > 0, (
                    f"I[j]: {I[j]}, j: {j}, i: {i}, shard_size:"
                    f" {self.shard_size}"
                )

        logging.info("search results...")
        index_ivf.nprobe = self.nprobe
        for i in range(0, self.xq_ds.size, self.xq_bs):
            Ifn = f"{self.knn_dir}/I{i:010}_{self.index_factory_fn}_np{self.nprobe}.npy"
            assert os.path.exists(Ifn)
            assert os.path.getsize(Ifn) > 0, f"The file {Ifn} is empty."
            logging.info(Ifn)
            I = np.load(Ifn, mmap_mode="r")

            assert I.shape[1] == self.k
            assert I.shape[0] == min(self.xq_bs, self.xq_ds.size - i)
            assert np.all(I[:, 1] >= 0)

            Dfn = f"{self.knn_dir}/D_approx{i:010}_{self.index_factory_fn}_np{self.nprobe}.npy"
            assert os.path.exists(Dfn)
            assert os.path.getsize(Dfn) > 0, f"The file {Dfn} is empty."
            logging.info(Dfn)
            D = np.load(Dfn, mmap_mode="r")
            assert D.shape == I.shape

            xq = next(self.xq_ds.iterate(i, SMALL_DATA_SAMPLE, np.float32))
            D_online, I_online = index.search(xq, self.k)
            assert (
                np.where(I[:SMALL_DATA_SAMPLE] == I_online)[0].size
                / (self.k * SMALL_DATA_SAMPLE)
                > 0.95
            ), (
                "the ratio is"
                f" {np.where(I[:SMALL_DATA_SAMPLE] == I_online)[0].size / (self.k * SMALL_DATA_SAMPLE)}"
            )
            assert np.allclose(
                D[:SMALL_DATA_SAMPLE].sum(axis=1),
                D_online.sum(axis=1),
                rtol=0.01,
            ), (
                "the difference is"
                f" {D[:SMALL_DATA_SAMPLE].sum(axis=1), D_online.sum(axis=1)}"
            )

        logging.info("done")
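The shard-merging step in `_open_sharded_index` above is what lets the per-shard indexes be searched as a single IVF index. Below is a minimal standalone sketch of the same calls; the file names and the helper name are illustrative only, and the shard objects must be kept alive while the merged index is in use.

```
import faiss

def open_sharded_index(template_file, shard_prefix, nshards):
    # Trained, empty IVF index whose inverted lists will be replaced.
    index = faiss.read_index(template_file)
    index_ivf = faiss.downcast_index(faiss.extract_index_ivf(index))
    ilv = faiss.InvertedListsPtrVector()
    shards = []  # keep shard indexes alive: the stacked view does not own them
    for i in range(nshards):
        shard = faiss.read_index(
            f"{shard_prefix}{i}", faiss.IO_FLAG_MMAP | faiss.IO_FLAG_READ_ONLY
        )
        shards.append(shard)
        ilv.push_back(faiss.downcast_index(faiss.extract_index_ivf(shard)).invlists)
    hsil = faiss.HStackInvertedLists(ilv.size(), ilv.data())
    index_ivf.replace_invlists(hsil, False)  # False: index does not take ownership
    return index, shards, hsil
```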
219
packages/leann-backend-hnsw/third_party/faiss/demos/offline_ivf/run.py
vendored
Normal file
@@ -0,0 +1,219 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import argparse
from utils import (
    load_config,
    add_group_args,
)
from offline_ivf import OfflineIVF
import faiss
from typing import List, Callable, Dict
import submitit


def join_lists_in_dict(poss: Dict[str, List[str]]) -> List[str]:
    """
    Joins the prod and non-prod lists of values, checking if the prod value is already included.
    If there is no non-prod list, it returns the prod list.
    """
    if "non-prod" in poss.keys():
        all_poss = poss["non-prod"]
        if poss["prod"][-1] not in poss["non-prod"]:
            all_poss += poss["prod"]
        return all_poss
    else:
        return poss["prod"]


def main(
    args: argparse.Namespace,
    cfg: Dict[str, str],
    nprobe: int,
    index_factory_str: str,
) -> None:
    oivf = OfflineIVF(cfg, args, nprobe, index_factory_str)
    eval(f"oivf.{args.command}()")


def process_options_and_run_jobs(args: argparse.Namespace) -> None:
    """
    If "--cluster_run" is passed, it launches an array of jobs on the cluster using the submitit
    library, one for each index string. In the case of evaluate, it launches a job for each
    index string and nprobe pair. Otherwise, it launches a single job that is run locally with
    the prod values for index string and nprobe.
    """

    cfg = load_config(args.config)
    index_strings = cfg["index"]
    nprobes = cfg["nprobe"]
    if args.command == "evaluate":
        if args.cluster_run:
            all_nprobes = join_lists_in_dict(nprobes)
            all_index_strings = join_lists_in_dict(index_strings)
            for index_factory_str in all_index_strings:
                for nprobe in all_nprobes:
                    launch_job(main, args, cfg, nprobe, index_factory_str)
        else:
            launch_job(
                main, args, cfg, nprobes["prod"][-1], index_strings["prod"][-1]
            )
    else:
        if args.cluster_run:
            all_index_strings = join_lists_in_dict(index_strings)
            for index_factory_str in all_index_strings:
                launch_job(
                    main, args, cfg, nprobes["prod"][-1], index_factory_str
                )
        else:
            launch_job(
                main, args, cfg, nprobes["prod"][-1], index_strings["prod"][-1]
            )


def launch_job(
    func: Callable,
    args: argparse.Namespace,
    cfg: Dict[str, str],
    n_probe: int,
    index_str: str,
) -> None:
    """
    Launches a slurm job on the cluster using the submitit library, or runs the
    function locally when --cluster_run is not set.
    """

    if args.cluster_run:
        assert args.num_nodes >= 1
        executor = submitit.AutoExecutor(folder=args.logs_dir)

        executor.update_parameters(
            nodes=args.num_nodes,
            gpus_per_node=args.gpus_per_node,
            cpus_per_task=args.cpus_per_task,
            tasks_per_node=args.tasks_per_node,
            name=args.job_name,
            slurm_partition=args.partition,
            slurm_time=70 * 60,
        )
        if args.slurm_constraint:
            executor.update_parameters(slurm_constraint=args.slurm_constraint)

        job = executor.submit(func, args, cfg, n_probe, index_str)
        print(f"Job id: {job.job_id}")
    else:
        func(args, cfg, n_probe, index_str)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    group = parser.add_argument_group("general")

    add_group_args(group, "--command", required=True, help="command to run")
    add_group_args(
        group,
        "--config",
        required=True,
        help="config yaml with the dataset specs",
    )
    add_group_args(
        group, "--nt", type=int, default=96, help="nb search threads"
    )
    add_group_args(
        group,
        "--no_residuals",
        action="store_false",
        help="set index.by_residual to False when training the index.",
    )

    group = parser.add_argument_group("slurm_job")

    add_group_args(
        group,
        "--cluster_run",
        action="store_true",
        help="if set, runs on the cluster",
    )
    add_group_args(
        group,
        "--job_name",
        type=str,
        default="oivf",
        help="cluster job name",
    )
    add_group_args(
        group,
        "--num_nodes",
        type=int,
        default=1,
        help="num of nodes per job",
    )
    add_group_args(
        group,
        "--tasks_per_node",
        type=int,
        default=1,
        help="tasks per job",
    )

    add_group_args(
        group,
        "--gpus_per_node",
        type=int,
        default=8,
        help="number of gpus per node",
    )
    add_group_args(
        group,
        "--cpus_per_task",
        type=int,
        default=80,
        help="number of cpus per task",
    )

    add_group_args(
        group,
        "--logs_dir",
        type=str,
        default="/checkpoint/marialomeli/offline_faiss/logs",
        help="directory where the cluster job logs are written",
    )

    add_group_args(
        group,
        "--slurm_constraint",
        type=str,
        default=None,
        help="can be volta32gb for the fair cluster",
    )

    add_group_args(
        group,
        "--partition",
        type=str,
        default="learnlab",
        help="specify which partition to use if run on cluster with job arrays",
        choices=[
            "learnfair",
            "devlab",
            "scavenge",
            "learnlab",
            "nllb",
            "seamless",
            "seamless_medium",
            "learnaccel",
            "onellm_low",
            "learn",
        ],
    )

    group = parser.add_argument_group("dataset")

    add_group_args(group, "--xb", required=True, help="database vectors")
    add_group_args(group, "--xq", help="query vectors")

    args = parser.parse_args()
    print("args:", args)
    faiss.omp_set_num_threads(args.nt)
    process_options_and_run_jobs(args=args)
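For reference, this is how `join_lists_in_dict` above merges the prod and non-prod entries of the config; the values below are made up for the example.

```
# Hypothetical config values, mirroring the "index"/"nprobe" entries read above.
nprobes = {"prod": [64], "non-prod": [16, 32]}
index_strings = {"prod": ["OPQ64,IVF8192,PQ64"]}

print(join_lists_in_dict(nprobes))        # [16, 32, 64]  (prod value appended once)
print(join_lists_in_dict(index_strings))  # ['OPQ64,IVF8192,PQ64']  (no non-prod list)
```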
181
packages/leann-backend-hnsw/third_party/faiss/demos/offline_ivf/tests/testing_utils.py
vendored
Normal file
@@ -0,0 +1,181 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import argparse
import yaml
import numpy as np
from typing import Dict, List, Optional

OIVF_TEST_ARGS: List[str] = [
    "--config",
    "--xb",
    "--xq",
    "--command",
    "--cluster_run",
    "--no_residuals",
]


def get_test_parser(args) -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser()
    for arg in args:
        parser.add_argument(arg)
    return parser


class TestDataCreator:
    def __init__(
        self,
        tempdir: str,
        dimension: int,
        data_type: np.dtype,
        index_factory: Optional[List] = ["OPQ4,IVF256,PQ4"],
        training_sample: Optional[int] = 9984,
        index_shard_size: Optional[int] = 1000,
        query_batch_size: Optional[int] = 1000,
        evaluation_sample: Optional[int] = 100,
        num_files: Optional[int] = None,
        file_size: Optional[int] = None,
        file_sizes: Optional[List] = None,
        nprobe: Optional[int] = 64,
        k: Optional[int] = 10,
        metric: Optional[str] = "METRIC_L2",
        normalise: Optional[bool] = False,
        with_queries_ds: Optional[bool] = False,
        evaluate_by_margin: Optional[bool] = False,
    ) -> None:
        self.tempdir = tempdir
        self.dimension = dimension
        self.data_type = np.dtype(data_type).name
        self.index_factory = {"prod": index_factory}
        if file_size and num_files:
            self.file_sizes = [file_size for _ in range(num_files)]
        elif file_sizes:
            self.file_sizes = file_sizes
        else:
            raise ValueError("no file sizes provided")
        self.num_files = len(self.file_sizes)
        self.training_sample = training_sample
        self.index_shard_size = index_shard_size
        self.query_batch_size = query_batch_size
        self.evaluation_sample = evaluation_sample
        self.nprobe = {"prod": [nprobe]}
        self.k = k
        self.metric = metric
        self.normalise = normalise
        self.config_file = self.tempdir + "/config_test.yaml"
        self.ds_name = "my_test_data"
        self.qs_name = "my_queries_data"
        self.evaluate_by_margin = evaluate_by_margin
        self.with_queries_ds = with_queries_ds

    def create_test_data(self) -> None:
        datafiles = self._create_data_files()
        files_info = []

        for i, file in enumerate(datafiles):
            files_info.append(
                {
                    "dtype": self.data_type,
                    "format": "npy",
                    "name": file,
                    "size": self.file_sizes[i],
                }
            )

        config_for_yaml = {
            "d": self.dimension,
            "output": self.tempdir,
            "index": self.index_factory,
            "nprobe": self.nprobe,
            "k": self.k,
            "normalise": self.normalise,
            "metric": self.metric,
            "training_sample": self.training_sample,
            "evaluation_sample": self.evaluation_sample,
            "index_shard_size": self.index_shard_size,
            "query_batch_size": self.query_batch_size,
            "datasets": {
                self.ds_name: {
                    "root": self.tempdir,
                    "size": sum(self.file_sizes),
                    "files": files_info,
                }
            },
        }
        if self.evaluate_by_margin:
            config_for_yaml["evaluate_by_margin"] = self.evaluate_by_margin
        q_datafiles = self._create_data_files("my_q_data")
        q_files_info = []

        for i, file in enumerate(q_datafiles):
            q_files_info.append(
                {
                    "dtype": self.data_type,
                    "format": "npy",
                    "name": file,
                    "size": self.file_sizes[i],
                }
            )
        if self.with_queries_ds:
            config_for_yaml["datasets"][self.qs_name] = {
                "root": self.tempdir,
                "size": sum(self.file_sizes),
                "files": q_files_info,
            }

        self._create_config_yaml(config_for_yaml)

    def setup_cli(self, command="consistency_check") -> argparse.Namespace:
        parser = get_test_parser(OIVF_TEST_ARGS)

        if self.with_queries_ds:
            return parser.parse_args(
                [
                    "--xb",
                    self.ds_name,
                    "--config",
                    self.config_file,
                    "--command",
                    command,
                    "--xq",
                    self.qs_name,
                ]
            )
        return parser.parse_args(
            [
                "--xb",
                self.ds_name,
                "--config",
                self.config_file,
                "--command",
                command,
            ]
        )

    def _create_data_files(self, name_of_file="my_data") -> List[str]:
        """
        Creates a dataset "my_test_data" with num_files files, zero-padding the file index
        in each file name. If self.with_queries_ds is True, an extra dataset "my_queries_data"
        is added with the same number of files as "my_test_data". The default name for
        embedding files is "my_data" + <padding>.npy.
        """
        filenames = []
        for i, file_size in enumerate(self.file_sizes):
            # np.random.seed(i)
            db_vectors = np.random.random((file_size, self.dimension)).astype(
                self.data_type
            )
            filename = name_of_file + f"{i:02}" + ".npy"
            filenames.append(filename)
            np.save(self.tempdir + "/" + filename, db_vectors)
        return filenames

    def _create_config_yaml(self, dict_file: Dict[str, str]) -> None:
        """
        Creates a yaml file in dir (can be a temporary dir for tests).
        """
        filename = self.tempdir + "/config_test.yaml"
        with open(filename, "w") as file:
            yaml.dump(dict_file, file, default_flow_style=False)
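A minimal usage sketch of `TestDataCreator`; the temporary directory, dimension and file sizes are illustrative only.

```
import tempfile

import numpy as np

with tempfile.TemporaryDirectory() as tmp:
    creator = TestDataCreator(
        tempdir=tmp,
        dimension=8,
        data_type=np.float32,
        file_size=1000,
        num_files=2,
    )
    creator.create_test_data()         # writes my_data00.npy, my_data01.npy and config_test.yaml
    args = creator.setup_cli("train")  # argparse.Namespace pointing at the generated config
    print(args.config, args.command)
```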
95
packages/leann-backend-hnsw/third_party/faiss/demos/offline_ivf/utils.py
vendored
Normal file
@@ -0,0 +1,95 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import numpy as np
import os
from typing import Dict
import yaml
import faiss
from faiss.contrib.datasets import SyntheticDataset


def load_config(config):
    assert os.path.exists(config)
    with open(config, "r") as f:
        return yaml.safe_load(f)


def faiss_sanity_check():
    ds = SyntheticDataset(256, 0, 100, 100)
    xq = ds.get_queries()
    xb = ds.get_database()
    index_cpu = faiss.IndexFlat(ds.d)
    index_gpu = faiss.index_cpu_to_all_gpus(index_cpu)
    index_cpu.add(xb)
    index_gpu.add(xb)
    D_cpu, I_cpu = index_cpu.search(xq, 10)
    D_gpu, I_gpu = index_gpu.search(xq, 10)
    assert np.all(I_cpu == I_gpu), "faiss sanity check failed"
    assert np.all(np.isclose(D_cpu, D_gpu)), "faiss sanity check failed"


def margin(sample, idx_a, idx_b, D_a_b, D_a, D_b, k, k_extract, threshold):
    """
    two datasets: xa, xb; n = number of pairs
    idx_a - (np,) - query vector ids in xa
    idx_b - (np,) - query vector ids in xb
    D_a_b - (np,) - pairwise distances between xa[idx_a] and xb[idx_b]
    D_a - (np, k) - distances between vectors xa[idx_a] and corresponding nearest neighbours in xb
    D_b - (np, k) - distances between vectors xb[idx_b] and corresponding nearest neighbours in xa
    k - k nearest neighbours used for margin
    k_extract - number of nearest neighbours of each query in xb we consider for margin calculation and filtering
    threshold - margin threshold
    """

    n = sample
    nk = n * k_extract
    assert idx_a.shape == (n,)
    idx_a_k = idx_a.repeat(k_extract)
    assert idx_a_k.shape == (nk,)
    assert idx_b.shape == (nk,)
    assert D_a_b.shape == (nk,)
    assert D_a.shape == (n, k)
    assert D_b.shape == (nk, k)
    mean_a = np.mean(D_a, axis=1)
    assert mean_a.shape == (n,)
    mean_a_k = mean_a.repeat(k_extract)
    assert mean_a_k.shape == (nk,)
    mean_b = np.mean(D_b, axis=1)
    assert mean_b.shape == (nk,)
    margin = 2 * D_a_b / (mean_a_k + mean_b)
    above_threshold = margin > threshold
    print(np.count_nonzero(above_threshold))
    print(idx_a_k[above_threshold])
    print(idx_b[above_threshold])
    print(margin[above_threshold])
    return margin


def add_group_args(group, *args, **kwargs):
    return group.add_argument(*args, **kwargs)


def get_intersection_cardinality_frequencies(
    I: np.ndarray, I_gt: np.ndarray
) -> Dict[int, int]:
    """
    Computes the frequencies for the cardinalities of the intersection of neighbour indices.
    """
    nq = I.shape[0]
    res = []
    for ell in range(nq):
        res.append(len(np.intersect1d(I[ell, :], I_gt[ell, :])))
    values, counts = np.unique(res, return_counts=True)
    return dict(zip(values, counts))


def is_pretransform_index(index):
    if index.__class__ == faiss.IndexPreTransform:
        assert hasattr(index, "chain")
        return True
    else:
        assert not hasattr(index, "chain")
        return False
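To make the margin formula above concrete, here is a tiny worked example with made-up distances for a single pair (n = 1, k = 2, k_extract = 1): mean_a = 1.1 and mean_b = 1.0, so margin = 2 * 0.8 / (1.1 + 1.0) ≈ 0.762.

```
import numpy as np

# Hypothetical inputs for one query pair.
idx_a = np.array([0])          # query id in xa
idx_b = np.array([7])          # candidate neighbour id in xb
D_a_b = np.array([0.8])        # distance between xa[0] and xb[7]
D_a = np.array([[1.0, 1.2]])   # distances from xa[0] to its 2 NNs in xb
D_b = np.array([[0.9, 1.1]])   # distances from xb[7] to its 2 NNs in xa

m = margin(1, idx_a, idx_b, D_a_b, D_a, D_b, k=2, k_extract=1, threshold=1.0)
print(m)  # [0.76190476]
```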
13
packages/leann-backend-hnsw/third_party/faiss/demos/rocksdb_ivf/CMakeLists.txt
vendored
Normal file
@@ -0,0 +1,13 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

cmake_minimum_required(VERSION 3.17 FATAL_ERROR)
project(ROCKSDB_IVF)
set(CMAKE_BUILD_TYPE Debug)
find_package(faiss REQUIRED)
find_package(RocksDB REQUIRED)

add_executable(demo_rocksdb_ivf demo_rocksdb_ivf.cpp RocksDBInvertedLists.cpp)
target_link_libraries(demo_rocksdb_ivf faiss RocksDB::rocksdb)
23
packages/leann-backend-hnsw/third_party/faiss/demos/rocksdb_ivf/README.md
vendored
Normal file
@@ -0,0 +1,23 @@
# Storing Faiss inverted lists in RocksDB

Demo of storing the inverted lists of any IVF index in RocksDB, or any similar key-value store that supports the prefix scan operation.

# How to build

We use conda to create the build environment for simplicity. Only tested on Linux x86.

```
conda create -n rocksdb_ivf
conda activate rocksdb_ivf
conda install pytorch::faiss-cpu conda-forge::rocksdb cmake make gxx_linux-64 sysroot_linux-64
cd ~/faiss/demos/rocksdb_ivf
cmake -B build .
make -C build -j$(nproc)
```

# Run the example

```
cd ~/faiss/demos/rocksdb_ivf/build
./demo_rocksdb_ivf test_db
```
114
packages/leann-backend-hnsw/third_party/faiss/demos/rocksdb_ivf/RocksDBInvertedLists.cpp
vendored
Normal file
@@ -0,0 +1,114 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include "RocksDBInvertedLists.h"

#include <faiss/impl/FaissAssert.h>

using namespace faiss;

namespace faiss_rocksdb {

RocksDBInvertedListsIterator::RocksDBInvertedListsIterator(
        rocksdb::DB* db,
        size_t list_no,
        size_t code_size)
        : InvertedListsIterator(),
          it(db->NewIterator(rocksdb::ReadOptions())),
          list_no(list_no),
          code_size(code_size),
          codes(code_size) {
    it->Seek(rocksdb::Slice(
            reinterpret_cast<const char*>(&list_no), sizeof(size_t)));
}

bool RocksDBInvertedListsIterator::is_available() const {
    return it->Valid() &&
            it->key().starts_with(rocksdb::Slice(
                    reinterpret_cast<const char*>(&list_no), sizeof(size_t)));
}

void RocksDBInvertedListsIterator::next() {
    it->Next();
}

std::pair<idx_t, const uint8_t*> RocksDBInvertedListsIterator::
        get_id_and_codes() {
    idx_t id =
            *reinterpret_cast<const idx_t*>(&it->key().data()[sizeof(size_t)]);
    assert(code_size == it->value().size());
    return {id, reinterpret_cast<const uint8_t*>(it->value().data())};
}

RocksDBInvertedLists::RocksDBInvertedLists(
        const char* db_directory,
        size_t nlist,
        size_t code_size)
        : InvertedLists(nlist, code_size) {
    use_iterator = true;

    rocksdb::Options options;
    options.create_if_missing = true;
    rocksdb::DB* db;
    rocksdb::Status status = rocksdb::DB::Open(options, db_directory, &db);
    db_ = std::unique_ptr<rocksdb::DB>(db);
    assert(status.ok());
}

size_t RocksDBInvertedLists::list_size(size_t /*list_no*/) const {
    FAISS_THROW_MSG("list_size is not supported");
}

const uint8_t* RocksDBInvertedLists::get_codes(size_t /*list_no*/) const {
    FAISS_THROW_MSG("get_codes is not supported");
}

const idx_t* RocksDBInvertedLists::get_ids(size_t /*list_no*/) const {
    FAISS_THROW_MSG("get_ids is not supported");
}

size_t RocksDBInvertedLists::add_entries(
        size_t list_no,
        size_t n_entry,
        const idx_t* ids,
        const uint8_t* code) {
    rocksdb::WriteOptions wo;
    std::vector<char> key(sizeof(size_t) + sizeof(idx_t));
    memcpy(key.data(), &list_no, sizeof(size_t));
    for (size_t i = 0; i < n_entry; i++) {
        memcpy(key.data() + sizeof(size_t), ids + i, sizeof(idx_t));
        rocksdb::Status status = db_->Put(
                wo,
                rocksdb::Slice(key.data(), key.size()),
                rocksdb::Slice(
                        reinterpret_cast<const char*>(code + i * code_size),
                        code_size));
        assert(status.ok());
    }
    return 0; // ignored
}

void RocksDBInvertedLists::update_entries(
        size_t /*list_no*/,
        size_t /*offset*/,
        size_t /*n_entry*/,
        const idx_t* /*ids*/,
        const uint8_t* /*code*/) {
    FAISS_THROW_MSG("update_entries is not supported");
}

void RocksDBInvertedLists::resize(size_t /*list_no*/, size_t /*new_size*/) {
    FAISS_THROW_MSG("resize is not supported");
}

InvertedListsIterator* RocksDBInvertedLists::get_iterator(
        size_t list_no,
        void* inverted_list_context) const {
    return new RocksDBInvertedListsIterator(db_.get(), list_no, code_size);
}

} // namespace faiss_rocksdb
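A side note on the key layout implemented by `add_entries` above: each entry is stored under the raw bytes of the list number followed by the raw bytes of the vector id, so all entries of one inverted list share an 8-byte prefix and can be enumerated with a prefix scan, which is exactly what the iterator's `Seek`/`starts_with` logic relies on. An illustrative re-encoding of that key (assuming 64-bit `size_t`/`idx_t` and native little-endian byte order, as on x86):

```
import struct

def rocksdb_key(list_no: int, vec_id: int) -> bytes:
    # 8-byte list number (the scan prefix) followed by the 8-byte vector id,
    # mirroring the two memcpy calls in RocksDBInvertedLists::add_entries.
    return struct.pack("<Q", list_no) + struct.pack("<q", vec_id)

key = rocksdb_key(42, 1234)
assert key[:8] == struct.pack("<Q", 42)  # all ids of list 42 share this prefix
```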
67
packages/leann-backend-hnsw/third_party/faiss/demos/rocksdb_ivf/RocksDBInvertedLists.h
vendored
Normal file
@@ -0,0 +1,67 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

// -*- c++ -*-

#pragma once

#include <faiss/invlists/InvertedLists.h>

#include <rocksdb/db.h>

namespace faiss_rocksdb {

struct RocksDBInvertedListsIterator : faiss::InvertedListsIterator {
    RocksDBInvertedListsIterator(
            rocksdb::DB* db,
            size_t list_no,
            size_t code_size);
    virtual bool is_available() const override;
    virtual void next() override;
    virtual std::pair<faiss::idx_t, const uint8_t*> get_id_and_codes() override;

   private:
    std::unique_ptr<rocksdb::Iterator> it;
    size_t list_no;
    size_t code_size;
    std::vector<uint8_t> codes; // buffer for returning codes in next()
};

struct RocksDBInvertedLists : faiss::InvertedLists {
    RocksDBInvertedLists(
            const char* db_directory,
            size_t nlist,
            size_t code_size);

    size_t list_size(size_t list_no) const override;
    const uint8_t* get_codes(size_t list_no) const override;
    const faiss::idx_t* get_ids(size_t list_no) const override;

    size_t add_entries(
            size_t list_no,
            size_t n_entry,
            const faiss::idx_t* ids,
            const uint8_t* code) override;

    void update_entries(
            size_t list_no,
            size_t offset,
            size_t n_entry,
            const faiss::idx_t* ids,
            const uint8_t* code) override;

    void resize(size_t list_no, size_t new_size) override;

    faiss::InvertedListsIterator* get_iterator(
            size_t list_no,
            void* inverted_list_context) const override;

   private:
    std::unique_ptr<rocksdb::DB> db_;
};

} // namespace faiss_rocksdb
88
packages/leann-backend-hnsw/third_party/faiss/demos/rocksdb_ivf/demo_rocksdb_ivf.cpp
vendored
Normal file
@@ -0,0 +1,88 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

// -*- c++ -*-

#include <exception>
#include <iostream>
#include <memory>
#include <numeric>

#include "RocksDBInvertedLists.h"

#include <faiss/IndexFlat.h>
#include <faiss/IndexIVFFlat.h>
#include <faiss/impl/AuxIndexStructures.h>
#include <faiss/impl/FaissException.h>
#include <faiss/utils/random.h>

using namespace faiss;

int main(int argc, char* argv[]) {
    try {
        if (argc != 2) {
            std::cerr << "missing db directory argument" << std::endl;
            return -1;
        }
        size_t d = 128;
        size_t nlist = 100;
        IndexFlatL2 quantizer(d);
        IndexIVFFlat index(&quantizer, d, nlist);
        faiss_rocksdb::RocksDBInvertedLists ril(
                argv[1], nlist, index.code_size);
        index.replace_invlists(&ril, false);

        idx_t nb = 10000;
        std::vector<float> xb(d * nb);
        float_rand(xb.data(), d * nb, 12345);
        std::vector<idx_t> xids(nb);
        std::iota(xids.begin(), xids.end(), 0);

        index.train(nb, xb.data());
        index.add_with_ids(nb, xb.data(), xids.data());

        idx_t nq = 20; // nb;
        index.nprobe = 2;

        std::cout << "search" << std::endl;
        idx_t k = 5;
        std::vector<float> distances(nq * k);
        std::vector<idx_t> labels(nq * k, -1);
        index.search(
                nq, xb.data(), k, distances.data(), labels.data(), nullptr);

        for (idx_t iq = 0; iq < nq; iq++) {
            std::cout << iq << ": ";
            for (auto j = 0; j < k; j++) {
                std::cout << labels[iq * k + j] << " " << distances[iq * k + j]
                          << " | ";
            }
            std::cout << std::endl;
        }

        std::cout << std::endl << "range search" << std::endl;
        float range = 15.0f;
        RangeSearchResult result(nq);
        index.range_search(nq, xb.data(), range, &result);

        for (idx_t iq = 0; iq < nq; iq++) {
            std::cout << iq << ": ";
            for (auto j = result.lims[iq]; j < result.lims[iq + 1]; j++) {
                std::cout << result.labels[j] << " " << result.distances[j]
                          << " | ";
            }
            std::cout << std::endl;
        }

    } catch (FaissException& e) {
        std::cerr << e.what() << '\n';
    } catch (std::exception& e) {
        std::cerr << e.what() << '\n';
    } catch (...) {
        std::cerr << "Unrecognized exception!\n";
    }
    return 0;
}
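The loops over `result.lims` in the demo follow the usual Faiss range-search layout: `lims` is a prefix-sum array and the results of query `iq` occupy positions `[lims[iq], lims[iq + 1])` in `labels` and `distances`. A small Python counterpart on synthetic data (sizes and radius are illustrative only):

```
import faiss
import numpy as np

d = 16
xb = np.random.rand(1000, d).astype("float32")
index = faiss.IndexFlatL2(d)
index.add(xb)

lims, D, I = index.range_search(xb[:5], 1.0)
for i in range(5):
    print(i, I[lims[i]:lims[i + 1]], D[lims[i]:lims[i + 1]])
```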