Initial commit

yichuan520030910320
2025-06-30 09:05:05 +00:00
commit 46f6cc100b
1231 changed files with 278432 additions and 0 deletions

View File

@@ -0,0 +1,25 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
add_executable(demo_imi_flat EXCLUDE_FROM_ALL demo_imi_flat.cpp)
target_link_libraries(demo_imi_flat PRIVATE faiss)
add_executable(demo_imi_pq EXCLUDE_FROM_ALL demo_imi_pq.cpp)
target_link_libraries(demo_imi_pq PRIVATE faiss)
add_executable(demo_ivfpq_indexing EXCLUDE_FROM_ALL demo_ivfpq_indexing.cpp)
target_link_libraries(demo_ivfpq_indexing PRIVATE faiss)
add_executable(demo_nndescent EXCLUDE_FROM_ALL demo_nndescent.cpp)
target_link_libraries(demo_nndescent PRIVATE faiss)
add_executable(demo_sift1M EXCLUDE_FROM_ALL demo_sift1M.cpp)
target_link_libraries(demo_sift1M PRIVATE faiss)
add_executable(demo_weighted_kmeans EXCLUDE_FROM_ALL demo_weighted_kmeans.cpp)
target_link_libraries(demo_weighted_kmeans PRIVATE faiss)
add_executable(demo_residual_quantizer EXCLUDE_FROM_ALL demo_residual_quantizer.cpp)
target_link_libraries(demo_residual_quantizer PRIVATE faiss)
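# Note: the demo targets are marked EXCLUDE_FROM_ALL, so they are not built by
# default; build one explicitly, e.g. with `cmake --build . --target demo_sift1M`.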

View File

@@ -0,0 +1,28 @@
Demos for a few Faiss functionalities
=====================================
demo_auto_tune.py
-----------------
Demonstrates the auto-tuning functionality of Faiss
demo_ondisk_ivf.py
------------------
Shows how to construct a Faiss index that stores the inverted file
data on disk, e.g. when it does not fit in RAM. The script works on a
small dataset (sift1M) for demonstration and proceeds in stages:
0: train on the dataset
1-4: build 4 indexes, each containing 1/4 of the dataset. This can be
done in parallel on several machines
5: merge the 4 indexes into one that is written directly to disk
(it need not fit in RAM)
6: load and test the index
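Since each stage is selected with a single command-line argument, a minimal sequential driver could look like the sketch below (illustrative only, not part of the demo; it assumes demo_ondisk_ivf.py and the sift1M/ data are in the working directory):

```python
import subprocess

# run the stages one after the other; stages 1-4 could instead be
# launched in parallel on several machines
for stage in range(7):
    subprocess.run(["python", "demo_ondisk_ivf.py", str(stage)], check=True)
```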

View File

@@ -0,0 +1,169 @@
#!/usr/bin/env python2
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from __future__ import print_function
import os
import time
import numpy as np
try:
import matplotlib
matplotlib.use('Agg')
from matplotlib import pyplot
graphical_output = True
except ImportError:
graphical_output = False
import faiss
#################################################################
# Small I/O functions
#################################################################
def ivecs_read(fname):
a = np.fromfile(fname, dtype="int32")
d = a[0]
return a.reshape(-1, d + 1)[:, 1:].copy()
def fvecs_read(fname):
return ivecs_read(fname).view('float32')
def plot_OperatingPoints(ops, nq, **kwargs):
ops = ops.optimal_pts
n = ops.size() * 2 - 1
pyplot.plot([ops.at( i // 2).perf for i in range(n)],
[ops.at((i + 1) // 2).t / nq * 1000 for i in range(n)],
**kwargs)
#################################################################
# prepare common data for all indexes
#################################################################
t0 = time.time()
print("load data")
xt = fvecs_read("sift1M/sift_learn.fvecs")
xb = fvecs_read("sift1M/sift_base.fvecs")
xq = fvecs_read("sift1M/sift_query.fvecs")
d = xt.shape[1]
print("load GT")
gt = ivecs_read("sift1M/sift_groundtruth.ivecs")
gt = gt.astype('int64')
k = gt.shape[1]
print("prepare criterion")
# criterion = 1-recall at 1
crit = faiss.OneRecallAtRCriterion(xq.shape[0], 1)
crit.set_groundtruth(None, gt)
crit.nnn = k
# indexes that are useful when there is no limitation on memory usage
unlimited_mem_keys = [
"IMI2x10,Flat", "IMI2x11,Flat",
"IVF4096,Flat", "IVF16384,Flat",
"PCA64,IMI2x10,Flat"]
# memory limited to 16 bytes / vector
keys_mem_16 = [
'IMI2x10,PQ16', 'IVF4096,PQ16',
'IMI2x10,PQ8+8', 'OPQ16_64,IMI2x10,PQ16'
]
# limited to 32 bytes / vector
keys_mem_32 = [
'IMI2x10,PQ32', 'IVF4096,PQ32', 'IVF16384,PQ32',
'IMI2x10,PQ16+16',
'OPQ32,IVF4096,PQ32', 'IVF4096,PQ16+16', 'OPQ16,IMI2x10,PQ16+16'
]
# indexes that can run on the GPU
keys_gpu = [
"PCA64,IVF4096,Flat",
"PCA64,Flat", "Flat", "IVF4096,Flat", "IVF16384,Flat",
"IVF4096,PQ32"]
keys_to_test = unlimited_mem_keys
use_gpu = False
if use_gpu:
# if this fails, it means that the GPU version was not compiled
assert faiss.StandardGpuResources, \
"Faiss was not compiled with GPU support, or loading _swigfaiss_gpu.so failed"
res = faiss.StandardGpuResources()
dev_no = 0
# remember results from other index types
op_per_key = []
# keep track of optimal operating points seen so far
op = faiss.OperatingPoints()
for index_key in keys_to_test:
print("============ key", index_key)
# make the index described by the key
index = faiss.index_factory(d, index_key)
if use_gpu:
# transfer to GPU (may be partial)
index = faiss.index_cpu_to_gpu(res, dev_no, index)
params = faiss.GpuParameterSpace()
else:
params = faiss.ParameterSpace()
params.initialize(index)
print("[%.3f s] train & add" % (time.time() - t0))
index.train(xt)
index.add(xb)
print("[%.3f s] explore op points" % (time.time() - t0))
# find operating points for this index
opi = params.explore(index, xq, crit)
print("[%.3f s] result operating points:" % (time.time() - t0))
opi.display()
# update best operating points so far
op.merge_with(opi, index_key + " ")
op_per_key.append((index_key, opi))
if graphical_output:
# graphical output (to tmp/ subdirectory)
fig = pyplot.figure(figsize=(12, 9))
pyplot.xlabel("1-recall at 1")
pyplot.ylabel("search time (ms/query, %d threads)" % faiss.omp_get_max_threads())
pyplot.gca().set_yscale('log')
pyplot.grid()
for i2, opi2 in op_per_key:
plot_OperatingPoints(opi2, crit.nq, label = i2, marker = 'o')
# plot_OperatingPoints(op, crit.nq, label = 'best', marker = 'o', color = 'r')
pyplot.legend(loc=2)
fig.savefig('tmp/demo_auto_tune.png')
print("[%.3f s] final result:" % (time.time() - t0))
op.display()

View File

@@ -0,0 +1,91 @@
#!/usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import sys
import numpy as np
import faiss
from faiss.contrib.client_server import run_index_server, ClientIndex
#################################################################
# Small I/O functions
#################################################################
def ivecs_read(fname):
a = np.fromfile(fname, dtype='int32')
d = a[0]
return a.reshape(-1, d + 1)[:, 1:].copy()
def fvecs_read(fname):
return ivecs_read(fname).view('float32')
#################################################################
# Main program
#################################################################
stage = int(sys.argv[1])
tmpdir = '/tmp/'
if stage == 0:
# train the index
xt = fvecs_read("sift1M/sift_learn.fvecs")
index = faiss.index_factory(xt.shape[1], "IVF4096,Flat")
print("training index")
index.train(xt)
print("write " + tmpdir + "trained.index")
faiss.write_index(index, tmpdir + "trained.index")
if 1 <= stage <= 4:
# add 1/4 of the database to 4 independent indexes
bno = stage - 1
xb = fvecs_read("sift1M/sift_base.fvecs")
i0, i1 = int(bno * xb.shape[0] / 4), int((bno + 1) * xb.shape[0] / 4)
index = faiss.read_index(tmpdir + "trained.index")
print("adding vectors %d:%d" % (i0, i1))
index.add_with_ids(xb[i0:i1], np.arange(i0, i1))
print("write " + tmpdir + "block_%d.index" % bno)
faiss.write_index(index, tmpdir + "block_%d.index" % bno)
machine_ports = [
('localhost', 12010),
('localhost', 12011),
('localhost', 12012),
('localhost', 12013),
]
v6 = False
if 5 <= stage <= 8:
# load an index slice and launch index
bno = stage - 5
fname = tmpdir + "block_%d.index" % bno
print("read " + fname)
index = faiss.read_index(fname)
port = machine_ports[bno][1]
run_index_server(index, port, v6=v6)
if stage == 9:
client_index = ClientIndex(machine_ports)
print('index size:', client_index.ntotal)
client_index.set_nprobe(16)
# load query vectors and ground-truth
xq = fvecs_read("sift1M/sift_query.fvecs")
gt = ivecs_read("sift1M/sift_groundtruth.ivecs")
D, I = client_index.search(xq, 5)
recall_at_1 = (I[:, :1] == gt[:, :1]).sum() / float(xq.shape[0])
print("recall@1: %.3f" % recall_at_1)

View File

@@ -0,0 +1,173 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import numpy as np
import torch
import torch.distributed
import faiss
import faiss.contrib.torch_utils
from faiss.contrib.torch import clustering
from faiss.contrib import datasets
class DatasetAssignDistributedGPU(clustering.DatasetAssign):
"""
There is one instance per worker, each worker has a dataset shard.
The non-master workers do not run through the k-means function, so some
code has to be run on them to keep the workers in sync.
"""
def __init__(self, res, x, rank, nproc):
clustering.DatasetAssign.__init__(self, x)
self.res = res
self.rank = rank
self.nproc = nproc
self.device = x.device
n = len(x)
sizes = torch.zeros(nproc, device=self.device, dtype=torch.int64)
sizes[rank] = n
torch.distributed.all_gather(
[sizes[i:i + 1] for i in range(nproc)], sizes[rank:rank + 1])
self.sizes = sizes.cpu().numpy()
# begin & end of each shard
self.cs = np.zeros(nproc + 1, dtype='int64')
self.cs[1:] = np.cumsum(self.sizes)
def count(self):
return int(self.sizes.sum())
def int_to_slaves(self, i):
" broadcast an int to all workers "
rank = self.rank
tab = torch.zeros(1, device=self.device, dtype=torch.int64)
if rank == 0:
tab[0] = i
else:
assert i is None
torch.distributed.broadcast(tab, 0)
return tab.item()
def get_subset(self, indices):
rank = self.rank
assert rank == 0 or indices is None
len_indices = self.int_to_slaves(len(indices) if rank == 0 else None)
if rank == 0:
indices = torch.from_numpy(indices).to(self.device)
else:
indices = torch.zeros(
len_indices, dtype=torch.int64, device=self.device)
torch.distributed.broadcast(indices, 0)
# select subset of indices
i0, i1 = self.cs[rank], self.cs[rank + 1]
mask = torch.logical_and(indices < i1, indices >= i0)
output = torch.zeros(
len_indices, self.x.shape[1],
dtype=self.x.dtype, device=self.device)
output[mask] = self.x[indices[mask] - i0]
torch.distributed.reduce(output, 0) # sum
if rank == 0:
return output
else:
return None
def perform_search(self, centroids):
assert False, "should not be called"
def assign_to(self, centroids, weights=None):
assert weights is None
rank, nproc = self.rank, self.nproc
assert rank == 0 or centroids is None
nc = self.int_to_slaves(len(centroids) if rank == 0 else None)
if rank != 0:
centroids = torch.zeros(
nc, self.x.shape[1], dtype=self.x.dtype, device=self.device)
torch.distributed.broadcast(centroids, 0)
# perform search
D, I = faiss.knn_gpu(
self.res, self.x, centroids, 1, device=self.device.index)
I = I.ravel()
D = D.ravel()
sum_per_centroid = torch.zeros_like(centroids)
if weights is None:
sum_per_centroid.index_add_(0, I, self.x)
else:
sum_per_centroid.index_add_(0, I, self.x * weights[:, None])
torch.distributed.reduce(sum_per_centroid, 0)
if rank == 0:
# gather does not support tensors of different sizes
# should be implemented with point-to-point communication
assert np.all(self.sizes == self.sizes[0])
device = self.device
all_I = torch.zeros(self.count(), dtype=I.dtype, device=device)
all_D = torch.zeros(self.count(), dtype=D.dtype, device=device)
torch.distributed.gather(
I, [all_I[self.cs[r]:self.cs[r + 1]] for r in range(nproc)],
dst=0,
)
torch.distributed.gather(
D, [all_D[self.cs[r]:self.cs[r + 1]] for r in range(nproc)],
dst=0,
)
return all_I.cpu().numpy(), all_D, sum_per_centroid
else:
torch.distributed.gather(I, None, dst=0)
torch.distributed.gather(D, None, dst=0)
return None
if __name__ == "__main__":
torch.distributed.init_process_group(
backend="nccl",
)
rank = torch.distributed.get_rank()
nproc = torch.distributed.get_world_size()
# current version does only support shards of the same size
ds = datasets.SyntheticDataset(32, 10000, 0, 0, seed=1234 + rank)
x = ds.get_train()
device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device)
x = torch.from_numpy(x).to(device)
res = faiss.StandardGpuResources()
da = DatasetAssignDistributedGPU(res, x, rank, nproc)
k = 1000
niter = 25
if rank == 0:
print(f"sizes = {da.sizes}")
centroids, iteration_stats = clustering.kmeans(
k, da, niter=niter, return_stats=True)
print("clusters:", centroids.cpu().numpy())
else:
# make sure the iterations are aligned with master
da.get_subset(None)
for _ in range(niter):
da.assign_to(None)
torch.distributed.barrier()
print("Done")

View File

@@ -0,0 +1,155 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <random>
#include <sys/time.h>
#include <faiss/IndexFlat.h>
#include <faiss/IndexIVFFlat.h>
#include <faiss/IndexPQ.h>
double elapsed() {
struct timeval tv;
gettimeofday(&tv, nullptr);
return tv.tv_sec + tv.tv_usec * 1e-6;
}
int main() {
double t0 = elapsed();
// dimension of the vectors to index
int d = 128;
// size of the database we plan to index
size_t nb = 1000 * 1000;
// make a set of nt training vectors in the unit cube
// (could be the database)
size_t nt = 100 * 1000;
//---------------------------------------------------------------
// Define the core quantizer
// We choose a multiple inverted index for faster training with less data
// and because it usually offers the best accuracy/speed trade-offs
//
// We assume here that the lifespan of this coarse quantizer will cover the
// lifespan of the inverted-file index IndexIVFFlat below.
// With dynamic allocation, one may give the responsibility to free the
// quantizer to the inverted-file index (with attribute do_delete_quantizer)
//
// Note: a regular clustering algorithm would be defined as:
// faiss::IndexFlatL2 coarse_quantizer (d);
//
// Use nhash=2 subquantizers to define the product coarse quantizer.
// Number of bits: we will have 2^nbits_subq centroids per subquantizer,
// meaning (2^nbits_subq)^nhash distinct inverted lists.
size_t nhash = 2;
size_t nbits_subq = int(log2(nb + 1) / 2); // good choice in general
size_t ncentroids = 1 << (nhash * nbits_subq); // total # of centroids
faiss::MultiIndexQuantizer coarse_quantizer(d, nhash, nbits_subq);
printf("IMI (%ld,%ld): %ld virtual centroids (target: %ld base vectors)",
nhash,
nbits_subq,
ncentroids,
nb);
// the coarse quantizer should not be dealloced before the index
faiss::MetricType metric = faiss::METRIC_L2; // can be METRIC_INNER_PRODUCT
faiss::IndexIVFFlat index(&coarse_quantizer, d, ncentroids, metric);
index.quantizer_trains_alone = true;
// define the number of probes. 2048 is for high-dim, overkill in practice
// Use 4-1024 depending on the trade-off speed accuracy that you want
index.nprobe = 2048;
std::mt19937 rng;
std::uniform_real_distribution<> distrib;
{ // training
printf("[%.3f s] Generating %ld vectors in %dD for training\n",
elapsed() - t0,
nt,
d);
std::vector<float> trainvecs(nt * d);
for (size_t i = 0; i < nt * d; i++) {
trainvecs[i] = distrib(rng);
}
printf("[%.3f s] Training the index\n", elapsed() - t0);
index.verbose = true;
index.train(nt, trainvecs.data());
}
size_t nq;
std::vector<float> queries;
{ // populating the database
printf("[%.3f s] Building a dataset of %ld vectors to index\n",
elapsed() - t0,
nb);
std::vector<float> database(nb * d);
for (size_t i = 0; i < nb * d; i++) {
database[i] = distrib(rng);
}
printf("[%.3f s] Adding the vectors to the index\n", elapsed() - t0);
index.add(nb, database.data());
// remember a few elements from the database as queries
int i0 = 1234;
int i1 = 1244;
nq = i1 - i0;
queries.resize(nq * d);
for (int i = i0; i < i1; i++) {
for (int j = 0; j < d; j++) {
queries[(i - i0) * d + j] = database[i * d + j];
}
}
}
{ // searching the database
int k = 5;
printf("[%.3f s] Searching the %d nearest neighbors "
"of %ld vectors in the index\n",
elapsed() - t0,
k,
nq);
std::vector<faiss::idx_t> nns(k * nq);
std::vector<float> dis(k * nq);
index.search(nq, queries.data(), k, dis.data(), nns.data());
printf("[%.3f s] Query results (vector ids, then distances):\n",
elapsed() - t0);
for (int i = 0; i < nq; i++) {
printf("query %2d: ", i);
for (int j = 0; j < k; j++) {
printf("%7ld ", nns[j + i * k]);
}
printf("\n dis: ");
for (int j = 0; j < k; j++) {
printf("%7g ", dis[j + i * k]);
}
printf("\n");
}
}
return 0;
}
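For readers who prefer the Python API, a rough equivalent of the same IMI + IVFFlat pipeline is sketched below (not part of this commit; sizes are reduced compared to the C++ demo, and the `IMI2x8,Flat` factory string sets up the MultiIndexQuantizer coarse quantizer):

```python
import numpy as np
import faiss

d = 128
nb, nt = 100_000, 50_000         # smaller than the C++ demo to keep the sketch quick

rng = np.random.default_rng(123)
xt = rng.random((nt, d), dtype='float32')
xb = rng.random((nb, d), dtype='float32')

# "IMI2x8" = multi-index quantizer with nhash=2, nbits_subq=8 -> 2^16 inverted lists
index = faiss.index_factory(d, "IMI2x8,Flat")
index.nprobe = 64

index.train(xt)
index.add(xb)
D, I = index.search(xb[1234:1244], 5)    # queries taken from the database
print(I)
```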

View File

@@ -0,0 +1,207 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <random>
#include <sys/time.h>
#include <faiss/IndexFlat.h>
#include <faiss/IndexIVFPQ.h>
#include <faiss/IndexPQ.h>
#include <faiss/index_io.h>
double elapsed() {
struct timeval tv;
gettimeofday(&tv, nullptr);
return tv.tv_sec + tv.tv_usec * 1e-6;
}
int main() {
double t0 = elapsed();
// dimension of the vectors to index
int d = 64;
// size of the database we plan to index
size_t nb = 1000 * 1000;
size_t add_bs = 10000; // size of the blocks to add
// make a set of nt training vectors in the unit cube
// (could be the database)
size_t nt = 100 * 1000;
//---------------------------------------------------------------
// Define the core quantizer
// We choose a multiple inverted index for faster training with less data
// and because it usually offers the best accuracy/speed trade-offs
//
// We assume here that the lifespan of this coarse quantizer will cover the
// lifespan of the inverted-file index IndexIVFPQ below.
// With dynamic allocation, one may give the responsibility to free the
// quantizer to the inverted-file index (with attribute do_delete_quantizer)
//
// Note: a regular clustering algorithm would be defined as:
// faiss::IndexFlatL2 coarse_quantizer (d);
//
// Use nhash=2 subquantizers to define the product coarse quantizer.
// Number of bits: we will have 2^nbits_subq centroids per subquantizer,
// meaning (2^nbits_subq)^nhash distinct inverted lists.
//
// The parameter bytes_per_code is determined by the memory
// constraint; the dataset will use nb * (bytes_per_code + 8)
// bytes.
//
// The parameter nbits_subq is determined by the size of the dataset to
// index.
//
size_t nhash = 2;
size_t nbits_subq = 9;
size_t ncentroids = 1 << (nhash * nbits_subq); // total # of centroids
int bytes_per_code = 16;
faiss::MultiIndexQuantizer coarse_quantizer(d, nhash, nbits_subq);
printf("IMI (%ld,%ld): %ld virtual centroids (target: %ld base vectors)",
nhash,
nbits_subq,
ncentroids,
nb);
// the coarse quantizer should not be dealloced before the index
// bytes_per_code = nb of bytes per PQ code (d must be a multiple of this)
// 8 = nb of bits per sub-code (almost always 8)
faiss::IndexIVFPQ index(
&coarse_quantizer, d, ncentroids, bytes_per_code, 8);
index.quantizer_trains_alone = true;
// define the number of probes. 2048 is for high-dim, overkill in practice
// Use 4-1024 depending on the trade-off speed accuracy that you want
index.nprobe = 2048;
std::mt19937 rng;
std::uniform_real_distribution<> distrib;
{ // training.
// The distribution of the training vectors should be the same
// as the database vectors. It could be a sub-sample of the
// database vectors, if sampling is not biased. Here we just
// randomly generate the vectors.
printf("[%.3f s] Generating %ld vectors in %dD for training\n",
elapsed() - t0,
nt,
d);
std::vector<float> trainvecs(nt * d);
for (size_t i = 0; i < nt; i++) {
for (size_t j = 0; j < d; j++) {
trainvecs[i * d + j] = distrib(rng);
}
}
printf("[%.3f s] Training the index\n", elapsed() - t0);
index.verbose = true;
index.train(nt, trainvecs.data());
}
// the index can be re-loaded later with
// faiss::Index * idx = faiss::read_index("/tmp/trained_index.faissindex");
faiss::write_index(&index, "/tmp/trained_index.faissindex");
size_t nq;
std::vector<float> queries;
{ // populating the database
printf("[%.3f s] Building a dataset of %ld vectors to index\n",
elapsed() - t0,
nb);
std::vector<float> database(nb * d);
std::vector<faiss::idx_t> ids(nb);
for (size_t i = 0; i < nb; i++) {
for (size_t j = 0; j < d; j++) {
database[i * d + j] = distrib(rng);
}
ids[i] = 8760000000L + i;
}
printf("[%.3f s] Adding the vectors to the index\n", elapsed() - t0);
for (size_t begin = 0; begin < nb; begin += add_bs) {
size_t end = std::min(begin + add_bs, nb);
index.add_with_ids(
end - begin,
database.data() + d * begin,
ids.data() + begin);
}
// remember a few elements from the database as queries
int i0 = 1234;
int i1 = 1244;
nq = i1 - i0;
queries.resize(nq * d);
for (int i = i0; i < i1; i++) {
for (int j = 0; j < d; j++) {
queries[(i - i0) * d + j] = database[i * d + j];
}
}
}
// A few notes on the internal format of the index:
//
// - the posting lists for PQ codes are index.codes, which is a
// std::vector < std::vector<uint8_t> >
// if n is the length of posting list #i, codes[i] has length
// bytes_per_code * n
//
// - the corresponding ids are stored in index.ids
//
// - given a vector float *x, finding which k centroids are
//   closest to it (i.e. to find the nearest neighbors) can be done with
//
//   faiss::idx_t *centroid_ids = new faiss::idx_t[k];
//   float *distances = new float[k];
//   index.quantizer->search(1, x, k, distances, centroid_ids);
//
faiss::write_index(&index, "/tmp/populated_index.faissindex");
{ // searching the database
int k = 5;
printf("[%.3f s] Searching the %d nearest neighbors "
"of %ld vectors in the index\n",
elapsed() - t0,
k,
nq);
std::vector<faiss::idx_t> nns(k * nq);
std::vector<float> dis(k * nq);
index.search(nq, queries.data(), k, dis.data(), nns.data());
printf("[%.3f s] Query results (vector ids, then distances):\n",
elapsed() - t0);
for (int i = 0; i < nq; i++) {
printf("query %2d: ", i);
for (int j = 0; j < k; j++) {
printf("%7ld ", nns[j + i * k]);
}
printf("\n dis: ");
for (int j = 0; j < k; j++) {
printf("%7g ", dis[j + i * k]);
}
printf("\n");
}
}
return 0;
}

View File

@@ -0,0 +1,146 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <random>
#include <sys/time.h>
#include <faiss/IndexFlat.h>
#include <faiss/IndexIVFPQ.h>
#include <faiss/index_io.h>
double elapsed() {
struct timeval tv;
gettimeofday(&tv, nullptr);
return tv.tv_sec + tv.tv_usec * 1e-6;
}
int main() {
double t0 = elapsed();
// dimension of the vectors to index
int d = 128;
// size of the database we plan to index
size_t nb = 200 * 1000;
// make a set of nt training vectors in the unit cube
// (could be the database)
size_t nt = 100 * 1000;
// make the index object and train it
faiss::IndexFlatL2 coarse_quantizer(d);
// a reasonable number of centroids to index nb vectors
int ncentroids = int(4 * sqrt(nb));
// the coarse quantizer should not be dealloced before the index
// 4 = nb of bytes per code (d must be a multiple of this)
// 8 = nb of bits per sub-code (almost always 8)
faiss::IndexIVFPQ index(&coarse_quantizer, d, ncentroids, 4, 8);
std::mt19937 rng;
{ // training
printf("[%.3f s] Generating %ld vectors in %dD for training\n",
elapsed() - t0,
nt,
d);
std::vector<float> trainvecs(nt * d);
std::uniform_real_distribution<> distrib;
for (size_t i = 0; i < nt * d; i++) {
trainvecs[i] = distrib(rng);
}
printf("[%.3f s] Training the index\n", elapsed() - t0);
index.verbose = true;
index.train(nt, trainvecs.data());
}
{ // I/O demo
const char* outfilename = "/tmp/index_trained.faissindex";
printf("[%.3f s] storing the pre-trained index to %s\n",
elapsed() - t0,
outfilename);
write_index(&index, outfilename);
}
size_t nq;
std::vector<float> queries;
{ // populating the database
printf("[%.3f s] Building a dataset of %ld vectors to index\n",
elapsed() - t0,
nb);
std::vector<float> database(nb * d);
std::uniform_real_distribution<> distrib;
for (size_t i = 0; i < nb * d; i++) {
database[i] = distrib(rng);
}
printf("[%.3f s] Adding the vectors to the index\n", elapsed() - t0);
index.add(nb, database.data());
printf("[%.3f s] imbalance factor: %g\n",
elapsed() - t0,
index.invlists->imbalance_factor());
// remember a few elements from the database as queries
int i0 = 1234;
int i1 = 1243;
nq = i1 - i0;
queries.resize(nq * d);
for (int i = i0; i < i1; i++) {
for (int j = 0; j < d; j++) {
queries[(i - i0) * d + j] = database[i * d + j];
}
}
}
{ // searching the database
int k = 5;
printf("[%.3f s] Searching the %d nearest neighbors "
"of %ld vectors in the index\n",
elapsed() - t0,
k,
nq);
std::vector<faiss::idx_t> nns(k * nq);
std::vector<float> dis(k * nq);
index.search(nq, queries.data(), k, dis.data(), nns.data());
printf("[%.3f s] Query results (vector ids, then distances):\n",
elapsed() - t0);
for (int i = 0; i < nq; i++) {
printf("query %2d: ", i);
for (int j = 0; j < k; j++) {
printf("%7ld ", nns[j + i * k]);
}
printf("\n dis: ");
for (int j = 0; j < k; j++) {
printf("%7g ", dis[j + i * k]);
}
printf("\n");
}
printf("note that the nearest neighbor is not at "
"distance 0 due to quantization errors\n");
}
return 0;
}
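The same IVFPQ indexing flow can be written compactly with the Python bindings; the sketch below is illustrative and not part of this commit (it mirrors the parameters above: 4-byte PQ codes, 8 bits per sub-code):

```python
import numpy as np
import faiss

d = 128
nb, nt = 200_000, 100_000
nlist = int(4 * np.sqrt(nb))     # same rule of thumb as the C++ demo

rng = np.random.default_rng(1234)
xt = rng.random((nt, d), dtype='float32')
xb = rng.random((nb, d), dtype='float32')

quantizer = faiss.IndexFlatL2(d)
index = faiss.IndexIVFPQ(quantizer, d, nlist, 4, 8)  # 4-byte PQ codes, 8 bits per sub-code
index.train(xt)
faiss.write_index(index, "/tmp/index_trained_py.faissindex")

index.add(xb)
index.nprobe = 16
D, I = index.search(xb[1234:1243], 5)
print(I)
print(D)   # the nearest neighbor is not at distance 0 because of quantization errors
```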

View File

@@ -0,0 +1,88 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#include <chrono>
#include <cstdio>
#include <cstdlib>
#include <random>
#include <faiss/IndexFlat.h>
#include <faiss/IndexNNDescent.h>
using namespace std::chrono;
int main(void) {
// dimension of the vectors to index
int d = 64;
int K = 64;
// size of the database we plan to index
size_t nb = 10000;
std::mt19937 rng(12345);
// make the index object and train it
faiss::IndexNNDescentFlat index(d, K, faiss::METRIC_L2);
index.nndescent.S = 10;
index.nndescent.R = 32;
index.nndescent.L = K;
index.nndescent.iter = 10;
index.verbose = true;
// brute-force IndexFlat to generate ground-truth labels
faiss::IndexFlat bruteforce(d, faiss::METRIC_L2);
std::vector<float> database(nb * d);
for (size_t i = 0; i < nb * d; i++) {
database[i] = rng() % 1024;
}
{ // populating the database
index.add(nb, database.data());
bruteforce.add(nb, database.data());
}
size_t nq = 1000;
{ // searching the database
printf("Searching ...\n");
index.nndescent.search_L = 50;
std::vector<float> queries(nq * d);
for (size_t i = 0; i < nq * d; i++) {
queries[i] = rng() % 1024;
}
int k = 5;
std::vector<faiss::idx_t> nns(k * nq);
std::vector<faiss::idx_t> gt_nns(k * nq);
std::vector<float> dis(k * nq);
auto start = high_resolution_clock::now();
index.search(nq, queries.data(), k, dis.data(), nns.data());
auto end = high_resolution_clock::now();
// find exact kNNs by brute force search
bruteforce.search(nq, queries.data(), k, dis.data(), gt_nns.data());
int recalls = 0;
for (size_t i = 0; i < nq; ++i) {
for (int n = 0; n < k; n++) {
for (int m = 0; m < k; m++) {
if (nns[i * k + n] == gt_nns[i * k + m]) {
recalls += 1;
}
}
}
}
float recall = 1.0f * recalls / (k * nq);
auto t = duration_cast<microseconds>(end - start).count();
int qps = nq * 1.0f * 1000 * 1000 / t;
printf("Recall@%d: %f, QPS: %d\n", k, recall, qps);
}
}
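A Python version of the same experiment could look like the sketch below (illustrative only, not part of this commit; it assumes the NNDescent parameters are reachable through `index.nndescent` as in the C++ API):

```python
import numpy as np
import faiss

d, K = 64, 64
nb, nq, k = 10_000, 1_000, 5

rng = np.random.default_rng(12345)
xb = rng.integers(0, 1024, size=(nb, d)).astype('float32')
xq = rng.integers(0, 1024, size=(nq, d)).astype('float32')

index = faiss.IndexNNDescentFlat(d, K, faiss.METRIC_L2)
index.nndescent.S = 10
index.nndescent.R = 32
index.nndescent.L = K
index.nndescent.iter = 10
index.add(xb)

index.nndescent.search_L = 50
_, I = index.search(xq, k)

# exact neighbors with a brute-force index, for the recall computation
bf = faiss.IndexFlatL2(d)
bf.add(xb)
_, gt = bf.search(xq, k)

recall = np.mean([len(set(I[i]) & set(gt[i])) / k for i in range(nq)])
print(f"recall@{k}: {recall:.3f}")
```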

View File

@@ -0,0 +1,86 @@
#!/usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import sys
import numpy as np
import faiss
from faiss.contrib.ondisk import merge_ondisk
#################################################################
# Small I/O functions
#################################################################
def ivecs_read(fname):
a = np.fromfile(fname, dtype='int32')
d = a[0]
return a.reshape(-1, d + 1)[:, 1:].copy()
def fvecs_read(fname):
return ivecs_read(fname).view('float32')
#################################################################
# Main program
#################################################################
stage = int(sys.argv[1])
tmpdir = '/tmp/'
if stage == 0:
# train the index
xt = fvecs_read("sift1M/sift_learn.fvecs")
index = faiss.index_factory(xt.shape[1], "IVF4096,Flat")
print("training index")
index.train(xt)
print("write " + tmpdir + "trained.index")
faiss.write_index(index, tmpdir + "trained.index")
if 1 <= stage <= 4:
# add 1/4 of the database to 4 independent indexes
bno = stage - 1
xb = fvecs_read("sift1M/sift_base.fvecs")
i0, i1 = int(bno * xb.shape[0] / 4), int((bno + 1) * xb.shape[0] / 4)
index = faiss.read_index(tmpdir + "trained.index")
print("adding vectors %d:%d" % (i0, i1))
index.add_with_ids(xb[i0:i1], np.arange(i0, i1))
print("write " + tmpdir + "block_%d.index" % bno)
faiss.write_index(index, tmpdir + "block_%d.index" % bno)
if stage == 5:
print('loading trained index')
# construct the output index
index = faiss.read_index(tmpdir + "trained.index")
block_fnames = [
tmpdir + "block_%d.index" % bno
for bno in range(4)
]
merge_ondisk(index, block_fnames, tmpdir + "merged_index.ivfdata")
print("write " + tmpdir + "populated.index")
faiss.write_index(index, tmpdir + "populated.index")
if stage == 6:
# perform a search from disk
print("read " + tmpdir + "populated.index")
index = faiss.read_index(tmpdir + "populated.index")
index.nprobe = 16
# load query vectors and ground-truth
xq = fvecs_read("sift1M/sift_query.fvecs")
gt = ivecs_read("sift1M/sift_groundtruth.ivecs")
D, I = index.search(xq, 5)
recall_at_1 = (I[:, :1] == gt[:, :1]).sum() / float(xq.shape[0])
print("recall@1: %.3f" % recall_at_1)

View File

@@ -0,0 +1,77 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
"""
This demonstrates how to reproduce the QINCo paper results using the Faiss
QINCo implementation. The code loads the reference model because training
is not implemented in Faiss.
Prepare the data with
cd /tmp
# get the reference qinco code
git clone https://github.com/facebookresearch/Qinco.git
# get the data
wget https://dl.fbaipublicfiles.com/QINCo/datasets/bigann/bigann1M.bvecs
# get the model
wget https://dl.fbaipublicfiles.com/QINCo/models/bigann_8x8_L2.pt
"""
import numpy as np
from faiss.contrib.vecs_io import bvecs_mmap
import sys
import time
import torch
import faiss
# make sure pickle deserialization will work
sys.path.append("/tmp/Qinco")
import model_qinco
with torch.no_grad():
qinco = torch.load("/tmp/bigann_8x8_L2.pt", weights_only=False)
qinco.eval()
# print(qinco)
if True:
torch.set_num_threads(1)
faiss.omp_set_num_threads(1)
x_base = bvecs_mmap("/tmp/bigann1M.bvecs")[:1000].astype('float32')
x_scaled = torch.from_numpy(x_base) / qinco.db_scale
t0 = time.time()
codes, _ = qinco.encode(x_scaled)
x_decoded_scaled = qinco.decode(codes)
print(f"Pytorch encode {time.time() - t0:.3f} s")
# multi-thread: 1.13s, single-thread: 7.744
x_decoded = x_decoded_scaled.numpy() * qinco.db_scale
err = ((x_decoded - x_base) ** 2).sum(1).mean()
print("MSE=", err) # = 14211.956, near the L=2 result in Fig 4 of the paper
qinco2 = faiss.QINCo(qinco)
t0 = time.time()
codes2 = qinco2.encode(faiss.Tensor2D(x_scaled))
x_decoded2 = qinco2.decode(codes2).numpy() * qinco.db_scale
print(f"Faiss encode {time.time() - t0:.3f} s")
# multi-thread: 3.2s, single thread: 7.019
# these tests don't work because there are outlier encodings
# np.testing.assert_array_equal(codes.numpy(), codes2.numpy())
# np.testing.assert_allclose(x_decoded, x_decoded2)
ndiff = (codes.numpy() != codes2.numpy()).sum() / codes.numel()
assert ndiff < 0.01
ndiff = (((x_decoded - x_decoded2) ** 2).sum(1) > 1e-5).sum()
assert ndiff / len(x_base) < 0.01
err = ((x_decoded2 - x_base) ** 2).sum(1).mean()
print("MSE=", err) # = 14213.551

View File

@@ -0,0 +1,297 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#include <climits>
#include <cstdio>
#include <memory>
#include <faiss/IVFlib.h>
#include <faiss/IndexAdditiveQuantizer.h>
#include <faiss/IndexIVFAdditiveQuantizer.h>
#include <faiss/MetricType.h>
#include <faiss/utils/distances.h>
#include <faiss/utils/hamming.h>
#include <faiss/utils/random.h>
#include <faiss/utils/utils.h>
/* This demo file shows how to:
* - use a DistanceComputer to compute distances with encoded vectors
* - in the context of an IVF, how to split an additive quantizer into an
* AdditiveCoarseQuantizer and a ResidualQuantizer, in two different ways, with
* and without storing the prefix.
*/
int main() {
/******************************************
* Generate a test dataset
******************************************/
using idx_t = faiss::idx_t;
size_t d = 128;
size_t nt = 10000;
size_t nb = 10000;
size_t nq = 100;
double t0 = faiss::getmillisecs();
auto tic = [t0]() {
printf("[%.3f s] ", (faiss::getmillisecs() - t0) / 1000);
};
tic();
printf("samping dataset of %zd dim vectors, Q %zd B %zd T %zd\n",
d,
nq,
nb,
nt);
std::vector<float> buf(d * (nq + nt + nb));
faiss::rand_smooth_vectors(nq + nt + nb, d, buf.data(), 1234);
const float* xt = buf.data();
const float* xb = buf.data() + nt * d;
const float* xq = buf.data() + (nt + nb) * d;
idx_t k = 10;
std::vector<idx_t> gt(k * nq);
std::vector<float> unused(k * nq);
tic();
printf("compute ground truth, k=%zd\n", k);
faiss::knn_L2sqr(xq, xb, d, nq, nb, k, unused.data(), gt.data());
// a function to compute the accuracy
auto accuracy = [&](const idx_t* I) {
idx_t accu = 0;
for (idx_t q = 0; q < nq; q++) {
accu += faiss::ranklist_intersection_size(
k, gt.data() + q * k, k, I + q * k);
}
return double(accu) / (k * nq);
};
/******************************************
* Prepare the residual quantizer
******************************************/
faiss::ResidualQuantizer rq(
d, 7, 6, faiss::AdditiveQuantizer::ST_norm_qint8);
// do cheap and inaccurate training
rq.cp.niter = 5;
rq.max_beam_size = 5;
rq.train_type = 0;
tic();
printf("training the residual quantizer beam_size=%d\n", rq.max_beam_size);
rq.train(nt, xt);
tic();
printf("encoding the database, code_size=%zd\n", rq.code_size);
size_t code_size = rq.code_size;
std::vector<uint8_t> raw_codes(nb * code_size);
rq.compute_codes(xb, raw_codes.data(), nb);
/****************************************************************
* Make an index that uses that residual quantizer
* Verify that a distance computer gives the same distances
****************************************************************/
{
faiss::IndexResidualQuantizer index(
rq.d, rq.nbits, faiss::METRIC_L2, rq.search_type);
// override trained index
index.rq = rq;
index.is_trained = true;
// override vectors
index.codes = faiss::MaybeOwnedVector<uint8_t>(raw_codes);
index.ntotal = nb;
tic();
printf("IndexResidualQuantizer ready, searching\n");
std::vector<float> D(k * nq);
std::vector<idx_t> I(k * nq);
index.search(nq, xq, k, D.data(), I.data());
tic();
printf("Accuracy (intersection @ %zd): %.3f\n", k, accuracy(I.data()));
std::unique_ptr<faiss::FlatCodesDistanceComputer> dc(
index.get_FlatCodesDistanceComputer());
float max_diff12 = 0, max_diff13 = 0;
for (idx_t q = 0; q < nq; q++) {
const float* query = xq + q * d;
dc->set_query(query);
for (int i = 0; i < k; i++) {
// 3 ways of computing the same distance
// distance returned by the index
float dis1 = D[q * k + i];
// distance returned by the DistanceComputer that accesses the
// index
idx_t db_index = I[q * k + i];
float dis2 = (*dc)(db_index);
// distance computer from a code that does not belong to the
// index
const uint8_t* code = raw_codes.data() + code_size * db_index;
float dis3 = dc->distance_to_code(code);
max_diff12 = std::max(std::abs(dis1 - dis2), max_diff12);
max_diff13 = std::max(std::abs(dis1 - dis3), max_diff13);
}
}
tic();
printf("Max DistanceComputer discrepancy 1-2: %g 1-3: %g\n",
max_diff12,
max_diff13);
}
/****************************************************************
* Make an IVF index that uses the first 2 levels as a coarse quantizer
* The IVF codes contain the full code (i.e. redundant with the coarse
* quantizer code)
****************************************************************/
{
// build a coarse quantizer from the 2 first levels of the RQ
std::vector<size_t> nbits(2);
std::copy(rq.nbits.begin(), rq.nbits.begin() + 2, nbits.begin());
faiss::ResidualCoarseQuantizer rcq(rq.d, nbits);
// set the coarse quantizer from the 2 first quantizers
rcq.rq.initialize_from(rq);
rcq.is_trained = true;
rcq.ntotal = (idx_t)1 << rcq.rq.tot_bits;
// settings for exhaustive search in RCQ
rcq.centroid_norms.resize(rcq.ntotal);
rcq.aq->compute_centroid_norms(rcq.centroid_norms.data());
rcq.beam_factor = -1.0; // use exact search
size_t nlist = rcq.ntotal;
tic();
printf("RCQ nlist = %zd tot_bits=%zd\n", nlist, rcq.rq.tot_bits);
// build a IVFResidualQuantizer from that
faiss::IndexIVFResidualQuantizer index(
&rcq, rcq.d, nlist, rq.nbits, faiss::METRIC_L2, rq.search_type);
index.by_residual = false;
index.rq = rq;
index.is_trained = true;
// there are 3 ways of filling up the index...
for (std::string filled_with : {"add", "manual", "derived"}) {
tic();
printf("filling up the index with %s, code_size=%zd\n",
filled_with.c_str(),
index.code_size);
index.reset();
if (filled_with == "add") {
// standard add method
index.add(nb, xb);
} else if (filled_with == "manual") {
// compute inverted lists and add elements manually
// fill in the inverted index manually
faiss::InvertedLists& invlists = *index.invlists;
// assign vectors to inverted lists
std::vector<idx_t> listnos(nb);
std::vector<float> unused(nb);
rcq.search(nb, xb, 1, unused.data(), listnos.data());
// populate inverted lists
for (idx_t i = 0; i < nb; i++) {
invlists.add_entry(
listnos[i], i, &raw_codes[i * code_size]);
}
index.ntotal = nb;
} else if (filled_with == "derived") {
// Since we have the raw codes precomputed, their prefix is the
// inverted list index, so let's use that.
faiss::InvertedLists& invlists = *index.invlists;
// populate inverted lists
for (idx_t i = 0; i < nb; i++) {
const uint8_t* code = &raw_codes[i * code_size];
faiss::BitstringReader rd(code, code_size);
idx_t list_no =
rd.read(rcq.rq.tot_bits); // read the list number
invlists.add_entry(list_no, i, code);
}
index.ntotal = nb;
}
tic();
printf("Index filled in\n");
for (int nprobe : {1, 4, 16, 64, int(nlist)}) {
printf("setting nprobe=%-4d", nprobe);
index.nprobe = nprobe;
std::vector<float> D(k * nq);
std::vector<idx_t> I(k * nq);
index.search(nq, xq, k, D.data(), I.data());
tic();
printf("Accuracy (intersection @ %zd): %.3f\n",
k,
accuracy(I.data()));
}
}
}
/****************************************************************
* Make an IVF index that uses the first 2 levels as a coarse
* quantizer, but this time does not store the code prefix from the index
****************************************************************/
{
// build a coarse quantizer from the 2 first levels of the RQ
int nlevel = 2;
std::unique_ptr<faiss::IndexIVFResidualQuantizer> index(
faiss::ivflib::ivf_residual_from_quantizer(rq, nlevel));
// there are 2 ways of filling up the index...
for (std::string filled_with : {"add", "derived"}) {
tic();
printf("filling up the IVF index with %s, code_size=%zd\n",
filled_with.c_str(),
index->code_size);
index->reset();
if (filled_with == "add") {
// standard add method
index->add(nb, xb);
} else if (filled_with == "derived") {
faiss::ivflib::ivf_residual_add_from_flat_codes(
index.get(), nb, raw_codes.data(), rq.code_size);
}
tic();
printf("Index filled in\n");
for (int nprobe : {1, 4, 16, 64, int(index->nlist)}) {
printf("setting nprobe=%-4d", nprobe);
index->nprobe = nprobe;
std::vector<float> D(k * nq);
std::vector<idx_t> I(k * nq);
index->search(nq, xq, k, D.data(), I.data());
tic();
printf("Accuracy (intersection @ %zd): %.3f\n",
k,
accuracy(I.data()));
}
}
}
return 0;
}

View File

@@ -0,0 +1,254 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#include <cassert>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <sys/stat.h>
#include <sys/time.h>
#include <faiss/AutoTune.h>
#include <faiss/index_factory.h>
/**
* To run this demo, please download the ANN_SIFT1M dataset from
*
* http://corpus-texmex.irisa.fr/
*
* and unzip it to the subdirectory sift1M.
**/
/*****************************************************
* I/O functions for fvecs and ivecs
*****************************************************/
float* fvecs_read(const char* fname, size_t* d_out, size_t* n_out) {
FILE* f = fopen(fname, "r");
if (!f) {
fprintf(stderr, "could not open %s\n", fname);
perror("");
abort();
}
int d;
fread(&d, 1, sizeof(int), f);
assert((d > 0 && d < 1000000) || !"unreasonable dimension");
fseek(f, 0, SEEK_SET);
struct stat st;
fstat(fileno(f), &st);
size_t sz = st.st_size;
assert(sz % ((d + 1) * 4) == 0 || !"weird file size");
size_t n = sz / ((d + 1) * 4);
*d_out = d;
*n_out = n;
float* x = new float[n * (d + 1)];
size_t nr __attribute__((unused)) = fread(x, sizeof(float), n * (d + 1), f);
assert(nr == n * (d + 1) || !"could not read whole file");
// shift array to remove row headers
for (size_t i = 0; i < n; i++)
memmove(x + i * d, x + 1 + i * (d + 1), d * sizeof(*x));
fclose(f);
return x;
}
// not very clean, but works as long as sizeof(int) == sizeof(float)
int* ivecs_read(const char* fname, size_t* d_out, size_t* n_out) {
return (int*)fvecs_read(fname, d_out, n_out);
}
double elapsed() {
struct timeval tv;
gettimeofday(&tv, nullptr);
return tv.tv_sec + tv.tv_usec * 1e-6;
}
int main() {
double t0 = elapsed();
// this is typically the fastest one.
const char* index_key = "IVF4096,Flat";
// these ones have better memory usage
// const char *index_key = "Flat";
// const char *index_key = "PQ32";
// const char *index_key = "PCA80,Flat";
// const char *index_key = "IVF4096,PQ8+16";
// const char *index_key = "IVF4096,PQ32";
// const char *index_key = "IMI2x8,PQ32";
// const char *index_key = "IMI2x8,PQ8+16";
// const char *index_key = "OPQ16_64,IMI2x8,PQ8+16";
faiss::Index* index;
size_t d;
{
printf("[%.3f s] Loading train set\n", elapsed() - t0);
size_t nt;
float* xt = fvecs_read("sift1M/sift_learn.fvecs", &d, &nt);
printf("[%.3f s] Preparing index \"%s\" d=%ld\n",
elapsed() - t0,
index_key,
d);
index = faiss::index_factory(d, index_key);
printf("[%.3f s] Training on %ld vectors\n", elapsed() - t0, nt);
index->train(nt, xt);
delete[] xt;
}
{
printf("[%.3f s] Loading database\n", elapsed() - t0);
size_t nb, d2;
float* xb = fvecs_read("sift1M/sift_base.fvecs", &d2, &nb);
assert(d == d2 || !"dataset does not have same dimension as train set");
printf("[%.3f s] Indexing database, size %ld*%ld\n",
elapsed() - t0,
nb,
d);
index->add(nb, xb);
delete[] xb;
}
size_t nq;
float* xq;
{
printf("[%.3f s] Loading queries\n", elapsed() - t0);
size_t d2;
xq = fvecs_read("sift1M/sift_query.fvecs", &d2, &nq);
assert(d == d2 || !"query does not have same dimension as train set");
}
size_t k; // nb of results per query in the GT
faiss::idx_t* gt; // nq * k matrix of ground-truth nearest-neighbors
{
printf("[%.3f s] Loading ground truth for %ld queries\n",
elapsed() - t0,
nq);
// load ground-truth and convert int to long
size_t nq2;
int* gt_int = ivecs_read("sift1M/sift_groundtruth.ivecs", &k, &nq2);
assert(nq2 == nq || !"incorrect nb of ground truth entries");
gt = new faiss::idx_t[k * nq];
for (int i = 0; i < k * nq; i++) {
gt[i] = gt_int[i];
}
delete[] gt_int;
}
// Result of the auto-tuning
std::string selected_params;
{ // run auto-tuning
printf("[%.3f s] Preparing auto-tune criterion 1-recall at 1 "
"criterion, with k=%ld nq=%ld\n",
elapsed() - t0,
k,
nq);
faiss::OneRecallAtRCriterion crit(nq, 1);
crit.set_groundtruth(k, nullptr, gt);
crit.nnn = k; // by default, the criterion will request only 1 NN
printf("[%.3f s] Preparing auto-tune parameters\n", elapsed() - t0);
faiss::ParameterSpace params;
params.initialize(index);
printf("[%.3f s] Auto-tuning over %ld parameters (%ld combinations)\n",
elapsed() - t0,
params.parameter_ranges.size(),
params.n_combinations());
faiss::OperatingPoints ops;
params.explore(index, nq, xq, crit, &ops);
printf("[%.3f s] Found the following operating points: \n",
elapsed() - t0);
ops.display();
// keep the first parameter that obtains > 0.5 1-recall@1
for (int i = 0; i < ops.optimal_pts.size(); i++) {
if (ops.optimal_pts[i].perf > 0.5) {
selected_params = ops.optimal_pts[i].key;
break;
}
}
assert(selected_params.size() > 0 ||
!"could not find good enough op point");
}
{ // Use the found configuration to perform a search
faiss::ParameterSpace params;
printf("[%.3f s] Setting parameter configuration \"%s\" on index\n",
elapsed() - t0,
selected_params.c_str());
params.set_index_parameters(index, selected_params.c_str());
printf("[%.3f s] Perform a search on %ld queries\n",
elapsed() - t0,
nq);
// output buffers
faiss::idx_t* I = new faiss::idx_t[nq * k];
float* D = new float[nq * k];
index->search(nq, xq, k, D, I);
printf("[%.3f s] Compute recalls\n", elapsed() - t0);
// evaluate result by hand.
int n_1 = 0, n_10 = 0, n_100 = 0;
for (int i = 0; i < nq; i++) {
int gt_nn = gt[i * k];
for (int j = 0; j < k; j++) {
if (I[i * k + j] == gt_nn) {
if (j < 1)
n_1++;
if (j < 10)
n_10++;
if (j < 100)
n_100++;
}
}
}
printf("R@1 = %.4f\n", n_1 / float(nq));
printf("R@10 = %.4f\n", n_10 / float(nq));
printf("R@100 = %.4f\n", n_100 / float(nq));
delete[] I;
delete[] D;
}
delete[] xq;
delete[] gt;
delete index;
return 0;
}

View File

@@ -0,0 +1,181 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#include <cstdio>
#include <cstdlib>
#include <memory>
#include <faiss/Clustering.h>
#include <faiss/IndexFlat.h>
#include <faiss/IndexHNSW.h>
#include <faiss/utils/distances.h>
#include <faiss/utils/random.h>
namespace {
enum WeightedKMeansType {
WKMT_FlatL2,
WKMT_FlatIP,
WKMT_FlatIP_spherical,
WKMT_HNSW,
};
float weighted_kmeans_clustering(
size_t d,
size_t n,
size_t k,
const float* input,
const float* weights,
float* centroids,
WeightedKMeansType index_num) {
using namespace faiss;
Clustering clus(d, k);
clus.verbose = true;
std::unique_ptr<Index> index;
switch (index_num) {
case WKMT_FlatL2:
index = std::make_unique<IndexFlatL2>(d);
break;
case WKMT_FlatIP:
index = std::make_unique<IndexFlatIP>(d);
break;
case WKMT_FlatIP_spherical:
index = std::make_unique<IndexFlatIP>(d);
clus.spherical = true;
break;
case WKMT_HNSW:
IndexHNSWFlat* ihnsw = new IndexHNSWFlat(d, 32);
ihnsw->hnsw.efSearch = 128;
index.reset(ihnsw);
break;
}
clus.train(n, input, *index.get(), weights);
// on output the index contains the centroids.
memcpy(centroids, clus.centroids.data(), sizeof(*centroids) * d * k);
return clus.iteration_stats.back().obj;
}
int d = 32;
float sigma = 0.1;
#define BIGTEST
#ifdef BIGTEST
// the production setup = setting of https://fb.quip.com/CWgnAAYbwtgs
int nc = 200000;
int n_big = 4;
int n_small = 2;
#else
int nc = 5;
int n_big = 100;
int n_small = 10;
#endif
int n; // number of training points
void generate_trainset(
std::vector<float>& ccent,
std::vector<float>& x,
std::vector<float>& weights) {
// same sampling as test_build_blocks.py test_weighted
ccent.resize(d * 2 * nc);
faiss::float_randn(ccent.data(), d * 2 * nc, 123);
faiss::fvec_renorm_L2(d, 2 * nc, ccent.data());
n = nc * n_big + nc * n_small;
x.resize(d * n);
weights.resize(n);
faiss::float_randn(x.data(), x.size(), 1234);
float* xi = x.data();
float* w = weights.data();
for (int ci = 0; ci < nc * 2; ci++) { // loop over centroids
int np = ci < nc ? n_big : n_small; // nb of points around this centroid
for (int i = 0; i < np; i++) {
for (int j = 0; j < d; j++) {
xi[j] = xi[j] * sigma + ccent[ci * d + j];
}
*w++ = ci < nc ? 0.1 : 10;
xi += d;
}
}
}
} // namespace
int main(int argc, char** argv) {
std::vector<float> ccent;
std::vector<float> x;
std::vector<float> weights;
printf("generate training set\n");
generate_trainset(ccent, x, weights);
std::vector<float> centroids;
centroids.resize(nc * d);
int the_index_num = -1;
int the_with_weights = -1;
if (argc == 3) {
the_index_num = atoi(argv[1]);
the_with_weights = atoi(argv[2]);
}
for (int index_num = WKMT_FlatL2; index_num <= WKMT_HNSW; index_num++) {
if (the_index_num >= 0 && index_num != the_index_num) {
continue;
}
for (int with_weights = 0; with_weights <= 1; with_weights++) {
if (the_with_weights >= 0 && with_weights != the_with_weights) {
continue;
}
printf("=================== index_num=%d Run %s weights\n",
index_num,
with_weights ? "with" : "without");
weighted_kmeans_clustering(
d,
n,
nc,
x.data(),
with_weights ? weights.data() : nullptr,
centroids.data(),
(WeightedKMeansType)index_num);
{ // compute distance of the true cluster centers to the nearest found centroid
faiss::IndexFlatL2 cent_index(d);
cent_index.add(nc, centroids.data());
std::vector<float> dis(n);
std::vector<faiss::idx_t> idx(n);
cent_index.search(
nc * 2, ccent.data(), 1, dis.data(), idx.data());
float dis1 = 0, dis2 = 0;
for (int i = 0; i < nc; i++) {
dis1 += dis[i];
}
printf("average distance of points from big clusters: %g\n",
dis1 / nc);
for (int i = 0; i < nc; i++) {
dis2 += dis[i + nc];
}
printf("average distance of points from small clusters: %g\n",
dis2 / nc);
}
}
}
return 0;
}
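In Python, the simplest entry point to weighted k-means is the `Kmeans` wrapper; the sketch below is illustrative, not part of this commit, and assumes the wrapper's optional `weights` argument:

```python
import numpy as np
import faiss

d, n, k = 32, 10_000, 100
rng = np.random.default_rng(0)
x = rng.standard_normal((n, d)).astype('float32')

# up-weight the second half of the points by 100x
weights = np.ones(n, dtype='float32')
weights[n // 2:] = 100.0

km = faiss.Kmeans(d, k, niter=20, verbose=True)
km.train(x, weights=weights)
print(km.centroids.shape)   # (k, d); centroids are pulled towards the heavily weighted points
```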

View File

@@ -0,0 +1,303 @@
#!/usr/bin/env -S grimaldi --kernel bento_kernel_faiss
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
# fmt: off
# flake8: noqa
""":md
# Serializing codes separately, with IndexLSH and IndexPQ
Let's say, for example, you have a few vector embeddings per user
and want to shard a flat index by user so you can re-use the same LSH or PQ method
for all users but store each user's codes independently.
"""
""":py"""
import faiss
import numpy as np
""":py"""
d = 768
n = 1_000
ids = np.arange(n).astype('int64')
training_data = np.random.rand(n, d).astype('float32')
""":py"""
def read_ids_codes():
try:
return np.load("/tmp/ids.npy"), np.load("/tmp/codes.npy")
except FileNotFoundError:
return None, None
def write_ids_codes(ids, codes):
np.save("/tmp/ids.npy", ids)
np.save("/tmp/codes.npy", codes.reshape(len(ids), -1))
def write_template_index(template_index):
faiss.write_index(template_index, "/tmp/template.index")
def read_template_index_instance():
return faiss.read_index("/tmp/template.index")
""":md
## IndexLSH: separate codes
The first half of this notebook demonstrates how to store LSH codes. Unlike PQ, LSH does not require training. In fact, its compression method, a random projection matrix, is deterministic on construction, based on a random seed value that's [hardcoded](https://github.com/facebookresearch/faiss/blob/2c961cc308ade8a85b3aa10a550728ce3387f625/faiss/IndexLSH.cpp#L35).
"""
""":py"""
nbits = 1536
""":py"""
# demonstrating encoding is deterministic
codes = []
database_vector_float32 = np.random.rand(1, d).astype(np.float32)
for i in range(10):
index = faiss.IndexIDMap2(faiss.IndexLSH(d, nbits))
code = index.index.sa_encode(database_vector_float32)
codes.append(code)
for i in range(1, 10):
assert np.array_equal(codes[0], codes[i])
""":py"""
# new database vector
ids, codes = read_ids_codes()
database_vector_id, database_vector_float32 = max(ids) + 1 if ids is not None else 1, np.random.rand(1, d).astype(np.float32)
index = faiss.IndexIDMap2(faiss.IndexLSH(d, nbits))
code = index.index.sa_encode(database_vector_float32)
if ids is not None and codes is not None:
ids = np.concatenate((ids, [database_vector_id]))
codes = np.vstack((codes, code))
else:
ids = np.array([database_vector_id])
codes = np.array([code])
write_ids_codes(ids, codes)
""":py '2840581589434841'"""
# then at query time
query_vector_float32 = np.random.rand(1, d).astype(np.float32)
index = faiss.IndexIDMap2(faiss.IndexLSH(d, nbits))
ids, codes = read_ids_codes()
index.add_sa_codes(codes, ids)
index.search(query_vector_float32, k=5)
""":py"""
!rm /tmp/ids.npy /tmp/codes.npy
""":md
## IndexPQ: separate codes from codebook
The second half of this notebook demonstrates how to separate serializing and deserializing the PQ codebook
(via faiss.write_index for IndexPQ) independently of the vector codes. For example, in the case
where you have a few vector embeddings per user and want to shard the flat index by user you
can re-use the same PQ method for all users but store each user's codes independently.
"""
""":py"""
M = d//8
nbits = 8
""":py"""
# at train time
template_index = faiss.index_factory(d, f"IDMap2,PQ{M}x{nbits}")
template_index.train(training_data)
write_template_index(template_index)
""":py"""
# New database vector
index = read_template_index_instance()
ids, codes = read_ids_codes()
database_vector_id, database_vector_float32 = max(ids) + 1 if ids is not None else 1, np.random.rand(1, d).astype(np.float32)
code = index.index.sa_encode(database_vector_float32)
if ids is not None and codes is not None:
ids = np.concatenate((ids, [database_vector_id]))
codes = np.vstack((codes, code))
else:
ids = np.array([database_vector_id])
codes = np.array([code])
write_ids_codes(ids, codes)
""":py '1858280061369209'"""
# then at query time
query_vector_float32 = np.random.rand(1, d).astype(np.float32)
id_wrapper_index = read_template_index_instance()
ids, codes = read_ids_codes()
id_wrapper_index.add_sa_codes(codes, ids)
id_wrapper_index.search(query_vector_float32, k=5)
""":py"""
!rm /tmp/ids.npy /tmp/codes.npy /tmp/template.index
""":md
## Comparing these methods
- methods: Flat, LSH, PQ
- vary cost: nbits, M for 1x, 2x, 4x, 8x, 16x, 32x compression
- measure: recall@1
We don't measure latency as the number of vectors per user shard is insignificant.
"""
""":py '2898032417027201'"""
n, d
""":py"""
database_vector_ids, database_vector_float32s = np.arange(n), np.random.rand(n, d).astype(np.float32)
query_vector_float32s = np.random.rand(n, d).astype(np.float32)
""":py"""
index = faiss.index_factory(d, "IDMap2,Flat")
index.add_with_ids(database_vector_float32s, database_vector_ids)
_, ground_truth_result_ids = index.search(query_vector_float32s, k=1)
""":py '857475336204238'"""
from dataclasses import dataclass
pq_m_nbits = (
# 96 bytes
(96, 8),
(192, 4),
# 192 bytes
(192, 8),
(384, 4),
# 384 bytes
(384, 8),
(768, 4),
)
lsh_nbits = (768, 1536, 3072, 6144, 12288, 24576)
@dataclass
class Record:
type_: str
index: faiss.Index
args: tuple
recall: float
results = []
for m, nbits in pq_m_nbits:
print("pq", m, nbits)
index = faiss.index_factory(d, f"IDMap2,PQ{m}x{nbits}")
index.train(training_data)
index.add_with_ids(database_vector_float32s, database_vector_ids)
_, result_ids = index.search(query_vector_float32s, k=1)
recall = sum(result_ids == ground_truth_result_ids)
results.append(Record("pq", index, (m, nbits), recall))
for nbits in lsh_nbits:
print("lsh", nbits)
index = faiss.IndexIDMap2(faiss.IndexLSH(d, nbits))
index.add_with_ids(database_vector_float32s, database_vector_ids)
_, result_ids = index.search(query_vector_float32s, k=1)
recall = sum(result_ids == ground_truth_result_ids)
results.append(Record("lsh", index, (nbits,), recall))
""":py '556918346720794'"""
import matplotlib.pyplot as plt
import numpy as np
def create_grouped_bar_chart(x_values, y_values_list, labels_list, xlabel, ylabel, title):
num_bars_per_group = len(x_values)
plt.figure(figsize=(12, 6))
for x, y_values, labels in zip(x_values, y_values_list, labels_list):
num_bars = len(y_values)
bar_width = 0.08 * x
bar_positions = np.arange(num_bars) * bar_width - (num_bars - 1) * bar_width / 2 + x
bars = plt.bar(bar_positions, y_values, width=bar_width)
for bar, label in zip(bars, labels):
height = bar.get_height()
plt.annotate(
label,
xy=(bar.get_x() + bar.get_width() / 2, height),
xytext=(0, 3),
textcoords="offset points",
ha='center', va='bottom'
)
plt.xscale('log')
plt.xlabel(xlabel)
plt.ylabel(ylabel)
plt.title(title)
plt.xticks(x_values, labels=[str(x) for x in x_values])
plt.tight_layout()
plt.show()
# # Example usage:
# x_values = [1, 2, 4, 8, 16, 32]
# y_values_list = [
# [2.5, 3.6, 1.8],
# [3.0, 2.8],
# [2.5, 3.5, 4.0, 1.0],
# [4.2],
# [3.0, 5.5, 2.2],
# [6.0, 4.5]
# ]
# labels_list = [
# ['A1', 'B1', 'C1'],
# ['A2', 'B2'],
# ['A3', 'B3', 'C3', 'D3'],
# ['A4'],
# ['A5', 'B5', 'C5'],
# ['A6', 'B6']
# ]
# create_grouped_bar_chart(x_values, y_values_list, labels_list, "x axis", "y axis", "title")
""":py '1630106834206134'"""
# x-axis: compression ratio
# y-axis: recall@1
from collections import defaultdict
x = defaultdict(list)
x[1].append(("flat", 1.00))
for r in results:
y_value = r.recall[0] / n
x_value = int(d * 4 / r.index.sa_code_size())
label = None
if r.type_ == "pq":
label = f"PQ{r.args[0]}x{r.args[1]}"
if r.type_ == "lsh":
label = f"LSH{r.args[0]}"
x[x_value].append((label, y_value))
x_values = sorted(list(x.keys()))
create_grouped_bar_chart(
x_values,
[[e[1] for e in x[x_value]] for x_value in x_values],
[[e[0] for e in x[x_value]] for x_value in x_values],
"compression ratio",
"recall@1 q=1,000 queries",
"recall@1 for a database of n=1,000 d=768 vectors",
)

View File

@@ -0,0 +1,52 @@
# Offline IVF
This folder contains the code for the offline ivf algorithm powered by faiss big batch search.
Create a conda env:
`conda create --name oivf python=3.10`
`conda activate oivf`
`conda install -c pytorch/label/nightly -c nvidia faiss-gpu=1.7.4`
`conda install tqdm`
`conda install pyyaml`
`conda install -c conda-forge submitit`
## Run book
1. Optionally shard your dataset (see create_sharded_dataset.py) and create the corresponding yaml file `config_ssnpp.yaml` (a minimal sketch of the expected layout is shown after this run book). You can use `generate_config.py` by specifying the root directory of your dataset and the files with the data shards
`python generate_config.py`
2. Run the train index command
`python run.py --command train_index --config config_ssnpp.yaml --xb ssnpp_1B`
3. Run the index-shard command so it produces sharded indexes, required for the search step
`python run.py --command index_shard --config config_ssnpp.yaml --xb ssnpp_1B`
4. Send jobs to the cluster to run search
`python run.py --command search --config config_ssnpp.yaml --xb ssnpp_1B --cluster_run --partition <PARTITION-NAME>`
Remarks about the `search` command: by default, the database vectors are also used as the query vectors in the search step.
a. If the query vectors are different from the database vectors, they should be passed via the `xq` argument
b. A new dataset needs to be prepared (step 1) before passing it to the query vectors argument `xq`
`python run.py --command search --config config_ssnpp.yaml --xb ssnpp_1B --xq <QUERIES_DATASET_NAME>`
5. We can always run the consistency-check for sanity checks!
`python run.py --command consistency_check --config config_ssnpp.yaml --xb ssnpp_1B`
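For reference, here is a minimal sketch of the config layout that `run.py` expects, written as the Python dict that `generate_config.py` would dump to YAML (the dataset name, paths and file list are placeholders; see `config_ssnpp.yaml` for a complete example):
```python
import yaml

cfg = {
    "d": 256,                              # vector dimensionality
    "output": "/path/to/output",           # where indexes and results are written
    "index": {"prod": ["IVF8192,PQ128"]},  # index factory string(s)
    "nprobe": {"prod": [512]},             # nprobe value(s) for search
    "k": 50,
    "index_shard_size": 50_000_000,
    "query_batch_size": 50_000_000,
    "evaluation_sample": 10_000,
    "training_sample": 1_572_864,
    "datasets": {
        "my_dataset": {
            "root": "/path/to/shards",
            "size": 50_000_000,
            "files": [
                {"name": "my_data_0000000000.npy", "format": "npy",
                 "dtype": "uint8", "size": 50_000_000},
            ],
        },
    },
}
print(yaml.dump(cfg))
```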

View File

View File

@@ -0,0 +1,110 @@
d: 256
output: /checkpoint/marialomeli/offline_faiss/ssnpp
index:
prod:
- 'IVF8192,PQ128'
non-prod:
- 'IVF16384,PQ128'
- 'IVF32768,PQ128'
- 'OPQ64_128,IVF4096,PQ64'
nprobe:
prod:
- 512
non-prod:
- 256
- 128
- 1024
- 2048
- 4096
- 8192
k: 50
index_shard_size: 50000000
query_batch_size: 50000000
evaluation_sample: 10000
training_sample: 1572864
datasets:
ssnpp_1B:
root: /checkpoint/marialomeli/ssnpp_data
size: 1000000000
files:
- dtype: uint8
format: npy
name: ssnpp_0000000000.npy
size: 50000000
- dtype: uint8
format: npy
name: ssnpp_0000000001.npy
size: 50000000
- dtype: uint8
format: npy
name: ssnpp_0000000002.npy
size: 50000000
- dtype: uint8
format: npy
name: ssnpp_0000000003.npy
size: 50000000
- dtype: uint8
format: npy
name: ssnpp_0000000004.npy
size: 50000000
- dtype: uint8
format: npy
name: ssnpp_0000000005.npy
size: 50000000
- dtype: uint8
format: npy
name: ssnpp_0000000006.npy
size: 50000000
- dtype: uint8
format: npy
name: ssnpp_0000000007.npy
size: 50000000
- dtype: uint8
format: npy
name: ssnpp_0000000008.npy
size: 50000000
- dtype: uint8
format: npy
name: ssnpp_0000000009.npy
size: 50000000
- dtype: uint8
format: npy
name: ssnpp_0000000010.npy
size: 50000000
- dtype: uint8
format: npy
name: ssnpp_0000000011.npy
size: 50000000
- dtype: uint8
format: npy
name: ssnpp_0000000012.npy
size: 50000000
- dtype: uint8
format: npy
name: ssnpp_0000000013.npy
size: 50000000
- dtype: uint8
format: npy
name: ssnpp_0000000014.npy
size: 50000000
- dtype: uint8
format: npy
name: ssnpp_0000000015.npy
size: 50000000
- dtype: uint8
format: npy
name: ssnpp_0000000016.npy
size: 50000000
- dtype: uint8
format: npy
name: ssnpp_0000000017.npy
size: 50000000
- dtype: uint8
format: npy
name: ssnpp_0000000018.npy
size: 50000000
- dtype: uint8
format: npy
name: ssnpp_0000000019.npy
size: 50000000

View File

@@ -0,0 +1,64 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import numpy as np
import argparse
import os
def xbin_mmap(fname, dtype, maxn=-1):
"""
Code from
https://github.com/harsha-simhadri/big-ann-benchmarks/blob/main/benchmark/dataset_io.py#L94
mmap the competition file format for a given type of items
"""
n, d = map(int, np.fromfile(fname, dtype="uint32", count=2))
assert os.stat(fname).st_size == 8 + n * d * np.dtype(dtype).itemsize
if maxn > 0:
n = min(n, maxn)
return np.memmap(fname, dtype=dtype, mode="r", offset=8, shape=(n, d))
def main(args: argparse.Namespace):
ssnpp_data = xbin_mmap(fname=args.filepath, dtype="uint8")
num_batches = ssnpp_data.shape[0] // args.data_batch
assert (
ssnpp_data.shape[0] % args.data_batch == 0
), "num of embeddings per file should divide total num of embeddings"
for i in range(num_batches):
xb_batch = ssnpp_data[
i * args.data_batch:(i + 1) * args.data_batch, :
]
filename = args.output_dir + f"/ssnpp_{(i):010}.npy"
np.save(filename, xb_batch)
print(f"File {filename} is saved!")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--data_batch",
dest="data_batch",
type=int,
default=50000000,
help="Number of embeddings per file, should be a divisor of 1B",
)
parser.add_argument(
"--filepath",
dest="filepath",
type=str,
default="/datasets01/big-ann-challenge-data/FB_ssnpp/FB_ssnpp_database.u8bin",
help="path of 1B ssnpp database vectors' original file",
)
parser.add_argument(
"--filepath",
dest="output_dir",
type=str,
default="/checkpoint/marialomeli/ssnpp_data",
help="path to put sharded files",
)
args = parser.parse_args()
main(args)

View File

@@ -0,0 +1,174 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import os
import numpy as np
import faiss
from typing import List
import random
import logging
from functools import lru_cache
def create_dataset_from_oivf_config(cfg, ds_name):
normalise = cfg["normalise"] if "normalise" in cfg else False
return MultiFileVectorDataset(
cfg["datasets"][ds_name]["root"],
[
FileDescriptor(
f["name"], f["format"], np.dtype(f["dtype"]), f["size"]
)
for f in cfg["datasets"][ds_name]["files"]
],
cfg["d"],
normalise,
cfg["datasets"][ds_name]["size"],
)
@lru_cache(maxsize=100)
def _memmap_vecs(
file_name: str, format: str, dtype: np.dtype, size: int, d: int
) -> np.array:
"""
If the file is in raw format, the file size will
be divisible by the dimensionality and by the size
of the data type.
Otherwise, the file contains a header and we assume
it is of .npy type. It then returns the memmapped file.
"""
assert os.path.exists(file_name), f"file does not exist {file_name}"
if format == "raw":
fl = os.path.getsize(file_name)
nb = fl // d // dtype.itemsize
assert nb == size, f"{nb} is different than config's {size}"
assert fl == d * dtype.itemsize * nb # no header
return np.memmap(file_name, shape=(nb, d), dtype=dtype, mode="r")
elif format == "npy":
vecs = np.load(file_name, mmap_mode="r")
assert vecs.shape[0] == size, f"size:{size},shape {vecs.shape[0]}"
assert vecs.shape[1] == d
assert vecs.dtype == dtype
return vecs
else:
ValueError("The file cannot be loaded in the current format.")
class FileDescriptor:
def __init__(self, name: str, format: str, dtype: np.dtype, size: int):
self.name = name
self.format = format
self.dtype = dtype
self.size = size
class MultiFileVectorDataset:
def __init__(
self,
root: str,
file_descriptors: List[FileDescriptor],
d: int,
normalize: bool,
size: int,
):
assert os.path.exists(root)
self.root = root
self.file_descriptors = file_descriptors
self.d = d
self.normalize = normalize
self.size = size
self.file_offsets = [0]
t = 0
for f in self.file_descriptors:
xb = _memmap_vecs(
f"{self.root}/{f.name}", f.format, f.dtype, f.size, self.d
)
t += xb.shape[0]
self.file_offsets.append(t)
assert (
t == self.size
), "the sum of num of embeddings per file!=total num of embeddings"
def iterate(self, start: int, batch_size: int, dt: np.dtype):
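# Streams batches of `batch_size` vectors starting at global offset `start`, spanning file
# boundaries: partial reads are accumulated in `buffer`, full batches are yielded (optionally
# L2-normalized), and a final short batch is yielded at the end if one remains.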
buffer = np.empty(shape=(batch_size, self.d), dtype=dt)
rem = 0
for f in self.file_descriptors:
if start >= f.size:
start -= f.size
continue
logging.info(f"processing: {f.name}...")
xb = _memmap_vecs(
f"{self.root}/{f.name}",
f.format,
f.dtype,
f.size,
self.d,
)
if start > 0:
xb = xb[start:]
start = 0
req = min(batch_size - rem, xb.shape[0])
buffer[rem:rem + req] = xb[:req]
rem += req
if rem == batch_size:
if self.normalize:
faiss.normalize_L2(buffer)
yield buffer.copy()
rem = 0
for i in range(req, xb.shape[0], batch_size):
j = i + batch_size
if j <= xb.shape[0]:
tmp = xb[i:j].astype(dt)
if self.normalize:
faiss.normalize_L2(tmp)
yield tmp
else:
rem = xb.shape[0] - i
buffer[:rem] = xb[i:j]
if rem > 0:
tmp = buffer[:rem]
if self.normalize:
faiss.normalize_L2(tmp)
yield tmp
def get(self, idx: List[int]):
n = len(idx)
fidx = np.searchsorted(self.file_offsets, idx, "right")
res = np.empty(shape=(len(idx), self.d), dtype=np.float32)
for r, id, fid in zip(range(n), idx, fidx):
assert fid > 0 and fid <= len(self.file_descriptors), f"{fid}"
f = self.file_descriptors[fid - 1]
# deferring normalization until after reading the vec
vecs = _memmap_vecs(
f"{self.root}/{f.name}", f.format, f.dtype, f.size, self.d
)
i = id - self.file_offsets[fid - 1]
assert i >= 0 and i < vecs.shape[0]
res[r, :] = vecs[i] # TODO: find a faster way
if self.normalize:
faiss.normalize_L2(res)
return res
def sample(self, n, idx_fn, vecs_fn):
if vecs_fn and os.path.exists(vecs_fn):
vecs = np.load(vecs_fn)
assert vecs.shape == (n, self.d)
return vecs
if idx_fn and os.path.exists(idx_fn):
idx = np.load(idx_fn)
assert idx.size == n
else:
idx = np.array(sorted(random.sample(range(self.size), n)))
if idx_fn:
np.save(idx_fn, idx)
vecs = self.get(idx)
if vecs_fn:
np.save(vecs_fn, vecs)
return vecs
def get_first_n(self, n, dt):
assert n <= self.size
return next(self.iterate(0, n, dt))

View File

@@ -0,0 +1,46 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import numpy as np
import os
import yaml
# with ssnpp sharded data
root = "/checkpoint/marialomeli/ssnpp_data"
file_names = [f"ssnpp_{i:010}.npy" for i in range(20)]
d = 256
dt = np.dtype(np.uint8)
def read_embeddings(fp):
fl = os.path.getsize(fp)
nb = fl // d // dt.itemsize
print(nb)
if fl == d * dt.itemsize * nb: # no header
return ("raw", np.memmap(fp, shape=(nb, d), dtype=dt, mode="r"))
else: # assume npy
vecs = np.load(fp, mmap_mode="r")
assert vecs.shape[1] == d
assert vecs.dtype == dt
return ("npy", vecs)
cfg = {}
files = []
size = 0
for fn in file_names:
fp = f"{root}/{fn}"
assert os.path.exists(fp), f"{fp} is missing"
ft, xb = read_embeddings(fp)
files.append(
{"name": fn, "size": xb.shape[0], "dtype": dt.name, "format": ft}
)
size += xb.shape[0]
cfg["size"] = size
cfg["root"] = root
cfg["d"] = d
cfg["files"] = files
print(yaml.dump(cfg))

View File

@@ -0,0 +1,891 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import faiss
import numpy as np
import os
from tqdm import tqdm, trange
import sys
import logging
from faiss.contrib.ondisk import merge_ondisk
from faiss.contrib.big_batch_search import big_batch_search
from faiss.contrib.exhaustive_search import knn_ground_truth
from faiss.contrib.evaluation import knn_intersection_measure
from utils import (
get_intersection_cardinality_frequencies,
margin,
is_pretransform_index,
)
from dataset import create_dataset_from_oivf_config
logging.basicConfig(
format=(
"%(asctime)s.%(msecs)03d %(levelname)-8s %(threadName)-12s %(message)s"
),
level=logging.INFO,
datefmt="%Y-%m-%d %H:%M:%S",
force=True,
)
EMBEDDINGS_BATCH_SIZE: int = 100_000
NUM_SUBSAMPLES: int = 100
SMALL_DATA_SAMPLE: int = 10000
class OfflineIVF:
def __init__(self, cfg, args, nprobe, index_factory_str):
self.input_d = cfg["d"]
self.dt = cfg["datasets"][args.xb]["files"][0]["dtype"]
assert self.input_d > 0
output_dir = cfg["output"]
assert os.path.exists(output_dir)
self.index_factory = index_factory_str
assert self.index_factory is not None
self.index_factory_fn = self.index_factory.replace(",", "_")
self.index_template_file = (
f"{output_dir}/{args.xb}/{self.index_factory_fn}.empty.faissindex"
)
logging.info(f"index template: {self.index_template_file}")
if not args.xq:
args.xq = args.xb
self.by_residual = True
if args.no_residuals:
self.by_residual = False
xb_output_dir = f"{output_dir}/{args.xb}"
if not os.path.exists(xb_output_dir):
os.makedirs(xb_output_dir)
xq_output_dir = f"{output_dir}/{args.xq}"
if not os.path.exists(xq_output_dir):
os.makedirs(xq_output_dir)
search_output_dir = f"{output_dir}/{args.xq}_in_{args.xb}"
if not os.path.exists(search_output_dir):
os.makedirs(search_output_dir)
self.knn_dir = f"{search_output_dir}/knn"
if not os.path.exists(self.knn_dir):
os.makedirs(self.knn_dir)
self.eval_dir = f"{search_output_dir}/eval"
if not os.path.exists(self.eval_dir):
os.makedirs(self.eval_dir)
self.index = {} # to keep a reference to opened indices,
self.ivls = {} # hstack inverted lists,
self.index_shards = {} # and index shards
self.index_shard_prefix = (
f"{xb_output_dir}/{self.index_factory_fn}.shard_"
)
self.xq_index_shard_prefix = (
f"{xq_output_dir}/{self.index_factory_fn}.shard_"
)
self.index_file = ( # TODO: added back temporarily for evaluate, handle name of non-sharded index file and remove.
f"{xb_output_dir}/{self.index_factory_fn}.faissindex"
)
self.xq_index_file = (
f"{xq_output_dir}/{self.index_factory_fn}.faissindex"
)
self.training_sample = cfg["training_sample"]
self.evaluation_sample = cfg["evaluation_sample"]
self.xq_ds = create_dataset_from_oivf_config(cfg, args.xq)
self.xb_ds = create_dataset_from_oivf_config(cfg, args.xb)
file_descriptors = self.xq_ds.file_descriptors
self.file_sizes = [fd.size for fd in file_descriptors]
self.shard_size = cfg["index_shard_size"] # ~100GB
self.nshards = self.xb_ds.size // self.shard_size
if self.xb_ds.size % self.shard_size != 0:
self.nshards += 1
self.xq_nshards = self.xq_ds.size // self.shard_size
if self.xq_ds.size % self.shard_size != 0:
self.xq_nshards += 1
self.nprobe = nprobe
assert self.nprobe > 0, "Invalid nprobe parameter."
if "deduper" in cfg:
self.deduper = cfg["deduper"]
self.deduper_codec_fn = [
f"{xb_output_dir}/deduper_codec_{codec.replace(',', '_')}"
for codec in self.deduper
]
self.deduper_idx_fn = [
f"{xb_output_dir}/deduper_idx_{codec.replace(',', '_')}"
for codec in self.deduper
]
else:
self.deduper = None
self.k = cfg["k"]
assert self.k > 0, "Invalid number of neighbours parameter."
self.knn_output_file_suffix = (
f"{self.index_factory_fn}_np{self.nprobe}.npy"
)
fp = 32
if self.dt == "float16":
fp = 16
self.xq_bs = cfg["query_batch_size"]
if "metric" in cfg:
self.metric = eval(f'faiss.{cfg["metric"]}')
else:
self.metric = faiss.METRIC_L2
if "evaluate_by_margin" in cfg:
self.evaluate_by_margin = cfg["evaluate_by_margin"]
else:
self.evaluate_by_margin = False
os.system("grep -m1 'model name' < /proc/cpuinfo")
os.system("grep -E 'MemTotal|MemFree' /proc/meminfo")
os.system("nvidia-smi")
os.system("nvcc --version")
self.knn_queries_memory_limit = 4 * 1024 * 1024 * 1024 # 4 GB
self.knn_vectors_memory_limit = 8 * 1024 * 1024 * 1024 # 8 GB
def input_stats(self):
"""
Computes and logs MatrixStats on a subsample (of size training_sample) of the first chunk of data in the database.
"""
xb_sample = self.xb_ds.get_first_n(self.training_sample, np.float32)
logging.info(f"input shape: {xb_sample.shape}")
logging.info("running MatrixStats on training sample...")
logging.info(faiss.MatrixStats(xb_sample).comments)
logging.info("done")
def dedupe(self):
logging.info(self.deduper)
if self.deduper is None:
logging.info("No deduper configured")
return
codecs = []
codesets = []
idxs = []
for factory, filename in zip(self.deduper, self.deduper_codec_fn):
if os.path.exists(filename):
logging.info(f"loading trained dedupe codec: {filename}")
codec = faiss.read_index(filename)
else:
logging.info(f"training dedupe codec: {factory}")
codec = faiss.index_factory(self.input_d, factory)
xb_sample = np.unique(
self.xb_ds.get_first_n(100_000, np.float32), axis=0
)
faiss.ParameterSpace().set_index_parameter(codec, "verbose", 1)
codec.train(xb_sample)
logging.info(f"writing trained dedupe codec: {filename}")
faiss.write_index(codec, filename)
codecs.append(codec)
codesets.append(faiss.CodeSet(codec.sa_code_size()))
idxs.append(np.empty((0,), dtype=np.uint32))
bs = 1_000_000
i = 0
for buffer in tqdm(self._iterate_transformed(self.xb_ds, 0, bs, np.float32)):
for j in range(len(codecs)):
codec, codeset, idx = codecs[j], codesets[j], idxs[j]
uniq = codeset.insert(codec.sa_encode(buffer))
idxs[j] = np.append(
idx,
np.arange(i, i + buffer.shape[0], dtype=np.uint32)[uniq],
)
i += buffer.shape[0]
for idx, filename in zip(idxs, self.deduper_idx_fn):
logging.info(f"writing {filename}, shape: {idx.shape}")
np.save(filename, idx)
logging.info("done")
def train_index(self):
"""
Trains the index using a subsample of the first chunk of data in the database and saves it in the template file (with no vectors added).
"""
assert not os.path.exists(self.index_template_file), (
"The train command has been ran, the index template file already"
" exists."
)
xb_sample = np.unique(
self.xb_ds.get_first_n(self.training_sample, np.float32), axis=0
)
logging.info(f"input shape: {xb_sample.shape}")
index = faiss.index_factory(
self.input_d, self.index_factory, self.metric
)
index_ivf = faiss.downcast_index(faiss.extract_index_ivf(index))
index_ivf.by_residual = True
faiss.ParameterSpace().set_index_parameter(index, "verbose", 1)
logging.info("running training...")
index.train(xb_sample)
logging.info(f"writing trained index {self.index_template_file}...")
faiss.write_index(index, self.index_template_file)
logging.info("done")
def _iterate_transformed(self, ds, start, batch_size, dt):
assert os.path.exists(self.index_template_file)
index = faiss.read_index(self.index_template_file)
if is_pretransform_index(index):
vt = index.chain.at(0) # fetch pretransform
for buffer in ds.iterate(start, batch_size, dt):
yield vt.apply(buffer)
else:
for buffer in ds.iterate(start, batch_size, dt):
yield buffer
def index_shard(self):
assert os.path.exists(self.index_template_file)
index = faiss.read_index(self.index_template_file)
index_ivf = faiss.downcast_index(faiss.extract_index_ivf(index))
assert self.nprobe <= index_ivf.quantizer.ntotal, (
f"the number of vectors {index_ivf.quantizer.ntotal} is not enough"
f" to retrieve {self.nprobe} neighbours, check."
)
cpu_quantizer = index_ivf.quantizer
gpu_quantizer = faiss.index_cpu_to_all_gpus(cpu_quantizer)
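# The coarse quantizer is temporarily replaced by a GPU clone so that list assignment during
# add_with_ids runs on GPU; the CPU quantizer is swapped back in before each shard is written.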
for i in range(0, self.nshards):
sfn = f"{self.index_shard_prefix}{i}"
try:
index.reset()
index_ivf.quantizer = gpu_quantizer
with open(sfn, "xb"):
start = i * self.shard_size
jj = 0
embeddings_batch_size = min(
EMBEDDINGS_BATCH_SIZE, self.shard_size
)
assert (
self.shard_size % embeddings_batch_size == 0
or EMBEDDINGS_BATCH_SIZE % embeddings_batch_size == 0
), (
f"the shard size {self.shard_size} and embeddings"
f" shard size {EMBEDDINGS_BATCH_SIZE} are not"
" divisible"
)
for xb_j in tqdm(
self._iterate_transformed(
self.xb_ds,
start,
embeddings_batch_size,
np.float32,
),
file=sys.stdout,
):
if is_pretransform_index(index):
assert xb_j.shape[1] == index.chain.at(0).d_out
index_ivf.add_with_ids(
xb_j,
np.arange(start + jj, start + jj + xb_j.shape[0]),
)
else:
assert xb_j.shape[1] == index.d
index.add_with_ids(
xb_j,
np.arange(start + jj, start + jj + xb_j.shape[0]),
)
jj += xb_j.shape[0]
logging.info(jj)
assert (
jj <= self.shard_size
), f"jj {jj} and shard_zide {self.shard_size}"
if jj == self.shard_size:
break
logging.info(f"writing {sfn}...")
index_ivf.quantizer = cpu_quantizer
faiss.write_index(index, sfn)
except FileExistsError:
logging.info(f"skipping shard: {i}")
continue
logging.info("done")
def merge_index(self):
ivf_file = f"{self.index_file}.ivfdata"
assert os.path.exists(self.index_template_file)
assert not os.path.exists(
ivf_file
), f"file with embeddings data {ivf_file} not found, check."
assert not os.path.exists(self.index_file)
index = faiss.read_index(self.index_template_file)
block_fnames = [
f"{self.index_shard_prefix}{i}" for i in range(self.nshards)
]
for fn in block_fnames:
assert os.path.exists(fn)
logging.info(block_fnames)
logging.info("merging...")
merge_ondisk(index, block_fnames, ivf_file)
logging.info("writing index...")
faiss.write_index(index, self.index_file)
logging.info("done")
def _cached_search(
self,
sample,
xq_ds,
xb_ds,
idx_file,
vecs_file,
I_file,
D_file,
index_file=None,
nprobe=None,
):
if not os.path.exists(I_file):
assert not os.path.exists(I_file), f"file {I_file} already exists "
assert not os.path.exists(D_file), f"file {D_file} already exists "
xq = xq_ds.sample(sample, idx_file, vecs_file)
if index_file:
D, I = self._index_nonsharded_search(index_file, xq, nprobe)
else:
logging.info("ground truth computations")
db_iterator = xb_ds.iterate(0, 100_000, np.float32)
D, I = knn_ground_truth(
xq, db_iterator, self.k, metric_type=self.metric
)
assert np.amin(I) >= 0
np.save(I_file, I)
np.save(D_file, D)
else:
assert os.path.exists(idx_file), f"file {idx_file} does not exist "
assert os.path.exists(
vecs_file
), f"file {vecs_file} does not exist "
assert os.path.exists(I_file), f"file {I_file} does not exist "
assert os.path.exists(D_file), f"file {D_file} does not exist "
I = np.load(I_file)
D = np.load(D_file)
assert I.shape == (sample, self.k), f"{I_file} shape mismatch"
assert D.shape == (sample, self.k), f"{D_file} shape mismatch"
return (D, I)
def _index_search(self, index_shard_prefix, xq, nprobe):
assert nprobe is not None
logging.info(
f"open sharded index: {index_shard_prefix}, {self.nshards}"
)
index = self._open_sharded_index(index_shard_prefix)
index_ivf = faiss.downcast_index(faiss.extract_index_ivf(index))
logging.info(f"setting nprobe to {nprobe}")
index_ivf.nprobe = nprobe
return index.search(xq, self.k)
def _index_nonsharded_search(self, index_file, xq, nprobe):
assert nprobe is not None
logging.info(f"index {index_file}")
assert os.path.exists(index_file), f"file {index_file} does not exist "
index = faiss.read_index(index_file, faiss.IO_FLAG_ONDISK_SAME_DIR)
logging.info(f"index size {index.ntotal} ")
index_ivf = faiss.downcast_index(faiss.extract_index_ivf(index))
logging.info(f"setting nprobe to {nprobe}")
index_ivf.nprobe = nprobe
return index.search(xq, self.k)
def _refine_distances(self, xq_ds, idx, xb_ds, I):
xq = xq_ds.get(idx).repeat(self.k, axis=0)
xb = xb_ds.get(I.reshape(-1))
if self.metric == faiss.METRIC_INNER_PRODUCT:
return (xq * xb).sum(axis=1).reshape(I.shape)
elif self.metric == faiss.METRIC_L2:
return ((xq - xb) ** 2).sum(axis=1).reshape(I.shape)
else:
raise ValueError(f"metric not supported {self.metric}")
def evaluate(self):
self._evaluate(
self.index_factory_fn,
self.index_file,
self.xq_index_file,
self.nprobe,
)
def _evaluate(self, index_factory_fn, index_file, xq_index_file, nprobe):
idx_a_file = f"{self.eval_dir}/idx_a.npy"
idx_b_gt_file = f"{self.eval_dir}/idx_b_gt.npy"
idx_b_ann_file = (
f"{self.eval_dir}/idx_b_ann_{index_factory_fn}_np{nprobe}.npy"
)
vecs_a_file = f"{self.eval_dir}/vecs_a.npy"
vecs_b_gt_file = f"{self.eval_dir}/vecs_b_gt.npy"
vecs_b_ann_file = (
f"{self.eval_dir}/vecs_b_ann_{index_factory_fn}_np{nprobe}.npy"
)
D_a_gt_file = f"{self.eval_dir}/D_a_gt.npy"
D_a_ann_file = (
f"{self.eval_dir}/D_a_ann_{index_factory_fn}_np{nprobe}.npy"
)
D_a_ann_refined_file = f"{self.eval_dir}/D_a_ann_refined_{index_factory_fn}_np{nprobe}.npy"
D_b_gt_file = f"{self.eval_dir}/D_b_gt.npy"
D_b_ann_file = (
f"{self.eval_dir}/D_b_ann_{index_factory_fn}_np{nprobe}.npy"
)
D_b_ann_gt_file = (
f"{self.eval_dir}/D_b_ann_gt_{index_factory_fn}_np{nprobe}.npy"
)
I_a_gt_file = f"{self.eval_dir}/I_a_gt.npy"
I_a_ann_file = (
f"{self.eval_dir}/I_a_ann_{index_factory_fn}_np{nprobe}.npy"
)
I_b_gt_file = f"{self.eval_dir}/I_b_gt.npy"
I_b_ann_file = (
f"{self.eval_dir}/I_b_ann_{index_factory_fn}_np{nprobe}.npy"
)
I_b_ann_gt_file = (
f"{self.eval_dir}/I_b_ann_gt_{index_factory_fn}_np{nprobe}.npy"
)
margin_gt_file = f"{self.eval_dir}/margin_gt.npy"
margin_refined_file = (
f"{self.eval_dir}/margin_refined_{index_factory_fn}_np{nprobe}.npy"
)
margin_ann_file = (
f"{self.eval_dir}/margin_ann_{index_factory_fn}_np{nprobe}.npy"
)
logging.info("exact search forward")
# xq -> xb AKA a -> b
D_a_gt, I_a_gt = self._cached_search(
self.evaluation_sample,
self.xq_ds,
self.xb_ds,
idx_a_file,
vecs_a_file,
I_a_gt_file,
D_a_gt_file,
)
idx_a = np.load(idx_a_file)
logging.info("approximate search forward")
D_a_ann, I_a_ann = self._cached_search(
self.evaluation_sample,
self.xq_ds,
self.xb_ds,
idx_a_file,
vecs_a_file,
I_a_ann_file,
D_a_ann_file,
index_file,
nprobe,
)
logging.info(
"calculate refined distances on approximate search forward"
)
if os.path.exists(D_a_ann_refined_file):
D_a_ann_refined = np.load(D_a_ann_refined_file)
assert D_a_ann.shape == D_a_ann_refined.shape
else:
D_a_ann_refined = self._refine_distances(
self.xq_ds, idx_a, self.xb_ds, I_a_ann
)
np.save(D_a_ann_refined_file, D_a_ann_refined)
if self.evaluate_by_margin:
k_extract = self.k
margin_threshold = 1.05
logging.info(
"exact search backward from the k_extract NN results of"
" forward search"
)
# xb -> xq AKA b -> a
D_a_b_gt = D_a_gt[:, :k_extract].ravel()
idx_b_gt = I_a_gt[:, :k_extract].ravel()
assert len(idx_b_gt) == self.evaluation_sample * k_extract
np.save(idx_b_gt_file, idx_b_gt)
# exact search
D_b_gt, _ = self._cached_search(
len(idx_b_gt),
self.xb_ds,
self.xq_ds,
idx_b_gt_file,
vecs_b_gt_file,
I_b_gt_file,
D_b_gt_file,
) # xb and xq ^^^ are inverted
logging.info("margin on exact search")
margin_gt = margin(
self.evaluation_sample,
idx_a,
idx_b_gt,
D_a_b_gt,
D_a_gt,
D_b_gt,
self.k,
k_extract,
margin_threshold,
)
np.save(margin_gt_file, margin_gt)
logging.info(
"exact search backward from the k_extract NN results of"
" approximate forward search"
)
D_a_b_refined = D_a_ann_refined[:, :k_extract].ravel()
idx_b_ann = I_a_ann[:, :k_extract].ravel()
assert len(idx_b_ann) == self.evaluation_sample * k_extract
np.save(idx_b_ann_file, idx_b_ann)
# exact search
D_b_ann_gt, _ = self._cached_search(
len(idx_b_ann),
self.xb_ds,
self.xq_ds,
idx_b_ann_file,
vecs_b_ann_file,
I_b_ann_gt_file,
D_b_ann_gt_file,
) # xb and xq ^^^ are inverted
logging.info("refined margin on approximate search")
margin_refined = margin(
self.evaluation_sample,
idx_a,
idx_b_ann,
D_a_b_refined,
D_a_gt, # not D_a_ann_refined(!)
D_b_ann_gt,
self.k,
k_extract,
margin_threshold,
)
np.save(margin_refined_file, margin_refined)
D_b_ann, I_b_ann = self._cached_search(
len(idx_b_ann),
self.xb_ds,
self.xq_ds,
idx_b_ann_file,
vecs_b_ann_file,
I_b_ann_file,
D_b_ann_file,
xq_index_file,
nprobe,
)
D_a_b_ann = D_a_ann[:, :k_extract].ravel()
logging.info("approximate search margin")
margin_ann = margin(
self.evaluation_sample,
idx_a,
idx_b_ann,
D_a_b_ann,
D_a_ann,
D_b_ann,
self.k,
k_extract,
margin_threshold,
)
np.save(margin_ann_file, margin_ann)
logging.info("intersection")
logging.info(I_a_gt)
logging.info(I_a_ann)
for i in range(1, self.k + 1):
logging.info(
f"{i}: {knn_intersection_measure(I_a_gt[:,:i], I_a_ann[:,:i])}"
)
logging.info(f"mean of gt distances: {D_a_gt.mean()}")
logging.info(f"mean of approx distances: {D_a_ann.mean()}")
logging.info(f"mean of refined distances: {D_a_ann_refined.mean()}")
logging.info("intersection cardinality frequencies")
logging.info(get_intersection_cardinality_frequencies(I_a_ann, I_a_gt))
logging.info("done")
def _knn_function(self, xq, xb, k, metric, thread_id=None):
try:
return faiss.knn_gpu(
self.all_gpu_resources[thread_id],
xq,
xb,
k,
metric=metric,
device=thread_id,
vectorsMemoryLimit=self.knn_vectors_memory_limit,
queriesMemoryLimit=self.knn_queries_memory_limit,
)
except Exception:
logging.info(f"knn_function failed: {xq.shape}, {xb.shape}")
raise
def _coarse_quantize(self, index_ivf, xq, nprobe):
assert nprobe <= index_ivf.quantizer.ntotal
quantizer = faiss.index_cpu_to_all_gpus(index_ivf.quantizer)
bs = 100_000
nq = len(xq)
q_assign = np.empty((nq, nprobe), dtype="int32")
for i0 in trange(0, nq, bs):
i1 = min(nq, i0 + bs)
_, q_assign_i = quantizer.search(xq[i0:i1], nprobe)
q_assign[i0:i1] = q_assign_i
return q_assign
def search(self):
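# For each query batch: read and (if the index has a pretransform) transform the queries,
# coarse-quantize them on GPU to obtain their nprobe list assignments, then run
# big_batch_search over the sharded inverted lists with a GPU knn function, checkpointing
# to CPfn and saving results to Ifn (neighbour ids) and Dfn (distances).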
logging.info(f"search: {self.knn_dir}")
slurm_job_id = os.environ.get("SLURM_JOB_ID")
ngpu = faiss.get_num_gpus()
logging.info(f"number of gpus: {ngpu}")
self.all_gpu_resources = [
faiss.StandardGpuResources() for _ in range(ngpu)
]
self._knn_function(
np.zeros((10, 10), dtype=np.float16),
np.zeros((10, 10), dtype=np.float16),
self.k,
metric=self.metric,
thread_id=0,
)
index = self._open_sharded_index()
index_ivf = faiss.downcast_index(faiss.extract_index_ivf(index))
logging.info(f"setting nprobe to {self.nprobe}")
index_ivf.nprobe = self.nprobe
# quantizer = faiss.index_cpu_to_all_gpus(index_ivf.quantizer)
for i in range(0, self.xq_ds.size, self.xq_bs):
Ifn = f"{self.knn_dir}/I{(i):010}_{self.knn_output_file_suffix}"
Dfn = f"{self.knn_dir}/D_approx{(i):010}_{self.knn_output_file_suffix}"
CPfn = f"{self.knn_dir}/CP{(i):010}_{self.knn_output_file_suffix}"
if slurm_job_id:
worker_record = (
self.knn_dir
+ f"/record_{(i):010}_{self.knn_output_file_suffix}.txt"
)
if not os.path.exists(worker_record):
logging.info(
f"creating record file {worker_record} and saving job"
f" id: {slurm_job_id}"
)
with open(worker_record, "w") as h:
h.write(slurm_job_id)
else:
old_slurm_id = open(worker_record, "r").read()
logging.info(
f"old job slurm id {old_slurm_id} and current job id:"
f" {slurm_job_id}"
)
if old_slurm_id == slurm_job_id:
if os.path.getsize(Ifn) == 0:
logging.info(
f"cleaning up zero length files {Ifn} and"
f" {Dfn}"
)
os.remove(Ifn)
os.remove(Dfn)
try:
if is_pretransform_index(index):
d = index.chain.at(0).d_out
else:
d = self.input_d
with open(Ifn, "xb") as f, open(Dfn, "xb") as g:
xq_i = np.empty(
shape=(self.xq_bs, d), dtype=np.float16
)
q_assign = np.empty(
(self.xq_bs, self.nprobe), dtype=np.int32
)
j = 0
quantizer = faiss.index_cpu_to_all_gpus(
index_ivf.quantizer
)
for xq_i_j in tqdm(
self._iterate_transformed(
self.xq_ds, i, min(100_000, self.xq_bs), np.float16
),
file=sys.stdout,
):
xq_i[j:j + xq_i_j.shape[0]] = xq_i_j
(
_,
q_assign[j:j + xq_i_j.shape[0]],
) = quantizer.search(xq_i_j, self.nprobe)
j += xq_i_j.shape[0]
assert j <= xq_i.shape[0]
if j == xq_i.shape[0]:
break
xq_i = xq_i[:j]
q_assign = q_assign[:j]
assert q_assign.shape == (xq_i.shape[0], index_ivf.nprobe)
del quantizer
logging.info(f"computing: {Ifn}")
logging.info(f"computing: {Dfn}")
prefetch_threads = faiss.get_num_gpus()
D_ann, I = big_batch_search(
index_ivf,
xq_i,
self.k,
verbose=10,
method="knn_function",
knn=self._knn_function,
threaded=faiss.get_num_gpus() * 8,
use_float16=True,
prefetch_threads=prefetch_threads,
computation_threads=faiss.get_num_gpus(),
q_assign=q_assign,
checkpoint=CPfn,
checkpoint_freq=7200, # in seconds
)
assert (
np.amin(I) >= 0
), f"{I}, there exists negative indices, check"
logging.info(f"saving: {Ifn}")
np.save(f, I)
logging.info(f"saving: {Dfn}")
np.save(g, D_ann)
if os.path.exists(CPfn):
logging.info(f"removing: {CPfn}")
os.remove(CPfn)
except FileExistsError:
logging.info(f"skipping {Ifn}, already exists")
logging.info(f"skipping {Dfn}, already exists")
continue
def _open_index_shard(self, fn):
if fn in self.index_shards:
index_shard = self.index_shards[fn]
else:
logging.info(f"open index shard: {fn}")
index_shard = faiss.read_index(
fn, faiss.IO_FLAG_MMAP | faiss.IO_FLAG_READ_ONLY
)
self.index_shards[fn] = index_shard
return index_shard
def _open_sharded_index(self, index_shard_prefix=None):
if index_shard_prefix is None:
index_shard_prefix = self.index_shard_prefix
if index_shard_prefix in self.index:
return self.index[index_shard_prefix]
assert os.path.exists(
self.index_template_file
), f"file {self.index_template_file} does not exist "
logging.info(f"open index template: {self.index_template_file}")
index = faiss.read_index(self.index_template_file)
index_ivf = faiss.downcast_index(faiss.extract_index_ivf(index))
ilv = faiss.InvertedListsPtrVector()
for i in range(self.nshards):
fn = f"{index_shard_prefix}{i}"
assert os.path.exists(fn), f"file {fn} does not exist "
logging.info(fn)
index_shard = self._open_index_shard(fn)
il = faiss.downcast_index(
faiss.extract_index_ivf(index_shard)
).invlists
ilv.push_back(il)
hsil = faiss.HStackInvertedLists(ilv.size(), ilv.data())
index_ivf.replace_invlists(hsil, False)
self.ivls[index_shard_prefix] = hsil
self.index[index_shard_prefix] = index
return index
def index_shard_stats(self):
for i in range(self.nshards):
fn = f"{self.index_shard_prefix}{i}"
assert os.path.exists(fn)
index = faiss.read_index(
fn, faiss.IO_FLAG_MMAP | faiss.IO_FLAG_READ_ONLY
)
index_ivf = faiss.downcast_index(faiss.extract_index_ivf(index))
il = index_ivf.invlists
il.print_stats()
def index_stats(self):
index = self._open_sharded_index()
index_ivf = faiss.downcast_index(faiss.extract_index_ivf(index))
il = index_ivf.invlists
list_sizes = [il.list_size(i) for i in range(il.nlist)]
logging.info(np.max(list_sizes))
logging.info(np.mean(list_sizes))
logging.info(np.argmax(list_sizes))
logging.info("index_stats:")
il.print_stats()
def consistency_check(self):
logging.info("consistency-check")
logging.info("index template...")
assert os.path.exists(self.index_template_file)
index = faiss.read_index(self.index_template_file)
offset = 0 # 2**24
assert self.shard_size > offset + SMALL_DATA_SAMPLE
logging.info("index shards...")
for i in range(self.nshards):
r = i * self.shard_size + offset
xb = next(self.xb_ds.iterate(r, SMALL_DATA_SAMPLE, np.float32))
fn = f"{self.index_shard_prefix}{i}"
assert os.path.exists(fn), f"There is no index shard file {fn}"
index = self._open_index_shard(fn)
index_ivf = faiss.downcast_index(faiss.extract_index_ivf(index))
index_ivf.nprobe = 1
_, I = index.search(xb, 100)
for j in range(SMALL_DATA_SAMPLE):
assert np.where(I[j] == j + r)[0].size > 0, (
f"I[j]: {I[j]}, j: {j}, i: {i}, shard_size:"
f" {self.shard_size}"
)
logging.info("merged index...")
index = self._open_sharded_index()
index_ivf = faiss.downcast_index(faiss.extract_index_ivf(index))
index_ivf.nprobe = 1
for i in range(self.nshards):
r = i * self.shard_size + offset
xb = next(self.xb_ds.iterate(r, SMALL_DATA_SAMPLE, np.float32))
_, I = index.search(xb, 100)
for j in range(SMALL_DATA_SAMPLE):
assert np.where(I[j] == j + r)[0].size > 0, (
f"I[j]: {I[j]}, j: {j}, i: {i}, shard_size:"
f" {self.shard_size}")
logging.info("search results...")
index_ivf.nprobe = self.nprobe
for i in range(0, self.xq_ds.size, self.xq_bs):
Ifn = f"{self.knn_dir}/I{i:010}_{self.index_factory_fn}_np{self.nprobe}.npy"
assert os.path.exists(Ifn)
assert os.path.getsize(Ifn) > 0, f"The file {Ifn} is empty."
logging.info(Ifn)
I = np.load(Ifn, mmap_mode="r")
assert I.shape[1] == self.k
assert I.shape[0] == min(self.xq_bs, self.xq_ds.size - i)
assert np.all(I[:, 1] >= 0)
Dfn = f"{self.knn_dir}/D_approx{i:010}_{self.index_factory_fn}_np{self.nprobe}.npy"
assert os.path.exists(Dfn)
assert os.path.getsize(Dfn) > 0, f"The file {Dfn} is empty."
logging.info(Dfn)
D = np.load(Dfn, mmap_mode="r")
assert D.shape == I.shape
xq = next(self.xq_ds.iterate(i, SMALL_DATA_SAMPLE, np.float32))
D_online, I_online = index.search(xq, self.k)
assert (
np.where(I[:SMALL_DATA_SAMPLE] == I_online)[0].size
/ (self.k * SMALL_DATA_SAMPLE)
> 0.95
), (
"the ratio is"
f" {np.where(I[:SMALL_DATA_SAMPLE] == I_online)[0].size / (self.k * SMALL_DATA_SAMPLE)}"
)
assert np.allclose(
D[:SMALL_DATA_SAMPLE].sum(axis=1),
D_online.sum(axis=1),
rtol=0.01,
), (
"the difference is"
f" {D[:SMALL_DATA_SAMPLE].sum(axis=1), D_online.sum(axis=1)}"
)
logging.info("done")

View File

@@ -0,0 +1,219 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import argparse
from utils import (
load_config,
add_group_args,
)
from offline_ivf import OfflineIVF
import faiss
from typing import List, Callable, Dict
import submitit
def join_lists_in_dict(poss: Dict[str, List[str]]) -> List[str]:
"""
Joins two lists of prod and non-prod values, checking if the prod value is already included.
If there is no non-prod list, it returns the prod list.
"""
if "non-prod" in poss.keys():
all_poss = poss["non-prod"]
if poss["prod"][-1] not in poss["non-prod"]:
all_poss += poss["prod"]
return all_poss
else:
return poss["prod"]
def main(
args: argparse.Namespace,
cfg: Dict[str, str],
nprobe: int,
index_factory_str: str,
) -> None:
oivf = OfflineIVF(cfg, args, nprobe, index_factory_str)
eval(f"oivf.{args.command}()")
def process_options_and_run_jobs(args: argparse.Namespace) -> None:
"""
If "--cluster_run", it launches an array of jobs to the cluster using the submitit library for all the index strings. In
the case of evaluate, it launches a job for each index string and nprobe pair. Otherwise, it launches a single job
that is run locally with the prod values for index string and nprobe.
"""
cfg = load_config(args.config)
index_strings = cfg["index"]
nprobes = cfg["nprobe"]
if args.command == "evaluate":
if args.cluster_run:
all_nprobes = join_lists_in_dict(nprobes)
all_index_strings = join_lists_in_dict(index_strings)
for index_factory_str in all_index_strings:
for nprobe in all_nprobes:
launch_job(main, args, cfg, nprobe, index_factory_str)
else:
launch_job(
main, args, cfg, nprobes["prod"][-1], index_strings["prod"][-1]
)
else:
if args.cluster_run:
all_index_strings = join_lists_in_dict(index_strings)
for index_factory_str in all_index_strings:
launch_job(
main, args, cfg, nprobes["prod"][-1], index_factory_str
)
else:
launch_job(
main, args, cfg, nprobes["prod"][-1], index_strings["prod"][-1]
)
def launch_job(
func: Callable,
args: argparse.Namespace,
cfg: Dict[str, str],
n_probe: int,
index_str: str,
) -> None:
"""
Launches an array of slurm jobs to the cluster using the submitit library.
"""
if args.cluster_run:
assert args.num_nodes >= 1
executor = submitit.AutoExecutor(folder=args.logs_dir)
executor.update_parameters(
nodes=args.num_nodes,
gpus_per_node=args.gpus_per_node,
cpus_per_task=args.cpus_per_task,
tasks_per_node=args.tasks_per_node,
name=args.job_name,
slurm_partition=args.partition,
slurm_time=70 * 60,
)
if args.slurm_constraint:
executor.update_parameters(slurm_constraint=args.slurm_constraint)
job = executor.submit(func, args, cfg, n_probe, index_str)
print(f"Job id: {job.job_id}")
else:
func(args, cfg, n_probe, index_str)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
group = parser.add_argument_group("general")
add_group_args(group, "--command", required=True, help="command to run")
add_group_args(
group,
"--config",
required=True,
help="config yaml with the dataset specs",
)
add_group_args(
group, "--nt", type=int, default=96, help="nb search threads"
)
add_group_args(
group,
"--no_residuals",
action="store_false",
help="set index.by_residual to False during train index.",
)
group = parser.add_argument_group("slurm_job")
add_group_args(
group,
"--cluster_run",
action="store_true",
help=" if True, runs in cluster",
)
add_group_args(
group,
"--job_name",
type=str,
default="oivf",
help="cluster job name",
)
add_group_args(
group,
"--num_nodes",
type=int,
default=1,
help="num of nodes per job",
)
add_group_args(
group,
"--tasks_per_node",
type=int,
default=1,
help="tasks per job",
)
add_group_args(
group,
"--gpus_per_node",
type=int,
default=8,
help="cluster job name",
)
add_group_args(
group,
"--cpus_per_task",
type=int,
default=80,
help="cluster job name",
)
add_group_args(
group,
"--logs_dir",
type=str,
default="/checkpoint/marialomeli/offline_faiss/logs",
help="cluster job name",
)
add_group_args(
group,
"--slurm_constraint",
type=str,
default=None,
help="can be volta32gb for the fair cluster",
)
add_group_args(
group,
"--partition",
type=str,
default="learnlab",
help="specify which partition to use if ran on cluster with job arrays",
choices=[
"learnfair",
"devlab",
"scavenge",
"learnlab",
"nllb",
"seamless",
"seamless_medium",
"learnaccel",
"onellm_low",
"learn",
"scavenge",
],
)
group = parser.add_argument_group("dataset")
add_group_args(group, "--xb", required=True, help="database vectors")
add_group_args(group, "--xq", help="query vectors")
args = parser.parse_args()
print("args:", args)
faiss.omp_set_num_threads(args.nt)
process_options_and_run_jobs(args=args)

View File

@@ -0,0 +1,181 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import argparse
import yaml
import numpy as np
from typing import Dict, List, Optional
OIVF_TEST_ARGS: List[str] = [
"--config",
"--xb",
"--xq",
"--command",
"--cluster_run",
"--no_residuals",
]
def get_test_parser(args) -> argparse.ArgumentParser:
parser = argparse.ArgumentParser()
for arg in args:
parser.add_argument(arg)
return parser
class TestDataCreator:
def __init__(
self,
tempdir: str,
dimension: int,
data_type: np.dtype,
index_factory: Optional[List] = ["OPQ4,IVF256,PQ4"],
training_sample: Optional[int] = 9984,
index_shard_size: Optional[int] = 1000,
query_batch_size: Optional[int] = 1000,
evaluation_sample: Optional[int] = 100,
num_files: Optional[int] = None,
file_size: Optional[int] = None,
file_sizes: Optional[List] = None,
nprobe: Optional[int] = 64,
k: Optional[int] = 10,
metric: Optional[str] = "METRIC_L2",
normalise: Optional[bool] = False,
with_queries_ds: Optional[bool] = False,
evaluate_by_margin: Optional[bool] = False,
) -> None:
self.tempdir = tempdir
self.dimension = dimension
self.data_type = np.dtype(data_type).name
self.index_factory = {"prod": index_factory}
if file_size and num_files:
self.file_sizes = [file_size for _ in range(num_files)]
elif file_sizes:
self.file_sizes = file_sizes
else:
raise ValueError("no file sizes provided")
self.num_files = len(self.file_sizes)
self.training_sample = training_sample
self.index_shard_size = index_shard_size
self.query_batch_size = query_batch_size
self.evaluation_sample = evaluation_sample
self.nprobe = {"prod": [nprobe]}
self.k = k
self.metric = metric
self.normalise = normalise
self.config_file = self.tempdir + "/config_test.yaml"
self.ds_name = "my_test_data"
self.qs_name = "my_queries_data"
self.evaluate_by_margin = evaluate_by_margin
self.with_queries_ds = with_queries_ds
def create_test_data(self) -> None:
datafiles = self._create_data_files()
files_info = []
for i, file in enumerate(datafiles):
files_info.append(
{
"dtype": self.data_type,
"format": "npy",
"name": file,
"size": self.file_sizes[i],
}
)
config_for_yaml = {
"d": self.dimension,
"output": self.tempdir,
"index": self.index_factory,
"nprobe": self.nprobe,
"k": self.k,
"normalise": self.normalise,
"metric": self.metric,
"training_sample": self.training_sample,
"evaluation_sample": self.evaluation_sample,
"index_shard_size": self.index_shard_size,
"query_batch_size": self.query_batch_size,
"datasets": {
self.ds_name: {
"root": self.tempdir,
"size": sum(self.file_sizes),
"files": files_info,
}
},
}
if self.evaluate_by_margin:
config_for_yaml["evaluate_by_margin"] = self.evaluate_by_margin
q_datafiles = self._create_data_files("my_q_data")
q_files_info = []
for i, file in enumerate(q_datafiles):
q_files_info.append(
{
"dtype": self.data_type,
"format": "npy",
"name": file,
"size": self.file_sizes[i],
}
)
if self.with_queries_ds:
config_for_yaml["datasets"][self.qs_name] = {
"root": self.tempdir,
"size": sum(self.file_sizes),
"files": q_files_info,
}
self._create_config_yaml(config_for_yaml)
def setup_cli(self, command="consistency_check") -> argparse.Namespace:
parser = get_test_parser(OIVF_TEST_ARGS)
if self.with_queries_ds:
return parser.parse_args(
[
"--xb",
self.ds_name,
"--config",
self.config_file,
"--command",
command,
"--xq",
self.qs_name,
]
)
return parser.parse_args(
[
"--xb",
self.ds_name,
"--config",
self.config_file,
"--command",
command,
]
)
def _create_data_files(self, name_of_file="my_data") -> List[str]:
"""
Creates a dataset "my_test_data" with number of files (num_files), using padding in the files
name. If self.with_queries is True, it adds an extra dataset "my_queries_data" with the same number of files
as the "my_test_data". The default name for embeddings files is "my_data" + <padding>.npy.
"""
filenames = []
for i, file_size in enumerate(self.file_sizes):
# np.random.seed(i)
db_vectors = np.random.random((file_size, self.dimension)).astype(
self.data_type
)
filename = name_of_file + f"{i:02}" + ".npy"
filenames.append(filename)
np.save(self.tempdir + "/" + filename, db_vectors)
return filenames
def _create_config_yaml(self, dict_file: Dict[str, str]) -> None:
"""
Creates a yaml file in dir (can be a temporary dir for tests).
"""
filename = self.tempdir + "/config_test.yaml"
with open(filename, "w") as file:
yaml.dump(dict_file, file, default_flow_style=False)

View File

@@ -0,0 +1,95 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import numpy as np
import os
from typing import Dict
import yaml
import faiss
from faiss.contrib.datasets import SyntheticDataset
def load_config(config):
assert os.path.exists(config)
with open(config, "r") as f:
return yaml.safe_load(f)
def faiss_sanity_check():
ds = SyntheticDataset(256, 0, 100, 100)
xq = ds.get_queries()
xb = ds.get_database()
index_cpu = faiss.IndexFlat(ds.d)
index_gpu = faiss.index_cpu_to_all_gpus(index_cpu)
index_cpu.add(xb)
index_gpu.add(xb)
D_cpu, I_cpu = index_cpu.search(xq, 10)
D_gpu, I_gpu = index_gpu.search(xq, 10)
assert np.all(I_cpu == I_gpu), "faiss sanity check failed"
assert np.all(np.isclose(D_cpu, D_gpu)), "faiss sanity check failed"
def margin(sample, idx_a, idx_b, D_a_b, D_a, D_b, k, k_extract, threshold):
"""
two datasets: xa, xb; n = number of pairs
idx_a - (np,) - query vector ids in xa
idx_b - (np,) - query vector ids in xb
D_a_b - (np,) - pairwise distances between xa[idx_a] and xb[idx_b]
D_a - (np, k) - distances between vectors xa[idx_a] and corresponding nearest neighbours in xb
D_b - (np, k) - distances between vectors xb[idx_b] and corresponding nearest neighbours in xa
k - k nearest neighbours used for margin
k_extract - number of nearest neighbours of each query in xb we consider for margin calculation and filtering
threshold - margin threshold
"""
n = sample
nk = n * k_extract
assert idx_a.shape == (n,)
idx_a_k = idx_a.repeat(k_extract)
assert idx_a_k.shape == (nk,)
assert idx_b.shape == (nk,)
assert D_a_b.shape == (nk,)
assert D_a.shape == (n, k)
assert D_b.shape == (nk, k)
mean_a = np.mean(D_a, axis=1)
assert mean_a.shape == (n,)
mean_a_k = mean_a.repeat(k_extract)
assert mean_a_k.shape == (nk,)
mean_b = np.mean(D_b, axis=1)
assert mean_b.shape == (nk,)
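# margin score: D_a_b divided by the average of the mean k-NN distances of the two sides,
# i.e. 2 * D_a_b / (mean_a_k + mean_b); pairs scoring above `threshold` are printed below.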
margin = 2 * D_a_b / (mean_a_k + mean_b)
above_threshold = margin > threshold
print(np.count_nonzero(above_threshold))
print(idx_a_k[above_threshold])
print(idx_b[above_threshold])
print(margin[above_threshold])
return margin
def add_group_args(group, *args, **kwargs):
return group.add_argument(*args, **kwargs)
def get_intersection_cardinality_frequencies(
I: np.ndarray, I_gt: np.ndarray
) -> Dict[int, int]:
"""
Computes the frequencies for the cardinalities of the intersection of neighbour indices.
"""
nq = I.shape[0]
res = []
for ell in range(nq):
res.append(len(np.intersect1d(I[ell, :], I_gt[ell, :])))
values, counts = np.unique(res, return_counts=True)
return dict(zip(values, counts))
def is_pretransform_index(index):
if index.__class__ == faiss.IndexPreTransform:
assert hasattr(index, "chain")
return True
else:
assert not hasattr(index, "chain")
return False

View File

@@ -0,0 +1,13 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
cmake_minimum_required(VERSION 3.17 FATAL_ERROR)
project (ROCKSDB_IVF)
set(CMAKE_BUILD_TYPE Debug)
find_package(faiss REQUIRED)
find_package(RocksDB REQUIRED)
add_executable(demo_rocksdb_ivf demo_rocksdb_ivf.cpp RocksDBInvertedLists.cpp)
target_link_libraries(demo_rocksdb_ivf faiss RocksDB::rocksdb)

View File

@@ -0,0 +1,23 @@
# Storing Faiss inverted lists in RocksDB
Demo of storing the inverted lists of any IVF index in RocksDB or any similar key-value store which supports the prefix scan operation.
# How to build
We use conda to create the build environment for simplicity. Only tested on Linux x86.
```
conda create -n rocksdb_ivf
conda activate rocksdb_ivf
conda install pytorch::faiss-cpu conda-forge::rocksdb cmake make gxx_linux-64 sysroot_linux-64
cd ~/faiss/demos/rocksdb_ivf
cmake -B build .
make -C build -j$(nproc)
```
# Run the example
```
cd ~/faiss/demos/rocksdb_ivf/build
./demo_rocksdb_ivf test_db
```

View File

@@ -0,0 +1,114 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#include "RocksDBInvertedLists.h"
#include <faiss/impl/FaissAssert.h>
using namespace faiss;
namespace faiss_rocksdb {
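// Key layout in RocksDB: [size_t list_no][idx_t id] -> value: the code_size bytes of the
// vector's code. Iterating over one inverted list is a prefix scan over keys that start
// with list_no.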
RocksDBInvertedListsIterator::RocksDBInvertedListsIterator(
rocksdb::DB* db,
size_t list_no,
size_t code_size)
: InvertedListsIterator(),
it(db->NewIterator(rocksdb::ReadOptions())),
list_no(list_no),
code_size(code_size),
codes(code_size) {
it->Seek(rocksdb::Slice(
reinterpret_cast<const char*>(&list_no), sizeof(size_t)));
}
bool RocksDBInvertedListsIterator::is_available() const {
return it->Valid() &&
it->key().starts_with(rocksdb::Slice(
reinterpret_cast<const char*>(&list_no), sizeof(size_t)));
}
void RocksDBInvertedListsIterator::next() {
it->Next();
}
std::pair<idx_t, const uint8_t*> RocksDBInvertedListsIterator::
get_id_and_codes() {
idx_t id =
*reinterpret_cast<const idx_t*>(&it->key().data()[sizeof(size_t)]);
assert(code_size == it->value().size());
return {id, reinterpret_cast<const uint8_t*>(it->value().data())};
}
RocksDBInvertedLists::RocksDBInvertedLists(
const char* db_directory,
size_t nlist,
size_t code_size)
: InvertedLists(nlist, code_size) {
use_iterator = true;
rocksdb::Options options;
options.create_if_missing = true;
rocksdb::DB* db;
rocksdb::Status status = rocksdb::DB::Open(options, db_directory, &db);
db_ = std::unique_ptr<rocksdb::DB>(db);
assert(status.ok());
}
size_t RocksDBInvertedLists::list_size(size_t /*list_no*/) const {
FAISS_THROW_MSG("list_size is not supported");
}
const uint8_t* RocksDBInvertedLists::get_codes(size_t /*list_no*/) const {
FAISS_THROW_MSG("get_codes is not supported");
}
const idx_t* RocksDBInvertedLists::get_ids(size_t /*list_no*/) const {
FAISS_THROW_MSG("get_ids is not supported");
}
size_t RocksDBInvertedLists::add_entries(
size_t list_no,
size_t n_entry,
const idx_t* ids,
const uint8_t* code) {
rocksdb::WriteOptions wo;
std::vector<char> key(sizeof(size_t) + sizeof(idx_t));
memcpy(key.data(), &list_no, sizeof(size_t));
for (size_t i = 0; i < n_entry; i++) {
memcpy(key.data() + sizeof(size_t), ids + i, sizeof(idx_t));
rocksdb::Status status = db_->Put(
wo,
rocksdb::Slice(key.data(), key.size()),
rocksdb::Slice(
reinterpret_cast<const char*>(code + i * code_size),
code_size));
assert(status.ok());
}
return 0; // ignored
}
void RocksDBInvertedLists::update_entries(
size_t /*list_no*/,
size_t /*offset*/,
size_t /*n_entry*/,
const idx_t* /*ids*/,
const uint8_t* /*code*/) {
FAISS_THROW_MSG("update_entries is not supported");
}
void RocksDBInvertedLists::resize(size_t /*list_no*/, size_t /*new_size*/) {
FAISS_THROW_MSG("resize is not supported");
}
InvertedListsIterator* RocksDBInvertedLists::get_iterator(
size_t list_no,
void* inverted_list_context) const {
return new RocksDBInvertedListsIterator(db_.get(), list_no, code_size);
}
} // namespace faiss_rocksdb

View File

@@ -0,0 +1,67 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#pragma once
#include <faiss/invlists/InvertedLists.h>
#include <rocksdb/db.h>
namespace faiss_rocksdb {
struct RocksDBInvertedListsIterator : faiss::InvertedListsIterator {
RocksDBInvertedListsIterator(
rocksdb::DB* db,
size_t list_no,
size_t code_size);
virtual bool is_available() const override;
virtual void next() override;
virtual std::pair<faiss::idx_t, const uint8_t*> get_id_and_codes() override;
private:
std::unique_ptr<rocksdb::Iterator> it;
size_t list_no;
size_t code_size;
std::vector<uint8_t> codes; // buffer for returning codes in next()
};
struct RocksDBInvertedLists : faiss::InvertedLists {
RocksDBInvertedLists(
const char* db_directory,
size_t nlist,
size_t code_size);
size_t list_size(size_t list_no) const override;
const uint8_t* get_codes(size_t list_no) const override;
const faiss::idx_t* get_ids(size_t list_no) const override;
size_t add_entries(
size_t list_no,
size_t n_entry,
const faiss::idx_t* ids,
const uint8_t* code) override;
void update_entries(
size_t list_no,
size_t offset,
size_t n_entry,
const faiss::idx_t* ids,
const uint8_t* code) override;
void resize(size_t list_no, size_t new_size) override;
faiss::InvertedListsIterator* get_iterator(
size_t list_no,
void* inverted_list_context) const override;
private:
std::unique_ptr<rocksdb::DB> db_;
};
} // namespace faiss_rocksdb

View File

@@ -0,0 +1,88 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#include <exception>
#include <iostream>
#include <memory>
#include "RocksDBInvertedLists.h"
#include <faiss/IndexFlat.h>
#include <faiss/IndexIVFFlat.h>
#include <faiss/impl/AuxIndexStructures.h>
#include <faiss/impl/FaissException.h>
#include <faiss/utils/random.h>
using namespace faiss;
int main(int argc, char* argv[]) {
try {
if (argc != 2) {
std::cerr << "missing db directory argument" << std::endl;
return -1;
}
size_t d = 128;
size_t nlist = 100;
IndexFlatL2 quantizer(d);
IndexIVFFlat index(&quantizer, d, nlist);
faiss_rocksdb::RocksDBInvertedLists ril(
argv[1], nlist, index.code_size);
index.replace_invlists(&ril, false);
idx_t nb = 10000;
std::vector<float> xb(d * nb);
float_rand(xb.data(), d * nb, 12345);
std::vector<idx_t> xids(nb);
std::iota(xids.begin(), xids.end(), 0);
index.train(nb, xb.data());
index.add_with_ids(nb, xb.data(), xids.data());
idx_t nq = 20; // nb;
index.nprobe = 2;
std::cout << "search" << std::endl;
idx_t k = 5;
std::vector<float> distances(nq * k);
std::vector<idx_t> labels(nq * k, -1);
index.search(
nq, xb.data(), k, distances.data(), labels.data(), nullptr);
for (idx_t iq = 0; iq < nq; iq++) {
std::cout << iq << ": ";
for (auto j = 0; j < k; j++) {
std::cout << labels[iq * k + j] << " " << distances[iq * k + j]
<< " | ";
}
std::cout << std::endl;
}
std::cout << std::endl << "range search" << std::endl;
float range = 15.0f;
RangeSearchResult result(nq);
index.range_search(nq, xb.data(), range, &result);
for (idx_t iq = 0; iq < nq; iq++) {
std::cout << iq << ": ";
for (auto j = result.lims[iq]; j < result.lims[iq + 1]; j++) {
std::cout << result.labels[j] << " " << result.distances[j]
<< " | ";
}
std::cout << std::endl;
}
} catch (FaissException& e) {
std::cerr << e.what() << '\n';
} catch (std::exception& e) {
std::cerr << e.what() << '\n';
} catch (...) {
std::cerr << "Unrecognized exception!\n";
}
return 0;
}