Initial commit

This commit is contained in:
yichuan520030910320
2025-06-30 09:05:05 +00:00
commit 46f6cc100b
1231 changed files with 278432 additions and 0 deletions

View File

View File

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,272 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import hashlib
import io
import json
import logging
import os
import pickle
from dataclasses import dataclass
from typing import Any, Dict, List, Optional
from zipfile import ZipFile
import faiss # @manual=//faiss/python:pyfaiss
import numpy as np
import submitit
from faiss.contrib.datasets import ( # @manual=//faiss/contrib:faiss_contrib
dataset_from_name,
)
logger = logging.getLogger(__name__)
# Combine an RCQ coarse quantizer with an ITQ encoder into a single
# Faiss index object.
def merge_rcq_itq(
    # pyre-ignore[11]: `faiss.ResidualCoarseQuantizer` is not defined as a type
    rcq_coarse_quantizer: faiss.ResidualCoarseQuantizer,
    itq_encoder: faiss.IndexPreTransform,
    # pyre-ignore[11]: `faiss.IndexIVFSpectralHash` is not defined as a type.
) -> faiss.IndexIVFSpectralHash:
    """Merge an RCQ coarse quantizer and an ITQ encoder into one
    IndexIVFSpectralHash whose vector transform is the ITQ encoder."""
    nbit = itq_encoder.sa_code_size() * 8
    # pyre-ignore[16]: `faiss` has no attribute `IndexIVFSpectralHash`.
    merged = faiss.IndexIVFSpectralHash(
        rcq_coarse_quantizer,
        rcq_coarse_quantizer.d,
        rcq_coarse_quantizer.ntotal,
        nbit,
        1000000,  # larger than the magnitude of the vectors
    )
    merged.replace_vt(itq_encoder)
    return merged
@dataclass
class BenchmarkIO:
    """Local-filesystem I/O backend for benchmark artifacts (datasets,
    numpy arrays, json metadata, Faiss indexes, zipped result files).

    The blobstore methods are no-ops in the open-source version; a
    subclass may override them to add remote storage.
    """
    path: str  # local path
    def __init__(self, path: str):
        self.path = path
        # memoizes datasets loaded through get_dataset(), keyed by
        # descriptor (and by tablename for the shared standard datasets)
        self.cached_ds: Dict[Any, Any] = {}
    def clone(self):
        """Return a new BenchmarkIO rooted at the same local path.

        The dataset cache is NOT shared with the clone.
        """
        return BenchmarkIO(path=self.path)
    def get_local_filepath(self, filename):
        """Resolve `filename` under `self.path`, shortening over-long names.

        Names longer than 184 chars are truncated and disambiguated with
        a sha256 of the full name to stay under filesystem name limits.
        """
        if len(filename) > 184:
            fn, ext = os.path.splitext(filename)
            filename = (
                fn[:184] + hashlib.sha256(filename.encode()).hexdigest() + ext
            )
        return os.path.join(self.path, filename)
    def get_remote_filepath(self, filename) -> Optional[str]:
        # no remote storage in the open-source version
        return None
    def download_file_from_blobstore(
        self,
        filename: str,
        bucket: Optional[str] = None,
        path: Optional[str] = None,
    ):
        # local-only: nothing to download, just resolve the local path
        return self.get_local_filepath(filename)
    def upload_file_to_blobstore(
        self,
        filename: str,
        bucket: Optional[str] = None,
        path: Optional[str] = None,
        overwrite: bool = False,
    ):
        # local-only: nothing to upload
        pass
    def file_exist(self, filename: str):
        """Return True if `filename` exists under the local path."""
        fn = self.get_local_filepath(filename)
        exists = os.path.exists(fn)
        # NOTE(review): "(unknown)" looks like a placeholder where the
        # filename was meant to be logged — confirm against upstream.
        logger.info(f"(unknown) {exists=}")
        return exists
    def read_file(self, filename: str, keys: List[str]):
        """Read entries `keys` from the zip archive `filename`.

        "D", "I", "R", "lims" entries are loaded as numpy arrays;
        "P" is loaded as json. Any other key raises AssertionError.
        Returns the values in the same order as `keys`.
        """
        fn = self.download_file_from_blobstore(filename)
        logger.info(f"Loading file {fn}")
        results = []
        with ZipFile(fn, "r") as zip_file:
            for key in keys:
                with zip_file.open(key, "r") as f:
                    if key in ["D", "I", "R", "lims"]:
                        results.append(np.load(f))
                    elif key in ["P"]:
                        # zip members are binary streams; wrap for json
                        t = io.TextIOWrapper(f)
                        results.append(json.load(t))
                    else:
                        raise AssertionError()
        return results
    def write_file(
        self,
        filename: str,
        keys: List[str],
        values: List[Any],
        overwrite: bool = False,
    ):
        """Write paired `keys`/`values` into the zip archive `filename`.

        Mirrors read_file: arrays for "D", "I", "R", "lims", json for
        "P". The archive is then (no-op here) uploaded to the blobstore.
        """
        fn = self.get_local_filepath(filename)
        with ZipFile(fn, "w") as zip_file:
            for key, value in zip(keys, values, strict=True):
                # force_zip64 allows members larger than 2 GiB
                with zip_file.open(key, "w", force_zip64=True) as f:
                    if key in ["D", "I", "R", "lims"]:
                        np.save(f, value)
                    elif key in ["P"]:
                        # write_through flushes json text straight to the zip
                        t = io.TextIOWrapper(f, write_through=True)
                        json.dump(value, t)
                    else:
                        raise AssertionError()
        self.upload_file_to_blobstore(filename, overwrite=overwrite)
    def get_dataset(self, dataset):
        """Load (and cache) the vectors described by a DatasetDescriptor.

        Three sources, selected by dataset.namespace:
        - "std_t"/"std_d"/"std_q": train/database/query split of a
          standard dataset via faiss.contrib dataset_from_name()
        - "syn": deterministic synthetic data; tablename is "<d>_<seed>"
        - otherwise: a local .npy file named by tablename
        """
        if dataset not in self.cached_ds:
            if (
                dataset.namespace is not None
                and dataset.namespace[:4] == "std_"
            ):
                # the underlying standard dataset is cached once per
                # tablename and shared by the t/d/q splits
                if dataset.tablename not in self.cached_ds:
                    self.cached_ds[dataset.tablename] = dataset_from_name(
                        dataset.tablename,
                    )
                # 5th char of the namespace selects the split
                p = dataset.namespace[4]
                if p == "t":
                    self.cached_ds[dataset] = self.cached_ds[
                        dataset.tablename
                    ].get_train(dataset.num_vectors)
                elif p == "d":
                    self.cached_ds[dataset] = self.cached_ds[
                        dataset.tablename
                    ].get_database()
                elif p == "q":
                    self.cached_ds[dataset] = self.cached_ds[
                        dataset.tablename
                    ].get_queries()
                else:
                    raise ValueError
            elif dataset.namespace == "syn":
                d, seed = dataset.tablename.split("_")
                d = int(d)
                seed = int(seed)
                n = dataset.num_vectors
                # based on faiss.contrib.datasets.SyntheticDataset
                d1 = 10
                rs = np.random.RandomState(seed)
                x = rs.normal(size=(n, d1))
                x = np.dot(x, rs.rand(d1, d))
                x = x * (rs.rand(d) * 4 + 0.1)
                x = np.sin(x)
                x = x.astype(np.float32)
                self.cached_ds[dataset] = x
            else:
                # local file: mmap to avoid loading the full array, then
                # copy only the requested prefix into memory
                self.cached_ds[dataset] = self.read_nparray(
                    os.path.join(self.path, dataset.tablename),
                    mmap_mode="r",
                )[: dataset.num_vectors].copy()
        return self.cached_ds[dataset]
    def read_nparray(
        self,
        filename: str,
        mmap_mode: Optional[str] = None,
    ):
        """Load a numpy array from `filename` (optionally memory-mapped)."""
        fn = self.download_file_from_blobstore(filename)
        logger.info(f"Loading nparray from {fn}")
        nparray = np.load(fn, mmap_mode=mmap_mode)
        logger.info(f"Loaded nparray {nparray.shape} from {fn}")
        return nparray
    def write_nparray(
        self,
        nparray: np.ndarray,
        filename: str,
    ):
        """Save `nparray` to `filename` and (no-op) upload it."""
        fn = self.get_local_filepath(filename)
        logger.info(f"Saving nparray {nparray.shape} to {fn}")
        np.save(fn, nparray)
        self.upload_file_to_blobstore(filename)
    def read_json(
        self,
        filename: str,
    ):
        """Load and return the json document stored in `filename`."""
        fn = self.download_file_from_blobstore(filename)
        logger.info(f"Loading json {fn}")
        with open(fn, "r") as fp:
            json_dict = json.load(fp)
        logger.info(f"Loaded json {json_dict} from {fn}")
        return json_dict
    def write_json(
        self,
        json_dict: dict[str, Any],
        filename: str,
        overwrite: bool = False,
    ):
        """Save `json_dict` to `filename` and (no-op) upload it."""
        fn = self.get_local_filepath(filename)
        logger.info(f"Saving json {json_dict} to {fn}")
        with open(fn, "w") as fp:
            json.dump(json_dict, fp)
        self.upload_file_to_blobstore(filename, overwrite=overwrite)
    def read_index(
        self,
        filename: str,
        bucket: Optional[str] = None,
        path: Optional[str] = None,
    ):
        """Load a Faiss index from `filename`.

        .faiss/.codec/.index files are read directly; .pkl files are
        expected to hold a pickled (RCQ coarse quantizer, ITQ encoder)
        pair which is merged into one index.

        NOTE(review): any other extension falls through both branches and
        raises UnboundLocalError on `index` — confirm that is intended.
        """
        fn = self.download_file_from_blobstore(filename, bucket, path)
        logger.info(f"Loading index {fn}")
        ext = os.path.splitext(fn)[1]
        if ext in [".faiss", ".codec", ".index"]:
            index = faiss.read_index(fn)
        elif ext == ".pkl":
            # SECURITY: pickle.load on a file — only use with trusted data
            with open(fn, "rb") as model_file:
                model = pickle.load(model_file)
                rcq_coarse_quantizer, itq_encoder = model["model"]
                index = merge_rcq_itq(rcq_coarse_quantizer, itq_encoder)
        logger.info(f"Loaded index from {fn}")
        return index
    def write_index(
        self,
        index: faiss.Index,
        filename: str,
    ):
        """Save `index` to `filename`; return the file size in bytes."""
        fn = self.get_local_filepath(filename)
        logger.info(f"Saving index to {fn}")
        faiss.write_index(index, fn)
        self.upload_file_to_blobstore(filename)
        assert os.path.exists(fn)
        return os.path.getsize(fn)
    def launch_jobs(self, func, params, local=True):
        """Run `func` over `params`, either in-process or via submitit.

        With local=True this is a plain synchronous map. Otherwise jobs
        are submitted to a slurm cluster (hard-coded folder/partition/
        constraint values — environment-specific) and results gathered.
        """
        if local:
            results = [func(p) for p in params]
            return results
        logger.info(f"launching {len(params)} jobs")
        executor = submitit.AutoExecutor(folder="/checkpoint/gsz/jobs")
        executor.update_parameters(
            nodes=1,
            gpus_per_node=8,
            cpus_per_task=80,
            # mem_gb=640,
            tasks_per_node=1,
            name="faiss_benchmark",
            slurm_array_parallelism=512,
            slurm_partition="scavenge",
            slurm_time=4 * 60,
            slurm_constraint="bldg1",
        )
        jobs = executor.map_array(func, params)
        logger.info(f"launched {len(jobs)} jobs")
        for job, param in zip(jobs, params):
            logger.info(f"{job.job_id=} {param[0]=}")
        results = [job.result() for job in jobs]
        print(f"received {len(results)} results")
        return results

View File

@@ -0,0 +1,379 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import logging
import os
from dataclasses import dataclass
from typing import Any, Dict, List, Optional
import faiss # @manual=//faiss/python:pyfaiss
from .benchmark_io import BenchmarkIO
from .utils import timer
logger = logging.getLogger(__name__)
# Important: filenames end with "." and carry no extension (npy, codec,
# index); when writing files, append the extension yourself, e.g.
# filename + "npy".
@dataclass
class IndexDescriptorClassic:
    """Legacy description of an index to benchmark.

    Exactly one of `path` (a prebuilt index/codec on disk) or `factory`
    (a Faiss index_factory string to build from) should be set.
    """
    bucket: Optional[str] = None
    # either path or factory should be set,
    # but not both at the same time.
    path: Optional[str] = None
    factory: Optional[str] = None
    codec_alias: Optional[str] = None
    construction_params: Optional[List[Dict[str, int]]] = None
    search_params: Optional[Dict[str, int]] = None
    # range metric definitions
    # key: name
    # value: one of the following:
    #
    # radius
    #    [0..radius) -> 1
    #    [radius..inf) -> 0
    #
    # [[radius1, score1], ...]
    #    [0..radius1) -> score1
    #    [radius1..radius2) -> score2
    #
    # [[radius1_from, radius1_to, score1], ...]
    #    [radius1_from, radius1_to) -> score1,
    #    [radius2_from, radius2_to) -> score2
    range_metrics: Optional[Dict[str, Any]] = None
    radius: Optional[float] = None
    training_size: Optional[int] = None
    def __hash__(self):
        # hash the full repr so descriptors can be used as dict keys
        return hash(str(self))
@dataclass
class DatasetDescriptor:
    """Describes a source of vectors (training, database or query set)."""
    # namespace possible values:
    # 1. a hive namespace
    # 2. 'std_t', 'std_d', 'std_q' for the standard datasets
    #    via faiss.contrib.datasets.dataset_from_name()
    #    t - training, d - database, q - queries
    #    eg. "std_t"
    # 3. 'syn' for synthetic data
    # 4. None for local files
    namespace: Optional[str] = None
    # tablename possible values, corresponding to the
    # namespace value above:
    # 1. a hive table name
    # 2. name of the standard dataset as recognized
    #    by faiss.contrib.datasets.dataset_from_name()
    #    eg. "bigann1M"
    # 3. d_seed, eg. 128_1234 for 128 dimensional vectors
    #    with seed 1234
    # 4. a local file name (relative to benchmark_io.path)
    tablename: Optional[str] = None
    # partition names and values for hive
    # eg. ["ds=2021-09-01"]
    partitions: Optional[List[str]] = None
    # number of vectors to load from the dataset
    num_vectors: Optional[int] = None
    embedding_column: Optional[str] = None
    # only when the embedding column is a map
    embedding_column_key: Optional[Any] = None
    embedding_id_column: Optional[str] = None
    # filters on the dataset where each filter is a
    # string rep of a filter expression
    filters: Optional[List[str]] = None
    # unused in open-source
    splits_distribution: Optional[List[List[bytes]]] = None
    # unused in open-source
    splits: Optional[List[bytes]] = None
    # unused in open-source
    serialized_df: Optional[str] = None
    sampling_rate: Optional[float] = None
    # sampling column for xdb
    sampling_column: Optional[str] = None
    # blob store
    bucket: Optional[str] = None
    path: Optional[str] = None
    # desc_name: cached result of get_filename()
    desc_name: Optional[str] = None
    normalize_L2: bool = False
    def __hash__(self):
        # descriptors hash by their canonical filename
        return hash(self.get_filename())
    def get_filename(
        self,
        prefix: Optional[str] = None,
    ) -> str:
        """Build (and cache) a filename-safe identifier for this dataset.

        The name ends with "." and carries no extension; callers append
        "npy"/"json" etc. themselves.

        NOTE(review): the first computed name — including `prefix` — is
        cached in `desc_name`, so later calls with a different prefix
        return the originally cached value; confirm this is intended.
        """
        if self.desc_name is not None:
            return self.desc_name
        filename = ""
        if prefix is not None:
            filename += prefix + "_"
        if self.namespace is not None:
            filename += self.namespace + "_"
        assert self.tablename is not None
        filename += self.tablename
        if self.partitions is not None:
            # make partition specs filename-safe
            filename += "_" + "_".join(
                self.partitions
            ).replace("=", "_").replace("/", "_")
        if self.num_vectors is not None:
            filename += f"_{self.num_vectors}"
        filename += "."
        self.desc_name = filename
        return self.desc_name
    def get_kmeans_filename(self, k):
        """Name stem for the cached k-means centroids of this dataset."""
        return f"{self.get_filename()}kmeans_{k}."
    def k_means(self, io, k, dry_run):
        """Cluster this dataset into k centroids with faiss.Kmeans (GPU).

        Centroids (.npy) and timing metadata (.json) are cached via `io`.
        Returns (centroids descriptor, train time, None), or
        (None, None, filename) when dry_run and no cached result exists.
        """
        logger.info(f"k_means {k} {self}")
        kmeans_vectors = DatasetDescriptor(
            tablename=f"{self.get_filename()}kmeans_{k}"
        )
        kmeans_filename = kmeans_vectors.get_filename() + "npy"
        meta_filename = kmeans_vectors.get_filename() + "json"
        if not io.file_exist(kmeans_filename) or not io.file_exist(
            meta_filename
        ):
            if dry_run:
                return None, None, kmeans_filename
            x = io.get_dataset(self)
            kmeans = faiss.Kmeans(d=x.shape[1], k=k, gpu=True)
            _, t, _ = timer("k_means", lambda: kmeans.train(x))
            io.write_nparray(kmeans.centroids, kmeans_filename)
            io.write_json({"k_means_time": t}, meta_filename)
        else:
            # cached: only the recorded training time is re-read
            t = io.read_json(meta_filename)["k_means_time"]
        return kmeans_vectors, t, None
@dataclass
class IndexBaseDescriptor:
    """Common state and naming helpers shared by codec/index/knn
    descriptors."""
    d: int
    metric: str
    desc_name: Optional[str] = None
    flat_desc_name: Optional[str] = None
    bucket: Optional[str] = None
    path: Optional[str] = None
    num_threads: int = 1
    def get_name(self) -> str:
        """Subclasses must produce a canonical name."""
        raise NotImplementedError()
    def get_path(self, benchmark_io: BenchmarkIO) -> Optional[str]:
        """Return the remote path, resolving and caching it on first use."""
        if self.path is None:
            self.path = benchmark_io.get_remote_filepath(self.desc_name)
        return self.path
    @staticmethod
    def param_dict_list_to_name(param_dict_list):
        """Encode a list of construction-param dicts as 'cp0...cp1...'."""
        if not param_dict_list:
            return ""
        return "".join(
            IndexBaseDescriptor.param_dict_to_name(param_dict, f"cp{i}")
            for i, param_dict in enumerate(param_dict_list)
        )
    @staticmethod
    def param_dict_to_name(param_dict, prefix="sp"):
        """Encode one param dict as '<prefix>_k1_v1_k2_v2.'.

        'snap' and zero-valued 'lsq_gpu'/'use_beam_LUT' entries are
        omitted; an empty encoding yields "".
        """
        if not param_dict:
            return ""
        parts = []
        for key, value in param_dict.items():
            if key == "snap":
                continue
            if value == 0 and key in ("lsq_gpu", "use_beam_LUT"):
                continue
            parts.append(f"_{key}_{value}")
        if not parts:
            return ""
        return prefix + "".join(parts) + "."
@dataclass
class CodecDescriptor(IndexBaseDescriptor):
    """Describes a codec: either an untrained factory string (plus the
    vectors to train it on) or a path to an already-trained codec.

    Note: a `path(self, benchmark_io)` method used to shadow the
    inherited `path` dataclass field here; since `__init__` assigns the
    field on every instance, the method could never be reached and any
    call would have raised TypeError on a str/None. It was removed as
    dead code — use the inherited `get_path()` instead.
    """
    # either path (inherited field) or factory should be set,
    # but not both at the same time.
    factory: Optional[str] = None
    construction_params: Optional[List[Dict[str, int]]] = None
    training_vectors: Optional[DatasetDescriptor] = None
    FILENAME_PREFIX: str = "xt"
    def __post_init__(self):
        # resolve and cache desc_name eagerly so an invalid descriptor
        # (no desc_name, factory or path) fails at construction time
        self.get_name()
    def is_trained(self):
        """True when this points at a trained codec on disk."""
        return self.factory is None and self.path is not None
    def is_valid(self):
        """True when there is either a factory to train or a trained codec."""
        return self.factory is not None or self.path is not None
    def get_name(self) -> str:
        """Return (and cache) the canonical name of this codec."""
        if self.desc_name is not None:
            return self.desc_name
        if self.factory is not None:
            self.desc_name = self.name_from_factory()
            return self.desc_name
        if self.path is not None:
            self.desc_name = self.name_from_path()
            return self.desc_name
        raise ValueError("name, factory or path must be set")
    def flat_name(self) -> str:
        """Name of the corresponding uncompressed (Flat) codec."""
        if self.flat_desc_name is not None:
            return self.flat_desc_name
        self.flat_desc_name = f"Flat.d_{self.d}.{self.metric.upper()}."
        return self.flat_desc_name
    def name_from_factory(self) -> str:
        """Derive the name from factory string, d, metric, training set
        and construction params."""
        assert self.factory is not None
        name = f"{self.factory.replace(',', '_')}."
        assert self.d is not None
        assert self.metric is not None
        name += f"d_{self.d}.{self.metric.upper()}."
        if self.factory != "Flat":
            # only non-trivial codecs require training vectors
            assert self.training_vectors is not None
            name += self.training_vectors.get_filename(CodecDescriptor.FILENAME_PREFIX)
        name += IndexBaseDescriptor.param_dict_list_to_name(self.construction_params)
        return name
    def name_from_path(self):
        """Derive the name from the basename of `path` by stripping the
        extension (the trailing '.' separator is kept)."""
        assert self.path is not None
        filename = os.path.basename(self.path)
        ext = filename.split(".")[-1]
        if filename.endswith(ext):
            name = filename[:-len(ext)]
        else:  # should never hit this rather raise value error
            name = filename
        return name
    def alias(self, benchmark_io: BenchmarkIO):
        """Return a lightweight descriptor referring to this codec by name
        (and blobstore location, when the io backend has a bucket)."""
        if hasattr(benchmark_io, "bucket"):
            return CodecDescriptor(desc_name=self.get_name(), bucket=benchmark_io.bucket, path=self.get_path(benchmark_io), d=self.d, metric=self.metric)
        return CodecDescriptor(desc_name=self.get_name(), d=self.d, metric=self.metric)
@dataclass
class IndexDescriptor(IndexBaseDescriptor):
    """Describes a built (or buildable) index: a codec plus the database
    vectors added to it."""
    codec_desc: Optional[CodecDescriptor] = None
    database_desc: Optional[DatasetDescriptor] = None
    FILENAME_PREFIX: str = "xb"
    def __hash__(self):
        return hash(str(self))
    def __post_init__(self):
        # resolve and cache the canonical name eagerly
        self.get_name()
    def is_built(self):
        """True when this refers to an already-built index (name only)."""
        return self.codec_desc is None and self.database_desc is None
    def get_name(self) -> str:
        """Return (and cache) codec name + database filename."""
        if self.desc_name is None:
            db_part = self.database_desc.get_filename(
                prefix=IndexDescriptor.FILENAME_PREFIX
            )
            self.desc_name = self.codec_desc.get_name() + db_part
        return self.desc_name
    def flat_name(self):
        """Name of the equivalent uncompressed (Flat) index."""
        if self.flat_desc_name is None:
            db_part = self.database_desc.get_filename(
                prefix=IndexDescriptor.FILENAME_PREFIX
            )
            self.flat_desc_name = self.codec_desc.flat_name() + db_part
        return self.flat_desc_name
    # An alias refers to the index once it has been uploaded to the
    # blobstore, so it can be referred to again by name/path alone.
    def alias(self, benchmark_io: BenchmarkIO):
        if hasattr(benchmark_io, "bucket"):
            return IndexDescriptor(
                desc_name=self.get_name(),
                bucket=benchmark_io.bucket,
                path=self.get_path(benchmark_io),
                d=self.d,
                metric=self.metric,
            )
        return IndexDescriptor(
            desc_name=self.get_name(), d=self.d, metric=self.metric
        )
@dataclass
class KnnDescriptor(IndexBaseDescriptor):
    """Describes one knn (or reconstruction) search experiment: an index,
    a query set, and the search parameters."""
    index_desc: Optional[IndexDescriptor] = None
    gt_index_desc: Optional[IndexDescriptor] = None
    query_dataset: Optional[DatasetDescriptor] = None
    search_params: Optional[Dict[str, int]] = None
    reconstruct: bool = False
    FILENAME_PREFIX: str = "q"
    # range metric definitions
    # key: name
    # value: one of the following:
    #
    # radius
    #    [0..radius) -> 1
    #    [radius..inf) -> 0
    #
    # [[radius1, score1], ...]
    #    [0..radius1) -> score1
    #    [radius1..radius2) -> score2
    #
    # [[radius1_from, radius1_to, score1], ...]
    #    [radius1_from, radius1_to) -> score1,
    #    [radius2_from, radius2_to) -> score2
    range_metrics: Optional[Dict[str, Any]] = None
    radius: Optional[float] = None
    k: int = 1
    range_ref_index_desc: Optional[str] = None
    def __hash__(self):
        return hash(str(self))
    def get_name(self):
        """Return (and cache) index name + search params + query set +
        k/threads + experiment kind."""
        if self.desc_name is None:
            self.desc_name = "".join(
                (
                    self.index_desc.get_name(),
                    IndexBaseDescriptor.param_dict_to_name(self.search_params),
                    self.query_dataset.get_filename(
                        KnnDescriptor.FILENAME_PREFIX
                    ),
                    f"k_{self.k}.",
                    f"t_{self.num_threads}.",
                    "rec." if self.reconstruct else "knn.",
                )
            )
        return self.desc_name
    def flat_name(self):
        """Like get_name(), but against the Flat baseline index and
        without search parameters."""
        if self.flat_desc_name is None:
            self.flat_desc_name = "".join(
                (
                    self.index_desc.flat_name(),
                    self.query_dataset.get_filename(
                        KnnDescriptor.FILENAME_PREFIX
                    ),
                    f"k_{self.k}.",
                    f"t_{self.num_threads}.",
                    "rec." if self.reconstruct else "knn.",
                )
            )
        return self.flat_desc_name

View File

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,335 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import logging
from dataclasses import dataclass
from typing import Dict, List, Tuple
import faiss # @manual=//faiss/python:pyfaiss
# from faiss.contrib.evaluation import ( # @manual=//faiss/contrib:faiss_contrib
# OperatingPoints,
# )
from .benchmark import Benchmark
from .descriptors import DatasetDescriptor, IndexDescriptorClassic
from .utils import dict_merge, filter_results, ParetoMetric, ParetoMode
logger = logging.getLogger(__name__)
@dataclass
class Optimizer:
    """Searches IVF index configurations (coarse quantizer, codec,
    nprobe) that are Pareto-optimal in accuracy vs time and/or space.
    """
    distance_metric: str = "L2"
    num_threads: int = 32
    # run benchmarks in-process instead of submitting cluster jobs
    run_local: bool = True
    def __post_init__(self):
        self.cached_benchmark = None
        # map the metric name to the faiss metric constant
        if self.distance_metric == "IP":
            self.distance_metric_type = faiss.METRIC_INNER_PRODUCT
        elif self.distance_metric == "L2":
            self.distance_metric_type = faiss.METRIC_L2
        else:
            raise ValueError
    def set_io(self, benchmark_io):
        """Attach the I/O backend and propagate the metric onto it."""
        self.io = benchmark_io
        self.io.distance_metric = self.distance_metric
        self.io.distance_metric_type = self.distance_metric_type
    def benchmark_and_filter_candidates(
        self,
        index_descs,
        training_vectors,
        database_vectors,
        query_vectors,
        result_file,
        include_flat,
        min_accuracy,
        pareto_metric,
    ):
        """Benchmark `index_descs` (k=10 knn) and keep only candidates
        that reach `min_accuracy` and are globally Pareto-optimal for
        `pareto_metric`.

        Returns (surviving descriptors, surviving experiment tuples).
        """
        benchmark = Benchmark(
            num_threads=self.num_threads,
            training_vectors=training_vectors,
            database_vectors=database_vectors,
            query_vectors=query_vectors,
            index_descs=index_descs,
            k=10,
            distance_metric=self.distance_metric,
        )
        benchmark.set_io(self.io)
        results = benchmark.benchmark(
            result_file=result_file, local=self.run_local, train=True, knn=True
        )
        assert results
        filtered = filter_results(
            results=results,
            evaluation="knn",
            accuracy_metric="knn_intersection",
            min_accuracy=min_accuracy,
            # optionally drop the Flat (exact) baselines
            name_filter=None
            if include_flat
            else (lambda n: not n.startswith("Flat")),
            pareto_mode=ParetoMode.GLOBAL,
            pareto_metric=pareto_metric,
        )
        assert filtered
        # rebuild lightweight descriptors from the surviving experiments
        index_descs = [
            IndexDescriptorClassic(
                factory=v["factory"],
                construction_params=v["construction_params"],
                search_params=v["search_params"],
            )
            for _, _, _, _, v in filtered
        ]
        return index_descs, filtered
    def optimize_quantizer(
        self,
        training_vectors: DatasetDescriptor,
        query_vectors: DatasetDescriptor,
        nlists: List[int],
        min_accuracy: float,
    ):
        """For each nlist, cluster the training set and find good coarse
        quantizers (Flat and a sweep of HNSW32 efConstruction values)
        over the centroids. Returns {nlist: [descriptors]}."""
        quantizer_descs = {}
        for nlist in nlists:
            # cluster
            centroids, _, _ = training_vectors.k_means(
                self.io,
                nlist,
                dry_run=False,
            )
            # candidates: exact search plus HNSW32 with efConstruction
            # in {64, 128, ..., 1024}
            descs = [IndexDescriptorClassic(factory="Flat"),] + [
                IndexDescriptorClassic(
                    factory="HNSW32",
                    construction_params=[{"efConstruction": 2**i}],
                )
                for i in range(6, 11)
            ]
            descs, _ = self.benchmark_and_filter_candidates(
                descs,
                training_vectors=centroids,
                database_vectors=centroids,
                query_vectors=query_vectors,
                result_file=f"result_{centroids.get_filename()}json",
                include_flat=True,
                min_accuracy=min_accuracy,
                pareto_metric=ParetoMetric.TIME,
            )
            quantizer_descs[nlist] = descs
        return quantizer_descs
    def optimize_ivf(
        self,
        result_file: str,
        training_vectors: DatasetDescriptor,
        database_vectors: DatasetDescriptor,
        query_vectors: DatasetDescriptor,
        quantizers: Dict[int, List[IndexDescriptorClassic]],
        codecs: List[Tuple[str, str]],
        min_accuracy: float,
    ):
        """Cross the candidate coarse quantizers with the candidate
        codecs into full IVF factory strings, then benchmark and keep
        the time-and-space Pareto front."""
        ivf_descs = []
        for nlist, quantizer_descs in quantizers.items():
            # build IVF index
            for quantizer_desc in quantizer_descs:
                for pretransform, fine_ivf in codecs:
                    if pretransform is None:
                        pretransform = ""
                    else:
                        pretransform = pretransform + ","
                    # construction_params[0] applies to the top-level
                    # index, [1] to the quantizer; merge the quantizer's
                    # search params into its construction params
                    if quantizer_desc.construction_params is None:
                        construction_params = [
                            None,
                            quantizer_desc.search_params,
                        ]
                    else:
                        construction_params = [
                            None
                        ] + quantizer_desc.construction_params
                        if quantizer_desc.search_params is not None:
                            dict_merge(
                                construction_params[1],
                                quantizer_desc.search_params,
                            )
                    ivf_descs.append(
                        IndexDescriptorClassic(
                            factory=f"{pretransform}IVF{nlist}({quantizer_desc.factory}),{fine_ivf}",
                            construction_params=construction_params,
                        )
                    )
        return self.benchmark_and_filter_candidates(
            ivf_descs,
            training_vectors,
            database_vectors,
            query_vectors,
            result_file,
            include_flat=False,
            min_accuracy=min_accuracy,
            pareto_metric=ParetoMetric.TIME_SPACE,
        )
    # train an IVFFlat index
    # find the nprobe required for the given accuracy
    def ivf_flat_nprobe_required_for_accuracy(
        self,
        result_file: str,
        training_vectors: DatasetDescriptor,
        database_vectors: DatasetDescriptor,
        query_vectors: DatasetDescriptor,
        nlist,
        accuracy,
    ):
        """Return the smallest benchmarked nprobe reaching `accuracy`
        with an IVF<nlist>,Flat index (defaults to nlist // 2)."""
        _, results = self.benchmark_and_filter_candidates(
            index_descs=[
                IndexDescriptorClassic(factory=f"IVF{nlist}(Flat),Flat"),
            ],
            training_vectors=training_vectors,
            database_vectors=database_vectors,
            query_vectors=query_vectors,
            result_file=result_file,
            include_flat=False,
            min_accuracy=accuracy,
            pareto_metric=ParetoMetric.TIME,
        )
        nprobe = nlist // 2
        for _, _, _, k, v in results:
            if (
                ".knn" in k
                and "nprobe" in v["search_params"]
                and v["knn_intersection"] >= accuracy
            ):
                nprobe = min(nprobe, v["search_params"]["nprobe"])
        return nprobe
    # train candidate IVF codecs
    # benchmark them at the same nprobe
    # keep only the space _and_ time Pareto optimal
    def optimize_codec(
        self,
        result_file: str,
        d: int,
        training_vectors: DatasetDescriptor,
        database_vectors: DatasetDescriptor,
        query_vectors: DatasetDescriptor,
        nlist: int,
        nprobe: int,
        min_accuracy: float,
    ):
        """Sweep scalar-quantizer, OPQ+PQ and plain PQ codecs at a fixed
        nlist/nprobe; return the (opq, pq) pairs on the Pareto front."""
        # candidate (pretransform, fine codec) pairs; the PQ sweeps are
        # constrained to codes strictly smaller than SQ8 (d bytes)
        codecs = (
            [
                (None, "Flat"),
                (None, "SQfp16"),
                (None, "SQbf16"),
                (None, "SQ8"),
                (None, "SQ8_direct_signed"),
            ] + [
                (f"OPQ{M}_{M * dim}", f"PQ{M}x{b}")
                for M in [8, 12, 16, 32, 48, 64, 96, 128, 192, 256]
                if d % M == 0
                for dim in range(2, 18, 2)
                if M * dim <= d
                for b in range(4, 14, 2)
                if M * b < d * 8  # smaller than SQ8
            ] + [
                (None, f"PQ{M}x{b}")
                for M in [8, 12, 16, 32, 48, 64, 96, 128, 192, 256]
                if d % M == 0
                for b in range(8, 14, 2)
                if M * b < d * 8  # smaller than SQ8
            ]
        )
        # map the full factory string back to its (opq, pq) pair so the
        # filtered results can be translated back
        factory = {}
        for opq, pq in codecs:
            factory[
                f"IVF{nlist},{pq}" if opq is None else f"{opq},IVF{nlist},{pq}"
            ] = (
                opq,
                pq,
            )
        _, filtered = self.benchmark_and_filter_candidates(
            index_descs=[
                IndexDescriptorClassic(
                    factory=f"IVF{nlist},{pq}"
                    if opq is None
                    else f"{opq},IVF{nlist},{pq}",
                    search_params={
                        "nprobe": nprobe,
                    },
                )
                for opq, pq in codecs
            ],
            training_vectors=training_vectors,
            database_vectors=database_vectors,
            query_vectors=query_vectors,
            result_file=result_file,
            include_flat=False,
            min_accuracy=min_accuracy,
            pareto_metric=ParetoMetric.TIME_SPACE,
        )
        results = [
            factory[r] for r in set(v["factory"] for _, _, _, k, v in filtered)
        ]
        return results
    def optimize(
        self,
        d: int,
        training_vectors: DatasetDescriptor,
        database_vectors_list: List[DatasetDescriptor],
        query_vectors: DatasetDescriptor,
        min_accuracy: float,
    ):
        """End-to-end optimization: calibrate nprobe, select codecs,
        select coarse quantizers, then evaluate all combinations on each
        database scale in `database_vectors_list`."""
        # train an IVFFlat index
        # find the nprobe required for near perfect accuracy
        nlist = 4096
        nprobe_at_95 = self.ivf_flat_nprobe_required_for_accuracy(
            result_file=f"result_ivf{nlist}_flat.json",
            training_vectors=training_vectors,
            database_vectors=database_vectors_list[0],
            query_vectors=query_vectors,
            nlist=nlist,
            accuracy=0.95,
        )
        # train candidate IVF codecs
        # benchmark them at the same nprobe
        # keep only the space and time Pareto optima
        codecs = self.optimize_codec(
            result_file=f"result_ivf{nlist}_codec.json",
            d=d,
            training_vectors=training_vectors,
            database_vectors=database_vectors_list[0],
            query_vectors=query_vectors,
            nlist=nlist,
            nprobe=nprobe_at_95,
            min_accuracy=min_accuracy,
        )
        # optimize coarse quantizers
        quantizers = self.optimize_quantizer(
            training_vectors=training_vectors,
            query_vectors=query_vectors,
            nlists=[4096, 8192, 16384, 32768],
            min_accuracy=0.7,
        )
        # combine them with the codecs
        # test them at different scales
        for database_vectors in database_vectors_list:
            self.optimize_ivf(
                result_file=f"result_{database_vectors.get_filename()}json",
                training_vectors=training_vectors,
                database_vectors=database_vectors,
                query_vectors=query_vectors,
                quantizers=quantizers,
                codecs=codecs,
                min_accuracy=min_accuracy,
            )

View File

@@ -0,0 +1,248 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import functools
import logging
from enum import Enum
from multiprocessing.pool import ThreadPool
from time import perf_counter
import faiss # @manual=//faiss/python:pyfaiss
import numpy as np
from faiss.contrib.evaluation import ( # @manual=//faiss/contrib:faiss_contrib
OperatingPoints,
)
logger = logging.getLogger(__name__)
def timer(name, func, once=False) -> tuple:
    """Time `func()` and return `(result, seconds_per_call, repeat)`.

    The call is measured once; if it completed in under a second and
    `once` is False, it is re-run `int(2.0 // t)` more times (about two
    seconds of total work) and the mean per-call time is reported.

    Fix: the return annotation previously claimed `-> float`, but the
    function has always returned a 3-tuple.
    """
    logger.info(f"Measuring {name}")
    t1 = perf_counter()
    res = func()
    t2 = perf_counter()
    t = t2 - t1
    repeat = 1
    if not once and t < 1.0:
        repeat = int(2.0 // t)
        logger.info(
            f"Time for {name}: {t:.3f} seconds, repeating {repeat} times"
        )
        t1 = perf_counter()
        for _ in range(repeat):
            res = func()
        t2 = perf_counter()
        t = (t2 - t1) / repeat
    logger.info(f"Time for {name}: {t:.3f} seconds")
    return res, t, repeat
def refine_distances_knn(
    xq: np.ndarray,
    xb: np.ndarray,
    I: np.ndarray,
    metric,
) -> np.ndarray:
    """Recompute distances between xq[i] and xb[I[i, :]]"""
    # faiss's C helpers require contiguous float32 / int64 buffers
    nq, k = I.shape
    xq = np.ascontiguousarray(xq, dtype="float32")
    nq2, d = xq.shape
    xb = np.ascontiguousarray(xb, dtype="float32")
    nb, d2 = xb.shape
    I = np.ascontiguousarray(I, dtype="int64")
    assert nq2 == nq
    assert d2 == d
    D = np.empty(I.shape, dtype="float32")
    # pre-fill with +inf; presumably entries the C call does not
    # overwrite (e.g. for negative ids) stay infinite — TODO confirm
    D[:] = np.inf
    if metric == faiss.METRIC_L2:
        # squared L2 distances, computed in C through raw pointers
        faiss.fvec_L2sqr_by_idx(
            faiss.swig_ptr(D),
            faiss.swig_ptr(xq),
            faiss.swig_ptr(xb),
            faiss.swig_ptr(I),
            d,
            nq,
            k,
        )
    else:
        # any non-L2 metric is treated as inner product
        faiss.fvec_inner_products_by_idx(
            faiss.swig_ptr(D),
            faiss.swig_ptr(xq),
            faiss.swig_ptr(xb),
            faiss.swig_ptr(I),
            d,
            nq,
            k,
        )
    return D
def refine_distances_range(
    lims: np.ndarray,
    D: np.ndarray,
    I: np.ndarray,
    xq: np.ndarray,
    xb: np.ndarray,
    metric,
) -> np.ndarray:
    """Recompute exact distances for range-search results.

    For each query i, results are the database rows I[lims[i]:lims[i+1]];
    squared L2 (or inner product) distances are computed per query in a
    32-thread pool and concatenated into one flat array aligned with I.

    NOTE(review): the `D` parameter is never read here — kept only for
    signature compatibility; confirm with callers.
    """
    with ThreadPool(32) as pool:
        R = pool.map(
            lambda i: (
                np.sum(np.square(xq[i] - xb[I[lims[i] : lims[i + 1]]]), axis=1)
                if metric == faiss.METRIC_L2
                else np.tensordot(
                    xq[i], xb[I[lims[i] : lims[i + 1]]], axes=(0, 1)
                )
            )
            # queries with no results contribute an empty segment
            if lims[i + 1] > lims[i]
            else [],
            range(len(lims) - 1),
        )
    return np.hstack(R)
def distance_ratio_measure(I, R, D_GT, metric):
    """Ratio between refined distances R and ground-truth distances D_GT,
    summed over valid results (I >= 0), oriented so that higher is better
    for either metric."""
    valid = I >= 0
    sum_of_R = np.sum(np.where(valid, R, 0))
    sum_of_D_GT = np.sum(np.where(valid, D_GT, 0))
    if metric == faiss.METRIC_INNER_PRODUCT:
        return (sum_of_R / sum_of_D_GT).item()
    if metric == faiss.METRIC_L2:
        return (sum_of_D_GT / sum_of_R).item()
    raise RuntimeError(f"unknown metric {metric}")
@functools.cache
def get_cpu_info():
    """Return this machine's CPU model name (Linux-only).

    Reads the first "model name" line of /proc/cpuinfo; the [13:] slice
    drops the fixed-width "model name : " prefix.

    Fix: the file handle was previously opened without ever being
    closed; use a context manager.
    """
    with open("/proc/cpuinfo", "r") as f:
        model_lines = [line for line in f if "model name" in line]
    return model_lines[0][13:].strip()
def dict_merge(target, source):
    """Recursively merge `source` into `target` in place.

    Nested dicts present in both are merged key by key; any other value
    from `source` overwrites the corresponding entry in `target`.
    """
    for key, value in source.items():
        if isinstance(value, dict) and key in target:
            dict_merge(target[key], value)
        else:
            target[key] = value
class Cost:
def __init__(self, values):
self.values = values
def __le__(self, other):
return all(
v1 <= v2 for v1, v2 in zip(self.values, other.values, strict=True)
)
def __lt__(self, other):
return all(
v1 < v2 for v1, v2 in zip(self.values, other.values, strict=True)
)
class ParetoMode(Enum):
    """How Pareto filtering is applied when selecting experiments."""

    DISABLE = 1  # no Pareto filtering
    INDEX = 2  # index-local optima
    GLOBAL = 3  # global optima
class ParetoMetric(Enum):
    """Which cost dimension(s) are traded against accuracy."""

    TIME = 0  # time vs accuracy
    SPACE = 1  # space vs accuracy
    TIME_SPACE = 2  # (time, space) vs accuracy
def range_search_recall_at_precision(experiment, precision):
    """Best recall on the experiment's PR curve at precision strictly
    above `precision`, rounded to 6 decimals.

    Raises ValueError when no point on the curve exceeds `precision`.
    """
    pr_curve = experiment["range_search_pr"]
    qualifying = (
        recall
        for recall, prec in zip(pr_curve["recall"], pr_curve["precision"])
        if prec > precision
    )
    return round(max(qualifying), 6)
def filter_results(
    results,
    evaluation,
    accuracy_metric,  # str or func
    time_metric=None,  # func or None -> use default
    space_metric=None,  # func or None -> use default
    min_accuracy=0,
    max_space=0,
    max_time=0,
    scaling_factor=1.0,
    name_filter=None,  # func
    pareto_mode=ParetoMode.DISABLE,
    pareto_metric=ParetoMetric.TIME,
):
    """Select experiments from a results dict.

    Keeps experiments whose key contains ".{evaluation}", applies the
    accuracy/space/time thresholds and the optional name filter, then
    (unless pareto_mode is DISABLE) reduces to the Pareto-optimal set —
    either per index name (INDEX) or over all experiments (GLOBAL).

    Returns a sorted list of (accuracy, space, time, key, value) tuples.
    """
    # accuracy_metric may be given as a key into the experiment dict
    if isinstance(accuracy_metric, str):
        accuracy_key = accuracy_metric
        accuracy_metric = lambda v: v[accuracy_key]
    if time_metric is None:
        # default: search time (scaled) plus the quantizer's time, if any
        time_metric = lambda v: v["time"] * scaling_factor + (
            v["quantizer"]["time"] if "quantizer" in v else 0
        )
    if space_metric is None:
        # default: the code size of the experiment's codec
        space_metric = lambda v: results["indices"][v["codec"]]["code_size"]
    fe = []
    # one OperatingPoints per group: a single "global" one, or one per
    # index name when pareto_mode is INDEX
    ops = {}
    if pareto_mode == ParetoMode.GLOBAL:
        op = OperatingPoints()
        ops["global"] = op
    for k, v in results["experiments"].items():
        if f".{evaluation}" in k:
            accuracy = accuracy_metric(v)
            if min_accuracy > 0 and accuracy < min_accuracy:
                continue
            space = space_metric(v)
            if space is None:
                space = 0
            if max_space > 0 and space > max_space:
                continue
            time = time_metric(v)
            if max_time > 0 and time > max_time:
                continue
            # NOTE(review): this indexes v["search_params"]["snap"]
            # unconditionally when search_params exists — raises KeyError
            # if "snap" is absent; confirm all producers set it.
            idx_name = v["index"] + (
                "snap"
                if "search_params" in v and v["search_params"]["snap"] == 1
                else ""
            )
            if name_filter is not None and not name_filter(idx_name):
                continue
            experiment = (accuracy, space, time, k, v)
            if pareto_mode == ParetoMode.DISABLE:
                fe.append(experiment)
                continue
            if pareto_mode == ParetoMode.INDEX:
                if idx_name not in ops:
                    ops[idx_name] = OperatingPoints()
                op = ops[idx_name]
            # higher accuracy at lower cost wins; TIME_SPACE uses the
            # component-wise Cost ordering
            if pareto_metric == ParetoMetric.TIME:
                op.add_operating_point(experiment, accuracy, time)
            elif pareto_metric == ParetoMetric.SPACE:
                op.add_operating_point(experiment, accuracy, space)
            else:
                op.add_operating_point(
                    experiment, accuracy, Cost([time, space])
                )
    if ops:
        # collect the surviving operating points from every group
        for op in ops.values():
            for v, _, _ in op.operating_points:
                fe.append(v)
    fe.sort()
    return fe