Initial commit

yichuan520030910320
2025-06-30 09:05:05 +00:00
commit 46f6cc100b
1231 changed files with 278432 additions and 0 deletions

View File

@@ -0,0 +1,12 @@
# Final simplified version
cmake_minimum_required(VERSION 3.24)
project(leann_backend_hnsw_wrapper)
set(FAISS_ENABLE_PYTHON ON CACHE BOOL "" FORCE)
set(FAISS_ENABLE_GPU OFF CACHE BOOL "" FORCE)
set(FAISS_ENABLE_EXTRAS OFF CACHE BOOL "" FORCE)
set(BUILD_TESTING OFF CACHE BOOL "" FORCE)
set(FAISS_ENABLE_C_API OFF CACHE BOOL "" FORCE)
set(FAISS_OPT_LEVEL "generic" CACHE STRING "" FORCE)
add_subdirectory(third_party/faiss)
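# For reference, a manual out-of-tree configure/build of this wrapper would
# look roughly like the following (scikit-build-core normally drives this
# during `pip install`, so the commands are illustrative only):
#   cmake -S . -B build -DCMAKE_BUILD_TYPE=Release
#   cmake --build build -j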

View File

@@ -0,0 +1 @@
from . import hnsw_backend

View File

@@ -0,0 +1,313 @@
import numpy as np
import os
import json
import struct
from pathlib import Path
from typing import Any, Dict
import contextlib
import threading
import time
import atexit
import socket
import subprocess
import sys
# File: packages/leann-backend-hnsw/leann_backend_hnsw/hnsw_backend.py
# ... (other imports unchanged) ...
from leann.registry import register_backend
from leann.interface import (
LeannBackendFactoryInterface,
LeannBackendBuilderInterface,
LeannBackendSearcherInterface
)
def get_metric_map():
from . import faiss
return {
"mips": faiss.METRIC_INNER_PRODUCT,
"l2": faiss.METRIC_L2,
"cosine": faiss.METRIC_INNER_PRODUCT, # Will need normalization
}
def _check_port(port: int) -> bool:
"""Check if a port is in use"""
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
return s.connect_ex(('localhost', port)) == 0
class HNSWEmbeddingServerManager:
"""
HNSW-specific embedding server manager that handles the lifecycle of the embedding server process.
Mirrors the DiskANN EmbeddingServerManager architecture.
"""
def __init__(self):
self.server_process = None
self.server_port = None
atexit.register(self.stop_server)
def start_server(self, port=5556, model_name="sentence-transformers/all-mpnet-base-v2", passages_file=None):
"""
Start the HNSW embedding server process.
Args:
port: ZMQ port for the server
model_name: Name of the embedding model to use
passages_file: Optional path to passages JSON file
"""
if self.server_process and self.server_process.poll() is None:
print(f"INFO: Reusing existing HNSW server process for this session (PID {self.server_process.pid})")
return True
# Check if port is already in use
if _check_port(port):
print(f"WARNING: Port {port} is already in use. Assuming an external HNSW server is running and connecting to it.")
return True
print(f"INFO: Starting session-level HNSW embedding server as a background process...")
try:
command = [
sys.executable,
"-m", "packages.leann-backend-hnsw.src.leann_backend_hnsw.hnsw_embedding_server",
"--zmq-port", str(port),
"--model-name", model_name
]
# Add passages file if provided
if passages_file:
command.extend(["--passages-file", str(passages_file)])
project_root = Path(__file__).parent.parent.parent.parent
print(f"INFO: Running HNSW command from project root: {project_root}")
self.server_process = subprocess.Popen(
command,
cwd=project_root,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
encoding='utf-8'
)
self.server_port = port
print(f"INFO: HNSW server process started with PID: {self.server_process.pid}")
max_wait, wait_interval = 30, 0.5
for _ in range(int(max_wait / wait_interval)):
if _check_port(port):
print(f"✅ HNSW embedding server is up and ready for this session.")
log_thread = threading.Thread(target=self._log_monitor, daemon=True)
log_thread.start()
return True
if self.server_process.poll() is not None:
print("❌ ERROR: HNSW server process terminated unexpectedly during startup.")
self._log_monitor()
return False
time.sleep(wait_interval)
print(f"❌ ERROR: HNSW server process failed to start listening within {max_wait} seconds.")
self.stop_server()
return False
except Exception as e:
print(f"❌ ERROR: Failed to start HNSW embedding server process: {e}")
return False
def _log_monitor(self):
"""Monitor server logs"""
if not self.server_process:
return
try:
if self.server_process.stdout:
for line in iter(self.server_process.stdout.readline, ''):
print(f"[HNSWEmbeddingServer LOG]: {line.strip()}")
self.server_process.stdout.close()
if self.server_process.stderr:
for line in iter(self.server_process.stderr.readline, ''):
print(f"[HNSWEmbeddingServer ERROR]: {line.strip()}")
self.server_process.stderr.close()
except Exception as e:
print(f"HNSW Log monitor error: {e}")
def stop_server(self):
"""Stop the HNSW embedding server process"""
if self.server_process and self.server_process.poll() is None:
print(f"INFO: Terminating HNSW session server process (PID: {self.server_process.pid})...")
self.server_process.terminate()
try:
self.server_process.wait(timeout=5)
print("INFO: HNSW server process terminated.")
except subprocess.TimeoutExpired:
print("WARNING: HNSW server process did not terminate gracefully, killing it.")
self.server_process.kill()
self.server_process = None
@register_backend("hnsw")
class HNSWBackend(LeannBackendFactoryInterface):
@staticmethod
def builder(**kwargs) -> LeannBackendBuilderInterface:
return HNSWBuilder(**kwargs)
@staticmethod
def searcher(index_path: str, **kwargs) -> LeannBackendSearcherInterface:
path = Path(index_path)
meta_path = path.parent / f"{path.stem}.hnsw.meta.json"
if not meta_path.exists():
raise FileNotFoundError(f"Leann metadata file not found at {meta_path}. Cannot infer vector dimension for searcher.")
with open(meta_path, 'r') as f:
meta = json.load(f)
try:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer(meta.get("embedding_model"))
dimensions = model.get_sentence_embedding_dimension()
kwargs['dimensions'] = dimensions
except ImportError:
raise ImportError("sentence-transformers is required to infer embedding dimensions. Please install it.")
except Exception as e:
raise RuntimeError(f"Could not load SentenceTransformer model to get dimension: {e}")
return HNSWSearcher(index_path, **kwargs)
class HNSWBuilder(LeannBackendBuilderInterface):
def __init__(self, **kwargs):
self.build_params = kwargs
def build(self, data: np.ndarray, index_path: str, **kwargs):
"""Build HNSW index using FAISS"""
from . import faiss
path = Path(index_path)
index_dir = path.parent
index_prefix = path.stem
index_dir.mkdir(parents=True, exist_ok=True)
if data.dtype != np.float32:
data = data.astype(np.float32)
if not data.flags['C_CONTIGUOUS']:
data = np.ascontiguousarray(data)
build_kwargs = {**self.build_params, **kwargs}
metric_str = build_kwargs.get("distance_metric", "mips").lower()
metric_enum = get_metric_map().get(metric_str)
if metric_enum is None:
raise ValueError(f"Unsupported distance_metric '{metric_str}'.")
# HNSW parameters
M = build_kwargs.get("M", 32) # Max connections per layer
efConstruction = build_kwargs.get("efConstruction", 200) # Size of the dynamic candidate list for construction
dim = data.shape[1]
print(f"INFO: Building HNSW index for {data.shape[0]} vectors with metric {metric_enum}...")
try:
# Create HNSW index
# Create HNSW index (IndexHNSWFlat takes the metric directly, so the
# same constructor serves both inner product and L2)
index = faiss.IndexHNSWFlat(dim, M, metric_enum)
# Set construction parameters
index.hnsw.efConstruction = efConstruction
# Normalize vectors if using cosine similarity
if metric_str == "cosine":
faiss.normalize_L2(data)
# Add vectors to index (the FAISS Python wrapper accepts the (n, d) array directly)
index.add(data)
# Save index
index_file = index_dir / f"{index_prefix}.index"
faiss.write_index(index, str(index_file))
print(f"✅ HNSW index built successfully at '{index_file}'")
except Exception as e:
print(f"💥 ERROR: HNSW index build failed. Exception: {e}")
raise
class HNSWSearcher(LeannBackendSearcherInterface):
def __init__(self, index_path: str, **kwargs):
from . import faiss
path = Path(index_path)
index_dir = path.parent
index_prefix = path.stem
metric_str = kwargs.get("distance_metric", "mips").lower()
metric_enum = get_metric_map().get(metric_str)
if metric_enum is None:
raise ValueError(f"Unsupported distance_metric '{metric_str}'.")
dimensions = kwargs.get("dimensions")
if not dimensions:
raise ValueError("Vector dimension not provided to HNSWSearcher.")
try:
# Load FAISS HNSW index
index_file = index_dir / f"{index_prefix}.index"
if not index_file.exists():
raise FileNotFoundError(f"HNSW index file not found at {index_file}")
self._index = faiss.read_index(str(index_file))
self.metric_str = metric_str
self.embedding_server_manager = HNSWEmbeddingServerManager()
print("✅ HNSW index loaded successfully.")
except Exception as e:
print(f"💥 ERROR: Failed to load HNSW index. Exception: {e}")
raise
def search(self, query: np.ndarray, top_k: int, **kwargs) -> Dict[str, Any]:
"""Search using HNSW index with optional recompute functionality"""
ef = kwargs.get("ef", 200) # Size of the dynamic candidate list for search
# Recompute parameters
recompute_neighbor_embeddings = kwargs.get("recompute_neighbor_embeddings", False)
zmq_port = kwargs.get("zmq_port", 5556)
embedding_model = kwargs.get("embedding_model", "sentence-transformers/all-mpnet-base-v2")
passages_file = kwargs.get("passages_file", None)
if recompute_neighbor_embeddings:
print(f"INFO: HNSW ZMQ mode enabled - ensuring embedding server is running")
if not self.embedding_server_manager.start_server(zmq_port, embedding_model, passages_file):
print(f"WARNING: Failed to start HNSW embedding server, falling back to standard search")
kwargs['recompute_neighbor_embeddings'] = False
if query.dtype != np.float32:
query = query.astype(np.float32)
if query.ndim == 1:
query = np.expand_dims(query, axis=0)
from . import faiss  # local import, matching the pattern used elsewhere in this module
# Normalize query if using cosine similarity
if self.metric_str == "cosine":
faiss.normalize_L2(query)
try:
# Set search parameter
self._index.hnsw.efSearch = ef
if recompute_neighbor_embeddings:
# A recompute-aware search would require custom HNSW traversal logic;
# fall back to the standard search for now.
print("WARNING: Recompute functionality for HNSW not yet implemented, using standard search")
# Standard FAISS search
distances, labels = self._index.search(query, top_k)
return {"labels": labels, "distances": distances}
except Exception as e:
print(f"💥 ERROR: HNSW search failed. Exception: {e}")
batch_size = query.shape[0]
return {"labels": np.full((batch_size, top_k), -1, dtype=np.int64),
"distances": np.full((batch_size, top_k), float('inf'), dtype=np.float32)}
def __del__(self):
if hasattr(self, 'embedding_server_manager'):
self.embedding_server_manager.stop_server()
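# A minimal usage sketch for this backend (illustrative only: the vector
# shapes and the "./demo.index" path are assumptions, and the searcher
# additionally expects a "demo.hnsw.meta.json" metadata file with an
# "embedding_model" entry next to the index):
#
#   import numpy as np
#   from leann_backend_hnsw.hnsw_backend import HNSWBackend
#
#   vectors = np.random.rand(1000, 768).astype(np.float32)
#   builder = HNSWBackend.builder(distance_metric="mips", M=32, efConstruction=200)
#   builder.build(vectors, "./demo.index")
#
#   searcher = HNSWBackend.searcher("./demo.index", distance_metric="mips")
#   query = np.random.rand(768).astype(np.float32)
#   result = searcher.search(query, top_k=10)
#   print(result["labels"], result["distances"])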

View File

@@ -0,0 +1,583 @@
#!/usr/bin/env python3
"""
HNSW-specific embedding server with the config.py dependencies removed.
Based on the DiskANN embedding server architecture.
"""
import pickle
import argparse
import threading
import time
from transformers import AutoTokenizer, AutoModel
import os
from contextlib import contextmanager
import zmq
import numpy as np
import msgpack
import json
from pathlib import Path
from typing import Dict, Any, Optional, Union
RED = "\033[91m"
RESET = "\033[0m"
def is_similarity_metric():
"""
Check if the metric type is similarity-based (like inner product).
0 = L2 (distance metric), 1 = Inner Product (similarity metric)
"""
return True  # Hardcoded for now: this server assumes METRIC_INNER_PRODUCT (1) in FAISS
# Function for E5-style average pooling
import torch
from torch import Tensor
import torch.nn.functional as F
def e5_average_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]
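# Shape sketch (illustrative): given last_hidden_states of shape
# (batch, seq_len, hidden) and attention_mask of shape (batch, seq_len),
# padded positions are zeroed out and the token sum is divided by the count
# of real tokens, yielding a (batch, hidden) mean-pooled embedding.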
class SimplePassageLoader:
"""
Simple passage loader that replaces config.py dependencies
"""
def __init__(self, passages_data: Optional[Dict[str, Any]] = None):
self.passages_data = passages_data or {}
def __getitem__(self, passage_id: Union[str, int]) -> Dict[str, str]:
"""Get passage by ID"""
str_id = str(passage_id)
if str_id in self.passages_data:
return {"text": self.passages_data[str_id]}
else:
# Return empty text for missing passages
return {"text": ""}
def __len__(self) -> int:
return len(self.passages_data)
def load_passages_from_file(passages_file: str) -> SimplePassageLoader:
"""
Load passages from a JSON file
Expected format: {"passage_id": "passage_text", ...}
"""
if not os.path.exists(passages_file):
print(f"Warning: Passages file {passages_file} not found. Using empty loader.")
return SimplePassageLoader()
try:
with open(passages_file, 'r', encoding='utf-8') as f:
passages_data = json.load(f)
print(f"Loaded {len(passages_data)} passages from {passages_file}")
return SimplePassageLoader(passages_data)
except Exception as e:
print(f"Error loading passages from {passages_file}: {e}")
return SimplePassageLoader()
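# Format sketch for the passages file (the "passages.json" filename is an
# illustrative assumption):
#
#   import json
#   with open("passages.json", "w", encoding="utf-8") as f:
#       json.dump({"0": "first passage text", "1": "second passage text"}, f)
#   loader = load_passages_from_file("passages.json")
#   assert loader["0"]["text"] == "first passage text"
#   assert loader["missing"]["text"] == ""  # unknown IDs return empty text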
def create_hnsw_embedding_server(
passages_file: Optional[str] = None,
passages_data: Optional[Dict[str, str]] = None,
embeddings_file: Optional[str] = None,
use_fp16: bool = True,
use_int8: bool = False,
use_cuda_graphs: bool = False,
zmq_port: int = 5555,
max_batch_size: int = 128,
model_name: str = "sentence-transformers/all-mpnet-base-v2",
custom_max_length_param: Optional[int] = None,
):
"""
Create and start a ZMQ-based embedding server for HNSW backend.
Args:
passages_file: Path to JSON file containing passage ID -> text mapping
passages_data: Direct passage data dict (alternative to passages_file)
embeddings_file: Path to pre-computed embeddings file (optional)
use_fp16: Whether to use FP16 precision
use_int8: Whether to use INT8 quantization
use_cuda_graphs: Whether to use CUDA graphs
zmq_port: ZMQ port to bind to
max_batch_size: Maximum batch size for processing
model_name: Transformer model name
custom_max_length_param: Custom max sequence length
"""
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
# Device setup
mps_available = hasattr(torch.backends, 'mps') and torch.backends.mps.is_available()
cuda_available = torch.cuda.is_available()
print(f"MPS available: {mps_available}")
print(f"CUDA available: {cuda_available}")
if cuda_available:
device = torch.device("cuda")
print("Using CUDA device")
elif mps_available:
device = torch.device("mps")
print("Using MPS device (Apple Silicon)")
else:
device = torch.device("cpu")
print("Using CPU device (no GPU acceleration available)")
# Load model to the appropriate device
print(f"Starting HNSW server on port {zmq_port} with model {model_name}")
model = AutoModel.from_pretrained(model_name).to(device).eval()
# Check port availability
import socket
def check_port(port):
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
return s.connect_ex(('localhost', port)) == 0
if check_port(zmq_port):
print(f"{RED}Port {zmq_port} is already in use{RESET}")
return
# Apply model optimizations (similar to DiskANN version)
if use_fp16 and (cuda_available or mps_available):
model = model.half()
model = torch.compile(model)
print(f"Using FP16 precision with model: {model_name}")
elif use_int8:
print("- Using TorchAO for Int8 dynamic activation and Int8 weight quantization")
from torchao.quantization import quantize_, Int8DynamicActivationInt8WeightConfig
quantize_(model, Int8DynamicActivationInt8WeightConfig())
model = torch.compile(model)
model.eval()
print("- Model successfully quantized and compiled")
# Load passages
if passages_data:
passages = SimplePassageLoader(passages_data)
print(f"Using provided passages data: {len(passages)} passages")
elif passages_file:
passages = load_passages_from_file(passages_file)
else:
passages = SimplePassageLoader()
print("No passages provided, using empty loader")
# Load embeddings if provided
_embeddings = None
if embeddings_file and os.path.exists(embeddings_file):
try:
with open(embeddings_file, "rb") as f:
_embeddings = pickle.load(f)
print(f"Loaded embeddings from {embeddings_file}")
except Exception as e:
print(f"Error loading embeddings: {e}")
class DeviceTimer:
"""Device event-based timer for accurate timing."""
def __init__(self, name="", device=device):
self.name = name
self.device = device
self.start_time = 0
self.end_time = 0
if cuda_available:
self.start_event = torch.cuda.Event(enable_timing=True)
self.end_event = torch.cuda.Event(enable_timing=True)
else:
self.start_event = None
self.end_event = None
@contextmanager
def timing(self):
self.start()
yield
self.end()
def start(self):
if cuda_available:
torch.cuda.synchronize()
self.start_event.record()
else:
if self.device.type == "mps":
torch.mps.synchronize()
self.start_time = time.time()
def end(self):
if cuda_available:
self.end_event.record()
torch.cuda.synchronize()
else:
if self.device.type == "mps":
torch.mps.synchronize()
self.end_time = time.time()
def elapsed_time(self):
if cuda_available:
return self.start_event.elapsed_time(self.end_event) / 1000.0
else:
return self.end_time - self.start_time
def print_elapsed(self):
return # Disabled for now
def process_batch(texts_batch, ids_batch, missing_ids):
"""Process a batch of texts and return embeddings"""
_is_e5_model = "e5" in model_name.lower()
batch_size = len(texts_batch)
# E5 model preprocessing
if _is_e5_model:
processed_texts_batch = [f"passage: {text}" for text in texts_batch]
else:
processed_texts_batch = texts_batch
# Set max length
if _is_e5_model:
current_max_length = custom_max_length_param if custom_max_length_param is not None else 512
else:
current_max_length = custom_max_length_param if custom_max_length_param is not None else 256
tokenize_timer = DeviceTimer("tokenization (batch)", device)
to_device_timer = DeviceTimer("transfer to device (batch)", device)
embed_timer = DeviceTimer("embedding (batch)", device)
pool_timer = DeviceTimer("pooling (batch)", device)
norm_timer = DeviceTimer("normalization (batch)", device)
with tokenize_timer.timing():
encoded_batch = tokenizer(
processed_texts_batch,
padding="max_length",
truncation=True,
max_length=current_max_length,
return_tensors="pt",
return_token_type_ids=False,
)
seq_length = encoded_batch["input_ids"].size(1)
with to_device_timer.timing():
enc = {k: v.to(device) for k, v in encoded_batch.items()}
with torch.no_grad():
with embed_timer.timing():
out = model(enc["input_ids"], enc["attention_mask"])
with pool_timer.timing():
if not hasattr(out, 'last_hidden_state'):
if isinstance(out, torch.Tensor) and len(out.shape) == 2:
pooled_embeddings = out
else:
print(f"{RED}ERROR: Cannot determine how to pool. Output shape: {out.shape if isinstance(out, torch.Tensor) else 'N/A'}{RESET}")
hidden_dim = getattr(model.config, 'hidden_size', 384 if _is_e5_model else 768)
pooled_embeddings = torch.zeros((batch_size, hidden_dim), device=device, dtype=torch.float32)  # embeddings are float, not the integer dtype of input_ids
elif _is_e5_model:
pooled_embeddings = e5_average_pool(out.last_hidden_state, enc['attention_mask'])
else:
hidden_states = out.last_hidden_state
mask_expanded = enc["attention_mask"].unsqueeze(-1).expand(hidden_states.size()).float()
sum_embeddings = torch.sum(hidden_states * mask_expanded, 1)
sum_mask = torch.clamp(mask_expanded.sum(1), min=1e-9)
pooled_embeddings = sum_embeddings / sum_mask
final_embeddings = pooled_embeddings
if _is_e5_model:
with norm_timer.timing():
final_embeddings = F.normalize(pooled_embeddings, p=2, dim=1)
if torch.isnan(final_embeddings).any() or torch.isinf(final_embeddings).any():
print(f"{RED}!!! In process_batch: NaN or Inf detected in final_embeddings! "
f"Model: {model_name}, E5: {_is_e5_model}. IDs (sample): {ids_batch[:5]}...{RESET}")
dim_size = final_embeddings.shape[-1]
error_output = torch.zeros((batch_size, dim_size), device='cpu', dtype=torch.float32).numpy()
print(f"{RED}Returning zero embeddings of shape ({batch_size}, {dim_size}) due to NaN/Inf.{RESET}")
return error_output
return final_embeddings.cpu().numpy()
def client_warmup(zmq_port):
"""Perform client-side warmup"""
time.sleep(2)
print(f"Performing client-side warmup with model {model_name}...")
sample_ids = ["1", "2", "3", "4", "5"]
try:
context = zmq.Context()
socket = context.socket(zmq.REQ)
socket.connect(f"tcp://localhost:{zmq_port}")
socket.setsockopt(zmq.RCVTIMEO, 30000)
socket.setsockopt(zmq.SNDTIMEO, 30000)
try:
ids_to_send = [int(x) for x in sample_ids]
except ValueError:
ids_to_send = []
if not ids_to_send:
print("Skipping warmup send.")
return
request_payload = [ids_to_send]
request_bytes = msgpack.packb(request_payload)
for i in range(3):
print(f"Sending warmup request {i+1}/3 via ZMQ (MessagePack)...")
socket.send(request_bytes)
response_bytes = socket.recv()
response_payload = msgpack.unpackb(response_bytes)
dimensions = response_payload[0]
embeddings_count = dimensions[0] if dimensions and len(dimensions) > 0 else 0
print(f"Warmup request {i+1}/3 successful, received {embeddings_count} embeddings")
time.sleep(0.1)
print("Client-side MessagePack ZMQ warmup complete")
socket.close()
context.term()
except Exception as e:
print(f"Error during MessagePack ZMQ warmup: {e}")
def zmq_server_thread():
"""ZMQ server thread"""
context = zmq.Context()
socket = context.socket(zmq.REP)
socket.bind(f"tcp://*:{zmq_port}")
print(f"HNSW ZMQ server listening on port {zmq_port}")
socket.setsockopt(zmq.RCVTIMEO, 300000)
socket.setsockopt(zmq.SNDTIMEO, 300000)
while True:
try:
message_bytes = socket.recv()
print(f"Received ZMQ request of size {len(message_bytes)} bytes")
e2e_start = time.time()
lookup_timer = DeviceTimer("text lookup", device)
try:
request_payload = msgpack.unpackb(message_bytes)
# Handle distance calculation requests
if isinstance(request_payload, list) and len(request_payload) == 2 and isinstance(request_payload[0], list) and isinstance(request_payload[1], list):
node_ids = request_payload[0]
query_vector = np.array(request_payload[1], dtype=np.float32)
print(f"Request for distance calculation: {len(node_ids)} nodes, query vector dim: {len(query_vector)}")
# Get embeddings for node IDs
texts = []
missing_ids = []
with lookup_timer.timing():
for nid in node_ids:
txtinfo = passages[nid]
if txtinfo is None or txtinfo["text"] == "":
print(f"Warning: Passage with ID {nid} not found")
missing_ids.append(nid)
txt = ""
else:
txt = txtinfo["text"]
texts.append(txt)
lookup_timer.print_elapsed()
# Process embeddings in chunks if needed
all_node_embeddings = []
total_size = len(texts)
if total_size > max_batch_size:
for i in range(0, total_size, max_batch_size):
end_idx = min(i + max_batch_size, total_size)
chunk_texts = texts[i:end_idx]
chunk_ids = node_ids[i:end_idx]
embeddings_chunk = process_batch(chunk_texts, chunk_ids, missing_ids)
all_node_embeddings.append(embeddings_chunk)
if cuda_available:
torch.cuda.empty_cache()
elif device.type == "mps":
torch.mps.empty_cache()
node_embeddings = np.vstack(all_node_embeddings)
else:
node_embeddings = process_batch(texts, node_ids, missing_ids)
# Calculate distances
query_tensor = torch.tensor(query_vector, device=device).float()
node_embeddings_tensor = torch.tensor(node_embeddings, device=device).float()
calc_timer = DeviceTimer("distance calculation", device)
with calc_timer.timing():
with torch.no_grad():
if is_similarity_metric():
node_embeddings_np = node_embeddings_tensor.cpu().numpy()
query_np = query_tensor.cpu().numpy()
distances = -np.dot(node_embeddings_np, query_np)
else:
node_embeddings_np = node_embeddings_tensor.cpu().numpy().astype(np.float32)
query_np = query_tensor.cpu().numpy().astype(np.float32)
distances = np.sum(np.square(node_embeddings_np - query_np.reshape(1, -1)), axis=1)
calc_timer.print_elapsed()
try:
response_payload = distances.flatten().tolist()
response_bytes = msgpack.packb([response_payload], use_single_float=True)
print(f"Sending distance response with {len(distances)} distances")
except Exception as pack_error:
print(f"Error packing MessagePack distance response: {pack_error}")
response_bytes = msgpack.packb([[]])
socket.send(response_bytes)
if device.type == "cuda":
torch.cuda.synchronize()
elif device.type == "mps":
torch.mps.synchronize()
e2e_end = time.time()
print(f"Distance calculation E2E time: {e2e_end - e2e_start:.6f} seconds")
continue
# Standard embedding request
if not isinstance(request_payload, list) or len(request_payload) != 1 or not isinstance(request_payload[0], list):
print(f"Error: Invalid MessagePack request format. Expected [[ids...]], got: {type(request_payload)}")
socket.send(msgpack.packb([[], []]))
continue
node_ids = request_payload[0]
print(f"Request for {len(node_ids)} node embeddings")
except Exception as unpack_error:
print(f"Error unpacking MessagePack request: {unpack_error}")
socket.send(msgpack.packb([[], []]))
continue
# Look up texts by node IDs
texts = []
missing_ids = []
with lookup_timer.timing():
for nid in node_ids:
txtinfo = passages[nid]
if txtinfo is None or txtinfo["text"] == "":
print(f"Warning: Passage with ID {nid} not found")
missing_ids.append(nid)
txt = ""
else:
txt = txtinfo["text"]
texts.append(txt)
lookup_timer.print_elapsed()
if missing_ids:
print(f"Missing passages for IDs: {missing_ids}")
# Process in chunks
total_size = len(texts)
print(f"Total batch size: {total_size}, max_batch_size: {max_batch_size}")
all_embeddings = []
if total_size > max_batch_size:
print(f"Splitting batch of size {total_size} into chunks of {max_batch_size}")
for i in range(0, total_size, max_batch_size):
end_idx = min(i + max_batch_size, total_size)
print(f"Processing chunk {i//max_batch_size + 1}/{(total_size + max_batch_size - 1)//max_batch_size}: items {i} to {end_idx-1}")
chunk_texts = texts[i:end_idx]
chunk_ids = node_ids[i:end_idx]
embeddings_chunk = process_batch(chunk_texts, chunk_ids, missing_ids)
all_embeddings.append(embeddings_chunk)
if cuda_available:
torch.cuda.empty_cache()
elif device.type == "mps":
torch.mps.empty_cache()
hidden = np.vstack(all_embeddings)
print(f"Combined embeddings shape: {hidden.shape}")
else:
hidden = process_batch(texts, node_ids, missing_ids)
# Serialization and response
ser_start = time.time()
print(f"DEBUG zmq_server_thread: Final 'hidden' array | Shape: {hidden.shape} | Dtype: {hidden.dtype} | Has NaN/Inf: {np.isnan(hidden).any() or np.isinf(hidden).any()}")
if np.isnan(hidden).any() or np.isinf(hidden).any():
print(f"{RED}!!! ERROR: NaN or Inf detected in final 'hidden' numpy array BEFORE sending! "
f"Requested IDs (sample): {node_ids[:5]}...{RESET}")
raise ValueError("NaN/Inf detected in final embeddings; aborting response")
try:
hidden_contiguous_f32 = np.ascontiguousarray(hidden, dtype=np.float32)
response_payload = [
list(hidden_contiguous_f32.shape),
hidden_contiguous_f32.flatten().tolist()
]
response_bytes = msgpack.packb(response_payload, use_single_float=True)
except Exception as pack_error:
print(f"Error packing MessagePack response: {pack_error}")
response_bytes = msgpack.packb([[], []])
socket.send(response_bytes)
ser_end = time.time()
print(f"Serialize time: {ser_end - ser_start:.6f} seconds")
if device.type == "cuda":
torch.cuda.synchronize()
elif device.type == "mps":
torch.mps.synchronize()
e2e_end = time.time()
print(f"ZMQ E2E time: {e2e_end - e2e_start:.6f} seconds")
except zmq.Again:
print("ZMQ socket timeout, continuing to listen")
continue
except Exception as e:
print(f"Error in ZMQ server loop: {e}")
import traceback
traceback.print_exc()
try:
socket.send(msgpack.packb([[], []]))
except Exception:
pass
# Start warmup and server threads
if len(passages) > 0:
warmup_thread = threading.Thread(target=client_warmup, args=(zmq_port,))
warmup_thread.daemon = True
warmup_thread.start()
zmq_thread = threading.Thread(target=zmq_server_thread, daemon=True)
zmq_thread.start()
print(f"Started HNSW ZMQ server thread on port {zmq_port}")
# Keep the main thread alive
try:
while True:
time.sleep(1)
except KeyboardInterrupt:
print("HNSW Server shutting down...")
return
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="HNSW Embedding service")
parser.add_argument("--zmq-port", type=int, default=5555, help="ZMQ port to run on")
parser.add_argument("--passages-file", type=str, help="JSON file containing passage ID to text mapping")
parser.add_argument("--embeddings-file", type=str, help="Pickle file containing pre-computed embeddings")
parser.add_argument("--use-fp16", action="store_true", default=False)
parser.add_argument("--use-int8", action="store_true", default=False)
parser.add_argument("--use-cuda-graphs", action="store_true", default=False)
parser.add_argument("--max-batch-size", type=int, default=128, help="Maximum batch size before splitting")
parser.add_argument("--model-name", type=str, default="sentence-transformers/all-mpnet-base-v2",
help="Embedding model name")
parser.add_argument("--custom-max-length", type=int, default=None, help="Override model's default max sequence length")
args = parser.parse_args()
# Create and start the HNSW embedding server
create_hnsw_embedding_server(
passages_file=args.passages_file,
embeddings_file=args.embeddings_file,
use_fp16=args.use_fp16,
use_int8=args.use_int8,
use_cuda_graphs=args.use_cuda_graphs,
zmq_port=args.zmq_port,
max_batch_size=args.max_batch_size,
model_name=args.model_name,
custom_max_length_param=args.custom_max_length,
)
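# For reference, a minimal client sketch for the wire protocol implemented
# above (MessagePack over a ZMQ REQ socket; the port and vector dimension are
# assumptions matching the defaults in this file):
#
#   import zmq, msgpack
#   ctx = zmq.Context()
#   sock = ctx.socket(zmq.REQ)
#   sock.connect("tcp://localhost:5555")
#   # Embedding request: [[ids]] -> [shape, flat_float32_values]
#   sock.send(msgpack.packb([[0, 1, 2]]))
#   shape, flat = msgpack.unpackb(sock.recv())
#   # Distance request: [[ids], query_vector] -> [distances]
#   sock.send(msgpack.packb([[0, 1], [0.0] * 768]))
#   (distances,) = msgpack.unpackb(sock.recv())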

View File

@@ -0,0 +1,18 @@
# File: packages/leann-backend-hnsw/pyproject.toml
[build-system]
requires = ["scikit-build-core>=0.10", "numpy", "swig"]
build-backend = "scikit_build_core.build"
[project]
name = "leann-backend-hnsw"
version = "0.1.0"
description = "Custom-built HNSW (Faiss) backend for the Leann toolkit."
dependencies = ["leann-core==0.1.0", "numpy"]
# Revert to the most standard scikit-build-core configuration
[tool.scikit-build]
wheel.packages = ["leann_backend_hnsw"]
editable.mode = "redirect"
cmake.build-type = "Debug"
build.verbose = true
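# With the configuration above, a local editable install is typically:
#   pip install -e packages/leann-backend-hnsw
# (the path is an assumption based on the file comment at the top)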

View File

@@ -0,0 +1,88 @@
---
AccessModifierOffset: -1
AlignAfterOpenBracket: AlwaysBreak
AlignConsecutiveAssignments: false
AlignConsecutiveDeclarations: false
AlignEscapedNewlinesLeft: true
AlignOperands: false
AlignTrailingComments: true
AllowAllParametersOfDeclarationOnNextLine: false
AllowShortBlocksOnASingleLine: false
AllowShortCaseLabelsOnASingleLine: false
AllowShortFunctionsOnASingleLine: Empty
AllowShortIfStatementsOnASingleLine: false
AllowShortLoopsOnASingleLine: false
AlwaysBreakAfterReturnType: None
AlwaysBreakBeforeMultilineStrings: true
AlwaysBreakTemplateDeclarations: true
BinPackArguments: false # at some point, set this to true
BinPackParameters: false # at some point, set this to true
BraceWrapping:
AfterClass: false
AfterControlStatement: false
AfterEnum: false
AfterFunction: false
AfterNamespace: false
AfterObjCDeclaration: false
AfterStruct: false
AfterUnion: false
BeforeCatch: false
BeforeElse: false
IndentBraces: false
BreakBeforeBinaryOperators: None
BreakBeforeBraces: Attach
BreakBeforeTernaryOperators: true
BreakConstructorInitializersBeforeComma: false
BreakAfterJavaFieldAnnotations: false
BreakStringLiterals: false
ColumnLimit: 80
CommentPragmas: '^ IWYU pragma:'
CompactNamespaces: false
ConstructorInitializerAllOnOneLineOrOnePerLine: true
ConstructorInitializerIndentWidth: 8
ContinuationIndentWidth: 8
Cpp11BracedListStyle: true
DerivePointerAlignment: false
DisableFormat: false
ForEachMacros: [ FOR_EACH_RANGE, FOR_EACH, ]
IncludeCategories:
- Regex: '^<.*\.h(pp)?>'
Priority: 1
- Regex: '^<.*'
Priority: 2
- Regex: '.*'
Priority: 3
IndentCaseLabels: true
IndentWidth: 4
IndentWrappedFunctionNames: false
KeepEmptyLinesAtTheStartOfBlocks: false
MacroBlockBegin: ''
MacroBlockEnd: ''
MaxEmptyLinesToKeep: 1
NamespaceIndentation: None
ObjCBlockIndentWidth: 4
ObjCSpaceAfterProperty: false
ObjCSpaceBeforeProtocolList: false
PenaltyBreakBeforeFirstCallParameter: 1
PenaltyBreakComment: 300
PenaltyBreakFirstLessLess: 120
PenaltyBreakString: 1000
PenaltyExcessCharacter: 1000000
PenaltyReturnTypeOnItsOwnLine: 2000000
PointerAlignment: Left
ReflowComments: true
SortIncludes: true
SpaceAfterCStyleCast: false
SpaceBeforeAssignmentOperators: true
SpaceBeforeParens: ControlStatements
SpaceInEmptyParentheses: false
SpacesBeforeTrailingComments: 1
SpacesInAngles: false
SpacesInContainerLiterals: true
SpacesInCStyleCastParentheses: false
SpacesInParentheses: false
SpacesInSquareBrackets: false
Standard: Cpp11
TabWidth: 8
UseTab: Never
...

View File

@@ -0,0 +1 @@
sift1M

View File

@@ -0,0 +1,33 @@
# Summary
<!-- Facebook has a bounty program for the safe disclosure of security bugs. In
those cases, please go through the process outlined on that page and do not
file a public issue. -->
# Platform
<!-- if the question/problem is not platform-specific, please ignore this -->
OS: <!-- e.g. macOS 10.13.3 -->
Faiss version: <!-- git commit, e.g. 56383610bcb982d6591e2e2bea3516cb7723e04a -->
Installed from: <!-- anaconda? compiled by yourself ? -->
Faiss compilation options: <!-- e.g. using MKL with compile flags ... -->
Running on:
- [ ] CPU
- [ ] GPU
Interface:
- [ ] C++
- [ ] Python
# Reproduction instructions
<!-- Please provide specific and comprehensive instructions to reproduce the
described behavior. -->
<!-- Please *do not* post screenshots of logs. They are not searchable. Copy/paste
the text or make a gist if the text is too bulky. -->

View File

@@ -0,0 +1,189 @@
name: Build cmake
inputs:
opt_level:
description: 'Compile options / optimization level.'
required: false
default: generic
gpu:
description: 'Enable GPU support.'
required: false
default: OFF
cuvs:
description: 'Enable cuVS support.'
required: false
default: OFF
rocm:
description: 'Enable ROCm support.'
required: false
default: OFF
runs:
using: composite
steps:
- name: Setup miniconda
uses: conda-incubator/setup-miniconda@v3
with:
python-version: '3.11'
miniforge-version: latest # ensures conda-forge channel is used.
channels: conda-forge
conda-remove-defaults: 'true'
# Set to aarch64 if we're on arm64 because there's no miniforge ARM64 package, just aarch64.
# They are the same thing, just named differently.
architecture: ${{ runner.arch == 'ARM64' && 'aarch64' || runner.arch }}
- name: Configure build environment
shell: bash
run: |
# initialize Conda
conda config --set solver libmamba
# Ensure starting packages are from conda-forge.
conda list --show-channel-urls
conda update -y -q conda
echo "$CONDA/bin" >> $GITHUB_PATH
conda install -y -q python=3.11 cmake=3.26 make=4.2 swig=4.0 "numpy<2" scipy=1.14 pytest=7.4 gflags=2.2
# install base packages for ARM64
if [ "${{ runner.arch }}" = "ARM64" ]; then
conda install -y -q -c conda-forge openblas=0.3.29 gxx_linux-aarch64=14.2 sysroot_linux-aarch64=2.17
fi
# install base packages for X86_64
if [ "${{ runner.arch }}" = "X64" ]; then
# TODO: merge this with ARM64
conda install -y -q -c conda-forge gxx_linux-64=14.2 sysroot_linux-64=2.17
conda install -y -q mkl=2022.2.1 mkl-devel=2022.2.1
fi
# no CUDA needed for ROCm so skip this
if [ "${{ inputs.rocm }}" = "ON" ]; then
:
# regular CUDA for GPU builds
elif [ "${{ inputs.gpu }}" = "ON" ] && [ "${{ inputs.cuvs }}" = "OFF" ]; then
conda install -y -q cuda-toolkit=12.4 -c "nvidia/label/cuda-12.4.0"
# and CUDA from cuVS channel for cuVS builds
elif [ "${{ inputs.cuvs }}" = "ON" ]; then
conda install -y -q libcuvs=24.12 'cuda-version>=12.0,<=12.5' cuda-toolkit=12.4.1 gxx_linux-64=12.4 -c rapidsai -c conda-forge
fi
# install test packages
if [ "${{ inputs.rocm }}" = "ON" ]; then
: # skip torch install via conda, we need to install via pip to get
# ROCm-enabled version until it's supported in conda by PyTorch
elif [ "${{ inputs.gpu }}" = "ON" ]; then
conda install -y -q "pytorch<2.5" pytorch-cuda=12.4 -c pytorch -c "nvidia/label/cuda-12.4.0"
else
conda install -y -q "pytorch<2.5" -c pytorch
fi
- name: ROCm - Install dependencies
if: inputs.rocm == 'ON'
shell: bash
run: |
# Update repos and install kmod, wget, gpg
sudo apt-get -qq update >/dev/null
sudo apt-get -qq install -y kmod wget gpg >/dev/null
# Get UBUNTU version name
UBUNTU_VERSION_NAME=`cat /etc/os-release | grep UBUNTU_CODENAME | awk -F= '{print $2}'`
# Set ROCm version
ROCM_VERSION="6.2"
# Download, prepare, and install the package signing key
mkdir --parents --mode=0755 /etc/apt/keyrings
wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | gpg --dearmor | sudo tee /etc/apt/keyrings/rocm.gpg > /dev/null
# Add rocm repository
wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | sudo apt-key add -
rocm_baseurl="http://repo.radeon.com/rocm/apt/${ROCM_VERSION}"
echo "deb [arch=amd64] ${rocm_baseurl} ${UBUNTU_VERSION_NAME} main" | sudo tee /etc/apt/sources.list.d/rocm.list
sudo apt-get -qq update --allow-insecure-repositories >/dev/null
sudo apt-get -qq install -y --allow-unauthenticated \
"rocm-dev${ROCM_VERSION}" "rocm-utils${ROCM_VERSION}" \
"rocm-libs${ROCM_VERSION}" >/dev/null
# Fake presence of MI200-class accelerators
echo "gfx90a" | sudo tee /opt/rocm/bin/target.lst
# Cleanup
sudo apt-get -qq autoclean >/dev/null
sudo apt-get -qq clean >/dev/null
sudo rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
- name: Symlink system dependencies
if: inputs.rocm == 'ON'
shell: bash
run: |
# symlink system libraries for HIP compiler
sudo ln -s /lib/x86_64-linux-gnu/libc.so.6 /lib64/libc.so.6
sudo ln -s /lib/x86_64-linux-gnu/libc_nonshared.a /usr/lib64/libc_nonshared.a
sudo ln -s /usr/lib/x86_64-linux-gnu/libpthread.so.0 /lib64/libpthread.so.0
sudo ln -s $HOME/miniconda3/x86_64-conda-linux-gnu/sysroot/usr/lib64/libpthread_nonshared.a /usr/lib64/libpthread_nonshared.a
- name: Build all targets
shell: bash
run: |
eval "$(conda shell.bash hook)"
conda activate
cmake -B build \
-DBUILD_TESTING=ON \
-DBUILD_SHARED_LIBS=ON \
-DFAISS_ENABLE_GPU=${{ inputs.gpu }} \
-DFAISS_ENABLE_CUVS=${{ inputs.cuvs }} \
-DFAISS_ENABLE_ROCM=${{ inputs.rocm }} \
-DFAISS_OPT_LEVEL=${{ inputs.opt_level }} \
-DFAISS_ENABLE_C_API=ON \
-DPYTHON_EXECUTABLE=$CONDA/bin/python \
-DCMAKE_BUILD_TYPE=Release \
-DBLA_VENDOR=${{ runner.arch == 'X64' && 'Intel10_64_dyn' || '' }} \
-DCMAKE_CUDA_FLAGS=${{ runner.arch == 'X64' && '"-gencode arch=compute_75,code=sm_75"' || '' }} \
.
make -k -C build -j$(nproc)
- name: C++ tests
shell: bash
run: |
export GTEST_OUTPUT="xml:$(realpath .)/test-results/googletest/"
make -C build test
- name: C++ perf benchmarks
shell: bash
if: inputs.rocm == 'OFF'
run: |
find ./build/perf_tests/ -executable -type f -name "bench*" -exec '{}' -v \;
- name: Install Python extension
shell: bash
working-directory: build/faiss/python
run: |
$CONDA/bin/python setup.py install
- name: ROCm - install ROCm-enabled torch via pip
if: inputs.rocm == 'ON'
shell: bash
run: |
pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.1
- name: Python tests (CPU only)
if: inputs.gpu == 'OFF'
shell: bash
run: |
pytest --junitxml=test-results/pytest/results.xml tests/test_*.py
pytest --junitxml=test-results/pytest/results-torch.xml tests/torch_*.py
- name: Python tests (CPU + GPU)
if: inputs.gpu == 'ON'
shell: bash
run: |
pytest --junitxml=test-results/pytest/results.xml tests/test_*.py
pytest --junitxml=test-results/pytest/results-torch.xml tests/torch_*.py
cp tests/common_faiss_tests.py faiss/gpu/test
pytest --junitxml=test-results/pytest/results-gpu.xml faiss/gpu/test/test_*.py
pytest --junitxml=test-results/pytest/results-gpu-torch.xml faiss/gpu/test/torch_*.py
- name: Test avx2 loading
if: inputs.opt_level == 'avx2'
shell: bash
run: |
FAISS_DISABLE_CPU_FEATURES=AVX2 LD_DEBUG=libs $CONDA/bin/python -c "import faiss" 2>&1 | grep faiss.so
LD_DEBUG=libs $CONDA/bin/python -c "import faiss" 2>&1 | grep faiss_avx2.so
- name: Upload test results
if: always()
uses: actions/upload-artifact@v4
with:
name: test-results-arch=${{ runner.arch }}-opt=${{ inputs.opt_level }}-gpu=${{ inputs.gpu }}-cuvs=${{ inputs.cuvs }}-rocm=${{ inputs.rocm }}
path: test-results
- name: Check installed packages channel
shell: bash
run: |
# Shows that all installed packages are from conda-forge.
conda list --show-channel-urls

View File

@@ -0,0 +1,107 @@
name: Conda build
description: Builds Faiss inside a Conda environment and uploads to repository when label is provided.
inputs:
label:
description: "The label to be used for uploads to Conda."
default: ""
required: false
cuda:
description: "CUDA toolkit version to use."
default: ""
required: false
cuvs:
description: "Enable cuVS support."
default: ""
required: false
runs:
using: composite
steps:
- name: Choose shell
shell: bash
id: choose_shell
run: |
# Use pwsh on Windows; bash everywhere else
if [ "${{ runner.os }}" != "Windows" ]; then
echo "shell=bash" >> "$GITHUB_OUTPUT"
else
echo "shell=pwsh" >> "$GITHUB_OUTPUT"
fi
- name: Setup miniconda
uses: conda-incubator/setup-miniconda@v3
with:
python-version: '3.11'
miniforge-version: latest # ensures conda-forge channel is used.
channels: conda-forge
conda-remove-defaults: 'true'
# Set to runner.arch=aarch64 if we're on arm64 because
# there's no miniforge ARM64 package, just aarch64.
# They are the same thing, just named differently.
# However there is an ARM64 for macOS, so exclude that.
architecture: ${{ (runner.arch == 'ARM64' && runner.os != 'macOS') && 'aarch64' || runner.arch }}
- name: Install conda build tools
shell: ${{ steps.choose_shell.outputs.shell }}
run: |
# Ensure starting packages are from conda-forge.
conda list --show-channel-urls
conda install -y -q "conda!=24.11.0"
conda install -y -q "conda-build!=24.11.0" "liblief=0.14.1"
conda list --show-channel-urls
- name: Enable anaconda uploads
if: inputs.label != ''
shell: ${{ steps.choose_shell.outputs.shell }}
env:
PACKAGE_TYPE: ${{ inputs.label }}
run: |
conda install -y -q anaconda-client
conda config --set anaconda_upload yes
- name: Conda build (CPU)
if: inputs.label == '' && inputs.cuda == ''
shell: ${{ steps.choose_shell.outputs.shell }}
working-directory: conda
run: |
conda build faiss --python 3.11 -c pytorch
- name: Conda build (CPU) w/ anaconda upload
if: inputs.label != '' && inputs.cuda == ''
shell: ${{ steps.choose_shell.outputs.shell }}
working-directory: conda
env:
PACKAGE_TYPE: ${{ inputs.label }}
run: |
conda build faiss --user pytorch --label ${{ inputs.label }} -c pytorch
- name: Conda build (GPU)
if: inputs.label == '' && inputs.cuda != '' && inputs.cuvs == ''
shell: ${{ steps.choose_shell.outputs.shell }}
working-directory: conda
run: |
conda build faiss-gpu --variants '{ "cudatoolkit": "${{ inputs.cuda }}" }' \
-c pytorch -c nvidia/label/cuda-${{ inputs.cuda }} -c nvidia
- name: Conda build (GPU) w/ anaconda upload
if: inputs.label != '' && inputs.cuda != '' && inputs.cuvs == ''
shell: ${{ steps.choose_shell.outputs.shell }}
working-directory: conda
env:
PACKAGE_TYPE: ${{ inputs.label }}
run: |
conda build faiss-gpu --variants '{ "cudatoolkit": "${{ inputs.cuda }}" }' \
--user pytorch --label ${{ inputs.label }} -c pytorch -c nvidia/label/cuda-${{ inputs.cuda }} -c nvidia
- name: Conda build (GPU w/ cuVS)
if: inputs.label == '' && inputs.cuda != '' && inputs.cuvs != ''
shell: ${{ steps.choose_shell.outputs.shell }}
working-directory: conda
run: |
conda build faiss-gpu-cuvs --variants '{ "cudatoolkit": "${{ inputs.cuda }}" }' \
-c pytorch -c rapidsai -c rapidsai-nightly -c conda-forge -c nvidia
- name: Conda build (GPU w/ cuVS) w/ anaconda upload
if: inputs.label != '' && inputs.cuda != '' && inputs.cuvs != ''
shell: ${{ steps.choose_shell.outputs.shell }}
working-directory: conda
env:
PACKAGE_TYPE: ${{ inputs.label }}
run: |
conda build faiss-gpu-cuvs --variants '{ "cudatoolkit": "${{ inputs.cuda }}" }' \
--user pytorch --label ${{ inputs.label }} -c pytorch -c rapidsai -c rapidsai-nightly -c conda-forge -c nvidia
- name: Check installed packages channel
shell: ${{ steps.choose_shell.outputs.shell }}
run: |
# Shows that all installed packages are from conda-forge.
conda list --show-channel-urls

View File

@@ -0,0 +1,23 @@
name: Close Inactive Issues
on:
schedule:
- cron: "30 1 * * *"
jobs:
close-issues:
runs-on: ubuntu-latest
permissions:
issues: write
pull-requests: write
steps:
- uses: actions/stale@v5
with:
only-labels: autoclose
days-before-issue-stale: 7
days-before-issue-close: 7
stale-issue-label: "stale"
stale-issue-message: "This issue is stale because it has been open for 7 days with no activity."
close-issue-message: "This issue was closed because it has been inactive for 7 days since being marked as stale."
days-before-pr-stale: -1
days-before-pr-close: -1
repo-token: ${{ secrets.GITHUB_TOKEN }}

View File

@@ -0,0 +1,169 @@
on:
workflow_call:
env:
OMP_NUM_THREADS: '10'
MKL_THREADING_LAYER: GNU
jobs:
format:
name: Format
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Install clang-format
run: |
sudo apt-get update -y
sudo apt-get install -y wget
sudo apt install -y lsb-release wget software-properties-common gnupg
wget https://apt.llvm.org/llvm.sh
chmod u+x llvm.sh
sudo ./llvm.sh 18
sudo apt-get install -y git-core clang-format-18
- name: Verify clang-format
run: |
git ls-files | grep -E '\.(cpp|h|cu|cuh)$' | xargs clang-format-18 -i
if git diff --quiet; then
echo "Formatting OK!"
else
echo "Formatting not OK!"
echo "------------------"
git --no-pager diff --color
exit 1
fi
linux-x86_64-cmake:
name: Linux x86_64 (cmake)
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Build and Test (cmake)
uses: ./.github/actions/build_cmake
linux-x86_64-AVX2-cmake:
name: Linux x86_64 AVX2 (cmake)
needs: linux-x86_64-cmake
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Build and Test (cmake)
uses: ./.github/actions/build_cmake
with:
opt_level: avx2
linux-x86_64-AVX512-cmake:
name: Linux x86_64 AVX512 (cmake)
needs: linux-x86_64-cmake
runs-on: faiss-aws-m7i.large
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Build and Test (cmake)
uses: ./.github/actions/build_cmake
with:
opt_level: avx512
linux-x86_64-AVX512_SPR-cmake:
name: Linux x86_64 AVX512_SPR (cmake)
needs: linux-x86_64-cmake
runs-on: faiss-aws-m7i.large
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Build and Test (cmake)
uses: ./.github/actions/build_cmake
with:
opt_level: avx512_spr
linux-x86_64-GPU-cmake:
name: Linux x86_64 GPU (cmake)
needs: linux-x86_64-cmake
runs-on: 4-core-ubuntu-gpu-t4
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Build and Test (cmake)
uses: ./.github/actions/build_cmake
with:
gpu: ON
linux-x86_64-GPU-w-CUVS-cmake:
name: Linux x86_64 GPU w/ cuVS (cmake)
needs: linux-x86_64-cmake
runs-on: 4-core-ubuntu-gpu-t4
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Build and Test (cmake)
uses: ./.github/actions/build_cmake
with:
gpu: ON
cuvs: ON
linux-x86_64-GPU-w-ROCm-cmake:
name: Linux x86_64 GPU w/ ROCm (cmake)
needs: linux-x86_64-cmake
runs-on: faiss-amd-MI200
container:
image: ubuntu:22.04
options: --device=/dev/kfd --device=/dev/dri --ipc=host --shm-size 16G --group-add video --cap-add=SYS_PTRACE --cap-add=SYS_ADMIN
steps:
- name: Container setup
run: |
if [ -f /.dockerenv ]; then
apt-get update && apt-get install -y sudo && apt-get install -y git
git config --global --add safe.directory '*'
else
echo 'Skipping. Current job is not running inside a container.'
fi
- name: Checkout
uses: actions/checkout@v4
- name: Build and Test (cmake)
uses: ./.github/actions/build_cmake
with:
gpu: ON
rocm: ON
linux-arm64-SVE-cmake:
name: Linux arm64 SVE (cmake)
needs: linux-x86_64-cmake
runs-on: faiss-aws-r8g.large
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Build and Test (cmake)
uses: ./.github/actions/build_cmake
with:
opt_level: sve
env:
# Context: https://github.com/facebookresearch/faiss/wiki/Troubleshooting#surprising-faiss-openmp-and-openblas-interaction
OPENBLAS_NUM_THREADS: '1'
linux-x86_64-conda:
name: Linux x86_64 (conda)
needs: linux-x86_64-cmake
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
fetch-tags: true
- name: Build and Package (conda)
uses: ./.github/actions/build_conda
windows-x86_64-conda:
name: Windows x86_64 (conda)
needs: linux-x86_64-cmake
runs-on: windows-2019
steps:
- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
fetch-tags: true
- name: Build and Package (conda)
uses: ./.github/actions/build_conda
linux-arm64-conda:
name: Linux arm64 (conda)
needs: linux-x86_64-cmake
runs-on: 2-core-ubuntu-arm
steps:
- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
fetch-tags: true
- name: Build and Package (conda)
uses: ./.github/actions/build_conda

View File

@@ -0,0 +1,144 @@
on:
workflow_call:
secrets:
ANACONDA_API_TOKEN:
required: true
env:
OMP_NUM_THREADS: '10'
MKL_THREADING_LAYER: GNU
jobs:
linux-x86_64-packages:
name: Linux x86_64 packages
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
fetch-tags: true
- name: Build and Package (conda)
uses: ./.github/actions/build_conda
env:
ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }}
with:
label: main
linux-x86_64-GPU-packages-CUDA-11-4-4:
name: Linux x86_64 GPU packages (CUDA 11.4.4)
runs-on: 4-core-ubuntu-gpu-t4
env:
CUDA_ARCHS: "60-real;61-real;62-real;70-real;72-real;75-real;80;86-real"
FAISS_FLATTEN_CONDA_INCLUDES: "1"
steps:
- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
fetch-tags: true
- name: Build and Package (conda)
uses: ./.github/actions/build_conda
env:
ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }}
with:
label: main
cuda: "11.4.4"
linux-x86_64-GPU-CUVS-packages-CUDA11-8-0:
name: Linux x86_64 GPU w/ cuVS packages (CUDA 11.8.0)
runs-on: 4-core-ubuntu-gpu-t4
env:
CUDA_ARCHS: "70-real;72-real;75-real;80;86-real"
steps:
- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
fetch-tags: true
- name: Build and Package (conda)
uses: ./.github/actions/build_conda
env:
ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }}
with:
label: main
cuvs: "ON"
cuda: "11.8.0"
linux-x86_64-GPU-packages-CUDA-12-1-1:
name: Linux x86_64 GPU packages (CUDA 12.1.1)
runs-on: 4-core-ubuntu-gpu-t4
env:
CUDA_ARCHS: "70-real;72-real;75-real;80;86-real"
steps:
- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
fetch-tags: true
- name: Build and Package (conda)
uses: ./.github/actions/build_conda
env:
ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }}
with:
label: main
cuda: "12.1.1"
linux-x86_64-GPU-CUVS-packages-CUDA12-4-0:
name: Linux x86_64 GPU w/ cuVS packages (CUDA 12.4.0)
runs-on: 4-core-ubuntu-gpu-t4
env:
CUDA_ARCHS: "70-real;72-real;75-real;80;86-real"
steps:
- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
fetch-tags: true
- name: Build and Package (conda)
uses: ./.github/actions/build_conda
env:
ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }}
with:
label: main
cuvs: "ON"
cuda: "12.4.0"
windows-x86_64-packages:
name: Windows x86_64 packages
runs-on: windows-2019
steps:
- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
fetch-tags: true
- name: Build and Package (conda)
uses: ./.github/actions/build_conda
env:
ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }}
with:
label: main
osx-arm64-packages:
name: OSX arm64 packages
runs-on: macos-14
steps:
- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
fetch-tags: true
- name: Build and Package (conda)
uses: ./.github/actions/build_conda
env:
ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }}
with:
label: main
linux-arm64-packages:
name: Linux arm64 packages
runs-on: 2-core-ubuntu-arm
steps:
- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
fetch-tags: true
- name: Build and Package (conda)
uses: ./.github/actions/build_conda
env:
ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }}
with:
label: main

View File

@@ -0,0 +1,17 @@
name: Build
on:
workflow_dispatch:
pull_request:
branches:
- main
push:
tags:
- 'v*'
jobs:
build-pull-request:
uses: ./.github/workflows/build-pull-request.yml
build-release:
uses: ./.github/workflows/build-release.yml
secrets:
ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }}
if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v')

View File

@@ -0,0 +1,148 @@
name: Nightly
on:
schedule:
- cron: '10 6 * * *'
env:
OMP_NUM_THREADS: '10'
MKL_THREADING_LAYER: GNU
jobs:
linux-x86_64-nightly:
name: Linux x86_64 nightlies
runs-on: 4-core-ubuntu
steps:
- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
fetch-tags: true
- uses: ./.github/actions/build_conda
env:
ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }}
with:
label: nightly
linux-x86_64-GPU-CUDA-11-4-4-nightly:
name: Linux x86_64 GPU nightlies (CUDA 11.4.4)
runs-on: 4-core-ubuntu-gpu-t4
env:
CUDA_ARCHS: "60-real;61-real;62-real;70-real;72-real;75-real;80;86-real"
FAISS_FLATTEN_CONDA_INCLUDES: "1"
steps:
- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
fetch-tags: true
- uses: ./.github/actions/build_conda
env:
ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }}
with:
label: nightly
cuda: "11.4.4"
linux-x86_64-GPU-CUVS-CUDA11-8-0-nightly:
name: Linux x86_64 GPU w/ cuVS nightlies (CUDA 11.8.0)
runs-on: 4-core-ubuntu-gpu-t4
env:
CUDA_ARCHS: "70-real;72-real;75-real;80;86-real"
steps:
- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
fetch-tags: true
- uses: ./.github/actions/build_conda
env:
ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }}
with:
label: nightly
cuvs: "ON"
cuda: "11.8.0"
linux-x86_64-GPU-CUDA-12-1-1-nightly:
name: Linux x86_64 GPU nightlies (CUDA 12.1.1)
runs-on: 4-core-ubuntu-gpu-t4
env:
CUDA_ARCHS: "70-real;72-real;75-real;80;86-real"
steps:
- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
fetch-tags: true
- uses: ./.github/actions/build_conda
env:
ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }}
with:
label: nightly
cuda: "12.1.1"
linux-x86_64-GPU-CUVS-CUDA12-4-0-nightly:
name: Linux x86_64 GPU w/ cuVS nightlies (CUDA 12.4.0)
runs-on: 4-core-ubuntu-gpu-t4
env:
CUDA_ARCHS: "70-real;72-real;75-real;80;86-real"
steps:
- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
fetch-tags: true
- uses: ./.github/actions/build_conda
env:
ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }}
with:
label: nightly
cuvs: "ON"
cuda: "12.4.0"
windows-x86_64-nightly:
name: Windows x86_64 nightlies
runs-on: windows-2019
steps:
- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
fetch-tags: true
- uses: ./.github/actions/build_conda
env:
ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }}
with:
label: nightly
osx-arm64-nightly:
name: OSX arm64 nightlies
runs-on: macos-14
steps:
- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
fetch-tags: true
- uses: ./.github/actions/build_conda
env:
ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }}
with:
label: nightly
linux-arm64-nightly:
name: Linux arm64 nightlies
runs-on: 2-core-ubuntu-arm
steps:
- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
fetch-tags: true
- uses: ./.github/actions/build_conda
env:
ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }}
with:
label: nightly
auto-retry:
name: Auto retry on failure
if: fromJSON(github.run_attempt) < 2
runs-on: ubuntu-latest
steps:
- name: Start rerun workflow
env:
GH_REPO: ${{ github.repository }}
GH_TOKEN: ${{ github.token }}
GH_DEBUG: api
run: |
gh workflow run retry_build.yml \
-F run_id=${{ github.run_id }}

View File

@@ -0,0 +1,44 @@
name: Publish Docs
on:
page_build:
branches:
- gh-pages
paths-ignore:
- 'docs/**'
workflow_run:
workflows: [update-doxygen]
types:
- completed
jobs:
build_and_publish:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- name: Set up Python
uses: actions/setup-python@v2
with:
python-version: 3.8
- name: Checkout gh-pages
run: |
git fetch origin gh-pages
git checkout gh-pages
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt
- name: Generate html
run: |
make html
git rm -rf docs
mv _build/html docs
touch docs/.nojekyll
- name: Push changes
run: |
git config --global user.email "$GITHUB_ACTOR@users.noreply.github.com"
git config --global user.name "$GITHUB_ACTOR"
git add docs
if [ -n "$(git status --porcelain)" ]
then
git commit docs -m "Sphinx rebuild ($(git rev-parse --short gh-pages))."
git push origin gh-pages
fi

View File

@@ -0,0 +1,33 @@
name: Retry Build
on:
workflow_dispatch:
inputs:
run_id:
required: true
jobs:
rerun-on-failure:
permissions: write-all
runs-on: ubuntu-latest
steps:
- name: rerun ${{ inputs.run_id }}
env:
GH_REPO: ${{ github.repository }}
GH_TOKEN: ${{ github.token }}
GH_DEBUG: api
run: |
# status can be one of "queued", "in_progress", "completed", "waiting", "requested", "pending"
# https://docs.github.com/en/rest/checks/runs
# while not completed, sleep for 10 minutes
while gh run view ${{ inputs.run_id }} --json status | grep -v completed
do
echo Workflow in progress - sleeping for 10 minutes then checking again
sleep 10m
done
# Only retry if there are failed jobs
if gh run view ${{ inputs.run_id }} --exit-status; then
echo Workflow succeeded - no retry necessary.
else
echo Workflow failed - initiating retry.
gh run rerun ${{ inputs.run_id }} --failed
fi

View File

@@ -0,0 +1,40 @@
name: Update Doxygen
on:
push:
branches:
- main
paths:
- 'faiss/**'
jobs:
doxygen:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- name: Set up Python
uses: actions/setup-python@v2
with:
python-version: 3.8
- name: Install dependencies
run: |
sudo apt-get install -y doxygen
python -m pip install --upgrade pip
pip install breathe
- name: Generate doxygen xml
run: doxygen
- name: Push changes
run: |
git config --global user.email "$GITHUB_ACTOR@users.noreply.github.com"
git config --global user.name "$GITHUB_ACTOR"
mkdir ./tmp
mv xml ./tmp/xml
git fetch origin gh-pages
git checkout gh-pages
git rm -rf xml cpp_api
mv ./tmp/xml ./xml
breathe-apidoc -o cpp_api xml
git add xml cpp_api
if [ -n "$(git status --porcelain)" ]
then
git commit -m "Update API docs ($(git rev-parse --short main))."
git push origin gh-pages
fi

View File

@@ -0,0 +1,26 @@
*.swp
*.swo
*.o
*.a
*.dSYM
*.so
*.dylib
*.pyc
*~
/build/
/config.*
/aclocal.m4
/autom4te.cache/
/makefile.inc
/bin/
/c_api/bin/
/c_api/gpu/bin/
/tests/test
/tests/gtest/
faiss/python/swigfaiss_avx2.swig
faiss/python/swigfaiss_avx512.swig
faiss/python/swigfaiss_avx512_spr.swig
faiss/python/swigfaiss_sve.swig
.cache/
compile_commands.json
sift/

View File

@@ -0,0 +1,19 @@
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "Build Demo",
"type": "lldb",
"request": "launch",
"program": "${workspaceFolder}/../.venv/bin/python",
"console": "integratedTerminal",
"cwd": "${workspaceFolder}",
"args": [
"${workspaceFolder}/demo/build_demo.py"
],
},
]
}

View File

@@ -0,0 +1,482 @@
# Changelog
All notable changes to this project will be documented in this file.
## [Unreleased]
## [1.10.0] - 2025-01-30
### Added
- Add desc_name to dataset descriptor (#3935)
- implement ST_norm_from_LUT for the ResidualQuantizer (#3917)
- Add example of how to build, link, and test an external SWIG module (#3922)
- add copyright header (#3948)
- Add some SVE implementations (#3933)
- Enable linting: lint config changes plus arc lint command (#3966)
- Re-add example of how to build, link, and test an external SWIG module (#3981)
- demo: IndexPQ: separate codes from codebook (#3987)
- add all wrapped indexes to the index_read (#3988)
- add validity check AlignedTableTightAlloc clear method (#3997)
- Add index binary to telemetry (#4001)
- Add VectorTransform read from filename to the C API (#3970)
- Added IndexLSH to the demo (#4009)
- write distributed_kmeans centroids and assignments to hive tables (#4017)
- introduce data splits in dataset descriptor (#4012)
- Faiss GPU: bfloat16 brute-force kNN support (#4018)
- ROCm support for bfloat16 (#4039)
- Unit tests for distances_simd.cpp (#4058)
- add cuda-toolkit for GPU (#4057)
- Add more unit testing for IndexHNSW [1/n] (#4054)
- Add more unit testing for IndexHNSW [2/n] (#4056)
- Add more unit testing for HNSW [3/n] (#4059)
- Add more unit testing for HNSW [4/n] (#4061)
- Add more unit tests for index_read and index_write (#4068)
- Add testing for utils/hamming.cpp (#4079)
- Test sa_decode method on IndexIVFFlat (#4098)
- Conditionally compile extras like benchmarks and demos (#4094)
- Add a new architecture mode: 'avx512_spr'. (#4025)
- Use _mm512_popcnt_epi64 to speedup hamming distance evaluation. (#4020)
- PQ with pytorch (#4116)
- add range_search() to IndexRefine (#4022)
- Expose accumulate_to_mem from faiss interface (#4099)
- Windows Arm64 support (#4087)
- add test to cover GPU (#4130)
- Added support for building without MKL (#4147)
### Changed
- Move train, build and search to their respective operators (#3934)
- PQFS into Index trainer (#3941)
- Place a useful cmake function 'link_to_faiss_lib' into a separate file (#3939)
- Cache device major version value to avoid multiple calls of getCudaDeviceProperties (#3950)
- Consolidate set_target_properties() calls in faiss/CMakeLists.txt (#3973)
- Removing Manual Hipify Build Step (#3962)
- Allow to replace graph structure for NSG graphs (#3975)
- Adjust nightly build (#3978)
- Update RAFT CI with pytorch 2.4.1 (#3980)
- Moved add_sa_codes, sa_code_size to Index, IndexBinary base classes (#3989)
- Update autoclose.yml (#4000)
- Migrate from RAFT to CUVS (#3549)
- Pin to numpy<2 (#4033)
- (1/n) - Preload datasets in manifold so that subsequent stages of training, indexing and search can use those instead of each trainer or indexer downloading data. (#4034)
- Constrain conda version for Windows build (#4040)
- Updates to faiss-gpu-cuvs nightly pkg (#4032)
- pin the dependencies version for x86_64 (#4046)
- pin arm64 dependency (#4060)
- Pin conda build (#4062)
- Improve naming due to codemod (#4063)
- Improve naming due to codemod (#4064)
- Improve naming due to codemod (#4065)
- separate the github build into two conditions (#4066)
- Improve naming due to codemod (#4070)
- improve naming due to codemod (#4067)
- improve naming due to codemod (#4071)
- improve naming due to codemod (#4072)
- fix nightly build (#4080)
- Change github action workflows name (#4083)
- Resolve Packaging Issues (#4044)
- Update __init__.py (#4086)
- Exhaustive IVF probing in scalar quantizer tests (#4075)
- Pin Nightlies with testing on PR (#4088)
- Update benchmarking library code to work for IdMap index as well (#4093)
- Update action.yml (#4100)
- Upgrade CUVS to 24.12 (#4021)
- Link cuVS Docs (#4084)
- Set KnnDescriptor.desc_name in the Benchmarking core framework in FAISS like other descriptors (#4109)
- enable quiet mode for conda install (#4112)
- Disable retry build (#4124)
- Add ngpu default argument to knn_ground_truth (#4123)
- Update code comment to reflect the range of IF from [1, k] (#4139)
- Reenable auto retry workflow (#4140)
- Migration off defaults to conda-forge channel (#4126)
- Benchmarking Scripts for cuVS Index, more docs updates (#4117)
### Fixed
- Fix total_rows (#3942)
- Fix INSTALL.md due to failure of conflict resolving (#3915)
- Back out "Add example of how to build, link, and test an external SWIG module" (#3954)
- Fix shadowed variable in faiss/IndexPQ.cpp (#3959)
- Fix shadowed variable in faiss/IndexIVFAdditiveQuantizer.cpp (#3958)
- Fix shadowed variable in faiss/impl/HNSW.cpp (#3961)
- Fix shadowed variable in faiss/impl/simd_result_handlers.h (#3960)
- Fix shadowed variable in faiss/utils/NeuralNet.cpp (#3952)
- Resolve "incorrect-portions-license" errors: add no license lint to top of GPU files with both licenses (#3965)
- Resolve "duplicate-license-header": Find and replace duplicate license headers (#3967)
- fix some more nvidia licenses that get erased (#3977)
- fix merge_flat_ondisk stress run failures (#3999)
- Fix reverse_index_factory formatting of ScalarQuantizers (#4003)
- Fix shadowed variable in faiss/IndexAdditiveQuantizer.cpp (#4011)
- facebook-unused-include-check in fbcode/faiss (#4029)
- fix linter (#4035)
- Some chore fixes (#4010)
- Fix unused variable compilation error (#4041)
- stop dealloc of coarse quantizer when it is deleted (#4045)
- Fix SCD Table test flakiness (#4069)
- Fix IndexIVFFastScan reconstruct_from_offset method (#4095)
- more fast-scan reconstruction (#4128)
- Fix nightly cuVS 11.8.0 failure (#4149)
- Correct capitalization of FAISS to Faiss (#4155)
- Fix cuVS 12.4.0 nightly failure (#4153)
### Deprecated
- Remove unused-variable in dumbo/backup/dumbo/service/tests/ChainReplicatorTests.cpp (#4024)
- remove inconsistent oom exception test (#4052)
- Remove unused(and wrong) io macro (#4122)
## [1.9.0] - 2024-10-04
### Added
- Add AVX-512 implementation for the distance and scalar quantizer functions. (#3853)
- Allow k and M suffixes in IVF indexes (#3812)
- add reconstruct support to additive quantizers (#3752)
- introduce options for reducing the overhead for a clustering procedure (#3731)
- Add hnsw search params for bounded queue option (#3748)
- ROCm support (#3462)
- Add sve targets (#2886)
- add get_version() for c_api (#3688)
- QINCo implementation in CPU Faiss (#3608)
- Add search functionality to FlatCodes (#3611)
- add dispatcher for VectorDistance and ResultHandlers (#3627)
- Add SQ8bit signed quantization (#3501)
- Add ABS_INNER_PRODUCT metric (#3524)
- Interop between CAGRA and HNSW (#3252)
- add skip_storage flag to HNSW (#3487)
- QT_bf16 for scalar quantizer for bfloat16 (#3444)
- Implement METRIC.NaNEuclidean (#3414)
- TimeoutCallback C++ and Python (#3417)
- support big-endian machines (#3361)
- Support for Remove ids from IVFPQFastScan index (#3354)
- Implement reconstruct_n for GPU IVFFlat indexes (#3338)
- Support of skip_ids in merge_from_multiple function of OnDiskInvertedLists (#3327)
- Add the ability to clone and read binary indexes to the C API. (#3318)
- AVX512 for PQFastScan (#3276)
### Changed
- faster hnsw CPU index training (#3822)
- Some small improvements. (#3692)
- First attempt at LSH matching with nbits (#3679)
- Set verbose before train (#3619)
- Remove duplicate NegativeDistanceComputer instances (#3450)
- interrupt for NNDescent (#3432)
- Get rid of redundant instructions in ScalarQuantizer (#3430)
- PowerPC, improve code generation for function fvec_L2sqr (#3416)
- Unroll loop in lookup_2_lanes (#3364)
- Improve filtering & search parameters propagation (#3304)
- Change index_cpu_to_gpu to throw for indices not implemented on GPU (#3336)
- Throw when attempting to move IndexPQ to GPU (#3328)
- Skip HNSWPQ sdc init with new io flag (#3250)
### Fixed
- Fix a bug for a non-simdlib code of ResidualQuantizer (#3868)
- assign_index should default to null (#3855)
- Fix an incorrectly counted number of computed distances for HNSW (#3840)
- Add error for overflowing nbits during PQ construction (#3833)
- Fix radius search with HNSW and IP (#3698)
- fix algorithm of spreading vectors over shards (#3374)
- Fix IndexBinary.assign Python method (#3384)
- Few fixes in bench_fw to enable IndexFromCodec (#3383)
- Fix the endianness issue in AIX while running the benchmark. (#3345)
- Fix faiss swig build with version > 4.2.x (#3315)
- Fix problems when using 64-bit integers. (#3322)
- Fix IVFPQFastScan decode function (#3312)
- Handling FaissException in few destructors of ResultHandler.h (#3311)
- Fix HNSW stats (#3309)
- AIX compilation fix for io classes (#3275)
## [1.8.0] - 2024-02-27
### Added
- Added a new conda package faiss-gpu-raft alongside faiss-cpu and faiss-gpu
- Integrated IVF-Flat and IVF-PQ implementations in faiss-gpu-raft from RAFT by Nvidia [thanks Corey Nolet and Tarang Jain]
- Added a context parameter to InvertedLists and InvertedListsIterator
- Added Faiss on Rocksdb demo showing how inverted lists can be persisted in a key-value store
- Introduced Offline IVF framework powered by Faiss big batch search
- Added SIMD NEON Optimization for QT_FP16 in Scalar Quantizer. [thanks Naveen Tatikonda]
- Generalized ResultHandler and supported range search for HNSW and FastScan
- Introduced avx512 optimization mode and FAISS_OPT_LEVEL env variable [thanks Alexandr Ghuzva]
- Added search parameters for IndexRefine::search() and IndexRefineFlat::search()
- Supported large two-level clustering
- Added support for Python 3.11 and 3.12
- Added support for CUDA 12
### Changed
- Used the benchmark to find Pareto optimal indices. Intentionally limited to IVF(Flat|HNSW),PQ|SQ indices
- Split off RQ encoding steps to another file
- Supported better NaN handling
- HNSW speedup + Distance 4 points [thanks Alexandr Ghuzva]
### Fixed
- Fixed DeviceVector reallocations in Faiss GPU
- Used efSearch from params if provided in HNSW search
- Fixed warp synchronous behavior in Faiss GPU CUDA 12
## [1.7.4] - 2023-04-12
### Added
- Added big batch IVF search for conducting efficient search with big batches of queries
- Checkpointing in big batch search support
- Precomputed centroids support
- Support for iterable inverted lists for eg. key value stores
- 64-bit indexing arithmetic support in FAISS GPU
- IndexIVFShards now handle IVF indexes with a common quantizer
- Jaccard distance support
- CodePacker for non-contiguous code layouts
- Approximate evaluation of top-k distances for ResidualQuantizer and IndexBinaryFlat
- Added support for 12-bit PQ / IVFPQ fine quantizer decoders for standalone vector codecs (faiss/cppcontrib)
- Conda packages for osx-arm64 (Apple M1) and linux-aarch64 (ARM64) architectures
- Support for Python 3.10
### Removed
- CUDA 10 is no longer supported in precompiled packages
- Removed Python 3.7 support for precompiled packages
- Removed constraint for using fine quantizer with no greater than 8 bits for IVFPQ; for example, it is now possible to use IVF256,PQ10x12 for a CPU index
### Changed
- Various performance optimizations for PQ / IVFPQ for AVX2 and ARM for training (fused distance+nearest kernel), search (faster kernels for distance_to_code() and scan_list_*()) and vector encoding
- An order of magnitude faster CPU code for LSQ/PLSQ training and vector encoding (reworked code)
- Performance improvements for Hamming Code computations for AVX2 and ARM (reworked code)
- Improved auto-vectorization support for IP and L2 distance computations (better handling of pragmas)
- Improved ResidualQuantizer vector encoding (pooling memory allocations, avoid r/w to a temporary buffer)
### Fixed
- HNSW bug fixed, which improves the recall rate! Special thanks to zh Wang @hhy3 for this.
- Faiss GPU IVF large query batch fix
- Faiss + Torch fixes, re-enable k = 2048
- Fix the number of distance computations to match max_codes parameter
- Fix decoding of large fast_scan blocks
## [1.7.3] - 2022-11-03
### Added
- Added sparse k-means routines and moved the generic kmeans to contrib
- Added FlatDistanceComputer for all FlatCodes indexes
- Support for fast accumulation of 4-bit LSQ and RQ
- Added product additive quantization
- Support per-query search parameters for many indexes + filtering by ids
- write_VectorTransform and read_vectorTransform were added to the public API (by @AbdelrahmanElmeniawy)
- Support for IDMap2 in index_factory by adding "IDMap2" to prefix or suffix of the input String (by @AbdelrahmanElmeniawy)
- Support for merging all IndexFlatCodes descendants (by @AbdelrahmanElmeniawy)
- Remove and merge features for IndexFastScan (by @AbdelrahmanElmeniawy)
- Performance improvements: 1) specialized the AVX2 pieces of code speeding up certain hotspots, 2) specialized kernels for vector codecs (this can be found in faiss/cppcontrib)
### Fixed
- Fixed memory leak in OnDiskInvertedLists::do_mmap when the file is not closed (by @AbdelrahmanElmeniawy)
- LSH correctly throws error for metric types other than METRIC_L2 (by @AbdelrahmanElmeniawy)
## [1.7.2] - 2021-12-15
### Added
- Support LSQ on GPU (by @KinglittleQ)
- Support for exact 1D kmeans (by @KinglittleQ)
## [1.7.1] - 2021-05-27
### Added
- Support for building C bindings through the `FAISS_ENABLE_C_API` CMake option.
- Serializing the indexes with the python pickle module
- Support for the NNDescent k-NN graph building method (by @KinglittleQ)
- Support for the NSG graph indexing method (by @KinglittleQ)
- Residual quantizers: support as codec and unoptimized search
- Support for 4-bit PQ implementation for ARM (by @vorj, @n-miyamoto-fixstars, @LWisteria, and @matsui528)
- Implementation of Local Search Quantization (by @KinglittleQ)
### Changed
- The order of xb and xq was different between `faiss.knn` and `faiss.knn_gpu`.
Also the metric argument was called distance_type.
- The typed vectors (LongVector, LongLongVector, etc.) of the SWIG interface have
been deprecated. They have been replaced with Int32Vector, Int64Vector, etc. (by h-vetinari)
### Fixed
- Fixed a bug causing kNN search functions for IndexBinaryHash and
IndexBinaryMultiHash to return results in a random order.
- Copy constructor of AlignedTable had a bug leading to crashes when cloning
IVFPQ indices.
## [1.7.0] - 2021-01-27
## [1.6.5] - 2020-11-22
## [1.6.4] - 2020-10-12
### Added
- Arbitrary dimensions per sub-quantizer now allowed for `GpuIndexIVFPQ`.
- Brute-force kNN on GPU (`bfKnn`) now accepts `int32` indices.
- Nightly conda builds now available (for CPU).
- Faiss is now supported on Windows.
## [1.6.3] - 2020-03-24
### Added
- Support alternative distances on GPU for GpuIndexFlat, including L1, Linf and
Lp metrics.
- Support METRIC_INNER_PRODUCT for GpuIndexIVFPQ.
- Support float16 coarse quantizer for GpuIndexIVFFlat and GpuIndexIVFPQ. GPU
Tensor Core operations (mixed-precision arithmetic) are enabled on supported
hardware when operating with float16 data.
- Support k-means clustering with encoded vectors. This makes it possible to
train on larger datasets without decompressing them in RAM, and is especially
useful for binary datasets (see https://github.com/facebookresearch/faiss/blob/main/tests/test_build_blocks.py#L92).
- Support weighted k-means. Weights can be associated to each training point
(see https://github.com/facebookresearch/faiss/blob/main/tests/test_build_blocks.py).
- Serialize callback in python, to write to pipes or sockets (see
https://github.com/facebookresearch/faiss/wiki/Index-IO,-cloning-and-hyper-parameter-tuning).
- Reconstruct arbitrary ids from IndexIVF + efficient remove of a small number
of ids. This avoids 2 inefficiencies: O(ntotal) removal of vectors and
IndexIDMap2 on top of indexIVF. Documentation here:
https://github.com/facebookresearch/faiss/wiki/Special-operations-on-indexes.
- Support inner product as a metric in IndexHNSW (see
https://github.com/facebookresearch/faiss/blob/main/tests/test_index.py#L490).
- Support PQ of sizes other than 8 bit in IndexIVFPQ.
- Demo on how to perform searches sequentially on an IVF index. This is useful
for an OnDisk index with a very large batch of queries. In that case, it is
worthwhile to scan the index sequentially (see
https://github.com/facebookresearch/faiss/blob/main/tests/test_ivflib.py#L62).
- Range search support for most binary indexes.
- Support for hashing-based binary indexes (see
https://github.com/facebookresearch/faiss/wiki/Binary-indexes).
### Changed
- Replaced obj table in Clustering object: now it is a ClusteringIterationStats
structure that contains additional statistics.
### Removed
- Removed support for useFloat16Accumulator for accumulators on GPU (all
accumulations are now done in float32, regardless of whether float16 or float32
input data is used).
### Fixed
- Some python3 fixes in benchmarks.
- Fixed GpuCloner (some fields were not copied, default to no precomputed tables
with IndexIVFPQ).
- Fixed support for new pytorch versions.
- Serialization bug with alternative distances.
- Removed test on multiple-of-4 dimensions when switching between blas and AVX
implementations.
## [1.6.2] - 2020-03-10
## [1.6.1] - 2019-12-04
## [1.6.0] - 2019-09-24
### Added
- Faiss as a codec: We introduce a new API within Faiss to encode fixed-size
vectors into fixed-size codes. The encoding is lossy and the tradeoff between
compression and reconstruction accuracy can be adjusted (see the sketch after this list).
- ScalarQuantizer support for GPU, see gpu/GpuIndexIVFScalarQuantizer.h. This is
particularly useful as GPU memory is often less abundant than CPU.
- Added easy-to-use serialization functions for indexes to byte arrays in Python
(faiss.serialize_index, faiss.deserialize_index).
- The Python KMeans object can be used to use the GPU directly, just add
gpu=True to the constructor; see gpu/test/test_gpu_index.py, test TestGPUKmeans.
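A minimal sketch of the codec API from the first entry above, assuming `faiss` and `numpy` are installed (the `PQ8` factory string and all sizes are illustrative, not the only choices):
``` python
import faiss
import numpy as np

d = 64
xt = np.random.rand(10000, d).astype("float32")  # training vectors
x = np.random.rand(100, d).astype("float32")     # vectors to encode

codec = faiss.index_factory(d, "PQ8")  # 8 bytes per vector after encoding
codec.train(xt)

codes = codec.sa_encode(x)      # lossy fixed-size codes, shape (100, 8)
x_rec = codec.sa_decode(codes)  # approximate reconstruction of x
```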
### Changed
- Change in the code layout: many C++ sources are now in subdirectories impl/
and utils/.
## [1.5.3] - 2019-06-24
### Added
- Basic support for 6 new metrics in CPU IndexFlat and IndexHNSW (https://github.com/facebookresearch/faiss/issues/848).
- Support for IndexIDMap/IndexIDMap2 with binary indexes (https://github.com/facebookresearch/faiss/issues/780).
### Changed
- Throw python exception for OOM (https://github.com/facebookresearch/faiss/issues/758).
- Make DistanceComputer available for all random access indexes.
- Gradually moving from long to uint64_t for portability.
### Fixed
- Slow scanning of inverted lists (https://github.com/facebookresearch/faiss/issues/836).
## [1.5.2] - 2019-05-28
### Added
- Support for searching several inverted lists in parallel (parallel_mode != 0).
- Better support for PQ codes where nbit != 8 or 16.
- IVFSpectralHash implementation: spectral hash codes inside an IVF.
- 6-bit per component scalar quantizer (4 and 8 bit were already supported).
- Combinations of inverted lists: HStackInvertedLists and VStackInvertedLists.
- Configurable number of threads for OnDiskInvertedLists prefetching (including
0=no prefetch).
- More test and demo code compatible with Python 3 (print with parentheses).
### Changed
- License was changed from BSD+Patents to MIT.
- Exceptions raised in sub-indexes of IndexShards and IndexReplicas are now
propagated.
- Refactored benchmark code: data loading is now in a single file.
## [1.5.1] - 2019-04-05
### Added
- MatrixStats object, which reports useful statistics about a dataset.
- Option to round coordinates during k-means optimization.
- An alternative option for search in HNSW.
- Support for range search in IVFScalarQuantizer.
- Support for direct uint_8 codec in ScalarQuantizer.
- Better support for PQ code assignment with external index.
- Support for IMI2x16 (4B virtual centroids).
- Support for k = 2048 search on GPU (instead of 1024).
- Support for renaming an ondisk invertedlists.
- Support for interrupting computations with interrupt signal (ctrl-C) in python.
- Simplified build system (with --with-cuda/--with-cuda-arch options).
### Changed
- Moved stats() and imbalance_factor() from IndexIVF to InvertedLists object.
- Renamed IndexProxy to IndexReplicas.
- Most CUDA mem alloc failures now throw exceptions instead of terminating on an
assertion.
- Updated example Dockerfile.
- Conda packages now depend on the cudatoolkit packages, which fixes some
interferences with pytorch. Consequentially, faiss-gpu should now be installed
by conda install -c pytorch faiss-gpu cudatoolkit=10.0.
## [1.5.0] - 2018-12-19
### Added
- New GpuIndexBinaryFlat index.
- New IndexBinaryHNSW index.
## [1.4.0] - 2018-08-30
### Added
- Automatic tracking of C++ references in Python.
- Support for non-intel platforms, some functions optimized for ARM.
- Support for overriding nprobe for concurrent searches.
- Support for floating-point quantizers in binary indices.
### Fixed
- No more segfaults due to Python's GC.
- GpuIndexIVFFlat issues for float32 with 64 / 128 dims.
- Sharding of flat indexes on GPU with index_cpu_to_gpu_multiple.
## [1.3.0] - 2018-07-10
### Added
- Support for binary indexes (IndexBinaryFlat, IndexBinaryIVF).
- Support fp16 encoding in scalar quantizer.
- Support for deduplication in IndexIVFFlat.
- Support for index serialization.
### Fixed
- MMAP bug for normal indices.
- Propagation of io_flags in read func.
- k-selection for CUDA 9.
- Race condition in OnDiskInvertedLists.
## [1.2.1] - 2018-02-28
### Added
- Support for on-disk storage of IndexIVF data.
- C bindings.
- Extended tutorial to GPU indices.
[Unreleased]: https://github.com/facebookresearch/faiss/compare/v1.10.0...HEAD
[1.10.0]: https://github.com/facebookresearch/faiss/compare/v1.9.0...v1.10.0
[1.9.0]: https://github.com/facebookresearch/faiss/compare/v1.8.0...v1.9.0
[1.8.0]: https://github.com/facebookresearch/faiss/compare/v1.7.4...v1.8.0
[1.7.4]: https://github.com/facebookresearch/faiss/compare/v1.7.3...v1.7.4
[1.7.3]: https://github.com/facebookresearch/faiss/compare/v1.7.2...v1.7.3
[1.7.2]: https://github.com/facebookresearch/faiss/compare/v1.7.1...v1.7.2
[1.7.1]: https://github.com/facebookresearch/faiss/compare/v1.7.0...v1.7.1
[1.7.0]: https://github.com/facebookresearch/faiss/compare/v1.6.5...v1.7.0
[1.6.5]: https://github.com/facebookresearch/faiss/compare/v1.6.4...v1.6.5
[1.6.4]: https://github.com/facebookresearch/faiss/compare/v1.6.3...v1.6.4
[1.6.3]: https://github.com/facebookresearch/faiss/compare/v1.6.2...v1.6.3
[1.6.2]: https://github.com/facebookresearch/faiss/compare/v1.6.1...v1.6.2
[1.6.1]: https://github.com/facebookresearch/faiss/compare/v1.6.0...v1.6.1
[1.6.0]: https://github.com/facebookresearch/faiss/compare/v1.5.3...v1.6.0
[1.5.3]: https://github.com/facebookresearch/faiss/compare/v1.5.2...v1.5.3
[1.5.2]: https://github.com/facebookresearch/faiss/compare/v1.5.1...v1.5.2
[1.5.1]: https://github.com/facebookresearch/faiss/compare/v1.5.0...v1.5.1
[1.5.0]: https://github.com/facebookresearch/faiss/compare/v1.4.0...v1.5.0
[1.4.0]: https://github.com/facebookresearch/faiss/compare/v1.3.0...v1.4.0
[1.3.0]: https://github.com/facebookresearch/faiss/compare/v1.2.1...v1.3.0
[1.2.1]: https://github.com/facebookresearch/faiss/releases/tag/v1.2.1

View File

@@ -0,0 +1,126 @@
# @lint-ignore-every LICENSELINT
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
# =============================================================================
# Copyright (c) 2023-2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
# in compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under the License
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
# or implied. See the License for the specific language governing permissions and limitations under
# the License.
# =============================================================================
cmake_minimum_required(VERSION 3.24.0 FATAL_ERROR)
set(FAISS_LANGUAGES CXX)
if(FAISS_ENABLE_GPU)
if (FAISS_ENABLE_ROCM)
list(APPEND FAISS_LANGUAGES HIP)
list(PREPEND CMAKE_MODULE_PATH "/opt/rocm/lib/cmake")
list(PREPEND CMAKE_PREFIX_PATH "/opt/rocm")
else()
list(APPEND FAISS_LANGUAGES CUDA)
endif()
endif()
if(FAISS_ENABLE_CUVS)
include(cmake/thirdparty/fetch_rapids.cmake)
include(rapids-cmake)
include(rapids-cpm)
include(rapids-cuda)
include(rapids-export)
include(rapids-find)
rapids_cuda_init_architectures(faiss)
rapids_cuda_init_architectures(pyfaiss)
rapids_cuda_init_architectures(faiss_c_library)
endif()
project(faiss
VERSION 1.10.0
DESCRIPTION "A library for efficient similarity search and clustering of dense vectors."
HOMEPAGE_URL "https://github.com/facebookresearch/faiss"
LANGUAGES ${FAISS_LANGUAGES})
include(GNUInstallDirs)
set(CMAKE_CXX_STANDARD 17)
list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")
# Valid values are "generic", "avx2", "avx512", "avx512_spr", "sve".
option(FAISS_OPT_LEVEL "" "generic")
option(FAISS_ENABLE_GPU "Enable support for GPU indexes." ON)
option(FAISS_ENABLE_CUVS "Enable cuVS for GPU indexes." OFF)
option(FAISS_ENABLE_ROCM "Enable ROCm for GPU indexes." OFF)
option(FAISS_ENABLE_MKL "Enable MKL." ON)
option(FAISS_ENABLE_PYTHON "Build Python extension." ON)
option(FAISS_ENABLE_C_API "Build C API." OFF)
option(FAISS_ENABLE_EXTRAS "Build extras like benchmarks and demos" ON)
option(FAISS_USE_LTO "Enable Link-Time optimization" OFF)
if(FAISS_ENABLE_GPU)
if(FAISS_ENABLE_ROCM)
enable_language(HIP)
add_definitions(-DUSE_AMD_ROCM)
find_package(HIP REQUIRED)
find_package(hipBLAS REQUIRED)
set(GPU_EXT_PREFIX "hip")
execute_process(COMMAND ${PROJECT_SOURCE_DIR}/faiss/gpu/hipify.sh)
else ()
set(CMAKE_CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER})
enable_language(CUDA)
set(GPU_EXT_PREFIX "cu")
endif()
endif()
if(FAISS_ENABLE_CUVS AND NOT TARGET cuvs::cuvs)
find_package(cuvs)
endif()
add_subdirectory(faiss)
if(FAISS_ENABLE_GPU)
if(FAISS_ENABLE_ROCM)
add_subdirectory(faiss/gpu-rocm)
else()
add_subdirectory(faiss/gpu)
endif()
endif()
if(FAISS_ENABLE_PYTHON)
add_subdirectory(faiss/python)
endif()
if(FAISS_ENABLE_C_API)
add_subdirectory(c_api)
endif()
if(FAISS_ENABLE_EXTRAS)
add_subdirectory(demos)
add_subdirectory(benchs)
add_subdirectory(tutorial/cpp)
endif()
# CTest must be included in the top level to enable `make test` target.
include(CTest)
if(BUILD_TESTING)
add_subdirectory(tests)
add_subdirectory(perf_tests)
if(FAISS_ENABLE_GPU)
if(FAISS_ENABLE_ROCM)
add_subdirectory(faiss/gpu-rocm/test)
else()
add_subdirectory(faiss/gpu/test)
endif()
endif()
endif()

View File

@@ -0,0 +1,2 @@
# Code of Conduct
Facebook has adopted a Code of Conduct that we expect project participants to adhere to. Please [read the full text](https://code.fb.com/codeofconduct) so that you can understand what actions will and will not be tolerated.

View File

@@ -0,0 +1,52 @@
# Contributing to Faiss
We want to make contributing to this project as easy and transparent as
possible.
## Our Development Process
We mainly develop Faiss within Facebook. Sometimes, we will sync the
github version of Faiss with the internal state.
## Pull Requests
We welcome pull requests that add significant value to Faiss. If you plan to do
a major development and contribute it back to Faiss, please contact us first before
putting too much effort into it.
1. Fork the repo and create your branch from `main`.
2. If you've added code that should be tested, add tests.
3. If you've changed APIs, update the documentation.
4. Ensure the test suite passes.
5. Make sure your code lints.
6. If you haven't already, complete the Contributor License Agreement ("CLA").
There is a Facebook internal test suite for Faiss, and we need to run
all changes to Faiss through it.
## Contributor License Agreement ("CLA")
In order to accept your pull request, we need you to submit a CLA. You only need
to do this once to work on any of Facebook's open source projects.
Complete your CLA here: <https://code.facebook.com/cla>
## Issues
We use GitHub issues to track public bugs. Please ensure your description is
clear and has sufficient instructions to be able to reproduce the issue.
Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe
disclosure of security bugs. In those cases, please go through the process
outlined on that page and do not file a public issue.
## Coding Style
* 4 spaces for indentation in C++ (no tabs)
* 80 character line length (both for C++ and Python)
* C++ language level: C++17
## License
By contributing to Faiss, you agree that your contributions will be licensed
under the LICENSE file in the root directory of this source tree.

View File

File diff suppressed because it is too large

View File

@@ -0,0 +1,325 @@
# Installing Faiss via conda
The supported way to install Faiss is through [conda](https://docs.conda.io).
Stable releases are pushed regularly to the pytorch conda channel, as well as
pre-release nightly builds.
- The CPU-only faiss-cpu conda package is currently available on Linux (x86-64 and aarch64), OSX (arm64 only), and Windows (x86-64)
- faiss-gpu, containing both CPU and GPU indices, is available on Linux (x86-64 only) for CUDA 11.4 and 12.1
- faiss-gpu-cuvs [^1], a package containing GPU indices provided by [NVIDIA cuVS](https://github.com/rapidsai/cuvs/) version 24.12, is available on Linux (x86-64 only) for CUDA 11.8 and 12.4.
To install the latest stable release:
``` shell
# CPU-only version
$ conda install -c pytorch faiss-cpu=1.10.0
# GPU(+CPU) version
$ conda install -c pytorch -c nvidia faiss-gpu=1.10.0
# GPU(+CPU) version with NVIDIA cuVS
$ conda install -c pytorch -c nvidia -c rapidsai -c conda-forge libnvjitlink faiss-gpu-cuvs=1.10.0
# GPU(+CPU) version using AMD ROCm not yet available
```
For faiss-gpu, the nvidia channel is required for CUDA, which is not published in the main anaconda channel.
For faiss-gpu-cuvs, the rapidsai, conda-forge and nvidia channels are required.
Nightly pre-release packages can be installed as follows:
``` shell
# CPU-only version
$ conda install -c pytorch/label/nightly faiss-cpu
# GPU(+CPU) version
$ conda install -c pytorch/label/nightly -c nvidia faiss-gpu=1.10.0
# GPU(+CPU) version with NVIDIA cuVS (package built with CUDA 12.4)
$ conda install -c pytorch -c rapidsai -c conda-forge -c nvidia pytorch/label/nightly::faiss-gpu-cuvs 'cuda-version>=12.0,<=12.5'
# GPU(+CPU) version with NVIDIA cuVS (package built with CUDA 11.8)
$ conda install -c pytorch -c rapidsai -c conda-forge -c nvidia pytorch/label/nightly::faiss-gpu-cuvs 'cuda-version>=11.4,<=11.8'
# GPU(+CPU) version using AMD ROCm not yet available
```
In the above commands, pytorch-cuda=11 or pytorch-cuda=12 would select a specific CUDA version, if it's required.
A combination of versions that installs GPU Faiss with CUDA and Pytorch (as of 2024-05-15):
```
conda create --name faiss_1.8.0
conda activate faiss_1.8.0
conda install -c pytorch -c nvidia faiss-gpu=1.8.0 pytorch=*=*cuda* pytorch-cuda=11 numpy
```
## Installing from conda-forge
Faiss is also being packaged by [conda-forge](https://conda-forge.org/), the
community-driven packaging ecosystem for conda. The packaging effort is
collaborating with the Faiss team to ensure high-quality package builds.
Due to the comprehensive infrastructure of conda-forge, it may even happen that
certain build combinations are supported in conda-forge that are not available
through the pytorch channel. To install, use
``` shell
# CPU version
$ conda install -c conda-forge faiss-cpu
# GPU version
$ conda install -c conda-forge faiss-gpu
# NVIDIA cuVS and AMD ROCm version not yet available
```
You can tell which channel your conda packages come from by using `conda list`.
If you are having problems using a package built by conda-forge, please raise
an [issue](https://github.com/conda-forge/faiss-split-feedstock/issues) on the
conda-forge package "feedstock".
# Building from source
Faiss can be built from source using CMake.
Faiss is supported on x86-64 machines on Linux, OSX, and Windows. It has been
found to run on other platforms as well, see
[other platforms](https://github.com/facebookresearch/faiss/wiki/Related-projects#bindings-to-other-languages-and-porting-to-other-platforms).
The basic requirements are:
- a C++17 compiler (with OpenMP support, version 2 or higher),
- a BLAS implementation (on Intel machines we strongly recommend using Intel MKL for best
performance).
The optional requirements are:
- for GPU indices:
- nvcc,
- the CUDA toolkit,
- for AMD GPUs:
- AMD ROCm,
- for using NVIDIA cuVS implementations:
- libcuvs=24.12
- for the python bindings:
- python 3,
- numpy,
- and swig.
Indications for specific configurations are available in the [troubleshooting
section of the wiki](https://github.com/facebookresearch/faiss/wiki/Troubleshooting).
### Building with NVIDIA cuVS
[cuVS](https://docs.rapids.ai/api/cuvs/nightly/) contains state-of-the-art implementations of several algorithms for running approximate nearest neighbors and clustering on the GPU. It is built on top of the [RAPIDS RAFT](https://github.com/rapidsai/raft) library of high performance machine learning primitives. Building Faiss with cuVS enabled allows a user to choose between regular GPU implementations in Faiss and cuVS implementations for specific algorithms.
The libcuvs dependency should be installed via conda:
1. With CUDA 12.0 - 12.5:
```
conda install -c rapidsai -c conda-forge -c nvidia libcuvs=24.12 'cuda-version>=12.0,<=12.5'
```
2. With CUDA 11.4 - 11.8
```
conda install -c rapidsai -c conda-forge -c nvidia libcuvs=24.12 'cuda-version>=11.4,<=11.8'
```
For more ways to install cuVS 24.12, refer to the [RAPIDS Installation Guide](https://docs.rapids.ai/install).
## Step 1: invoking CMake
``` shell
$ cmake -B build .
```
This generates the system-dependent configuration/build files in the `build/`
subdirectory.
Several options can be passed to CMake, among which:
- general options:
- `-DFAISS_ENABLE_GPU=OFF` in order to disable building GPU indices (possible
values are `ON` and `OFF`),
- `-DFAISS_ENABLE_PYTHON=OFF` in order to disable building python bindings
(possible values are `ON` and `OFF`),
- `-DFAISS_ENABLE_CUVS=ON` in order to use the NVIDIA cuVS implementations
of the IVF-Flat, IVF-PQ and [CAGRA](https://arxiv.org/pdf/2308.15136) GPU-accelerated indices (default is `OFF`, possible values are `ON` and `OFF`).
Note: `-DFAISS_ENABLE_GPU` must be set to `ON` when enabling this option.
- `-DBUILD_TESTING=OFF` in order to disable building C++ tests,
- `-DBUILD_SHARED_LIBS=ON` in order to build a shared library (possible values
are `ON` and `OFF`),
- `-DFAISS_ENABLE_C_API=ON` in order to enable building [C API](c_api/INSTALL.md) (possible values
are `ON` and `OFF`),
- optimization-related options:
- `-DCMAKE_BUILD_TYPE=Release` in order to enable generic compiler
optimization options (enables `-O3` on gcc for instance),
- `-DFAISS_OPT_LEVEL=avx2` in order to enable the required compiler flags to
generate code using optimized SIMD/Vector instructions. Possible values are below:
- On x86-64, `generic`, `avx2`, `avx512`, and `avx512_spr` (for avx512 features available since Intel(R) Sapphire Rapids), by increasing order of optimization,
- On aarch64, `generic` and `sve`, by increasing order of optimization,
- `-DFAISS_USE_LTO=ON` in order to enable [Link-Time Optimization](https://en.wikipedia.org/wiki/Link-time_optimization) (default is `OFF`, possible values are `ON` and `OFF`).
- BLAS-related options:
- `-DBLA_VENDOR=Intel10_64_dyn -DMKL_LIBRARIES=/path/to/mkl/libs` to use the
Intel MKL BLAS implementation, which is significantly faster than OpenBLAS
(more information about the values for the `BLA_VENDOR` option can be found in
the [CMake docs](https://cmake.org/cmake/help/latest/module/FindBLAS.html)),
- GPU-related options:
- `-DCUDAToolkit_ROOT=/path/to/cuda-10.1` in order to hint to the path of
the CUDA toolkit (for more information, see
[CMake docs](https://cmake.org/cmake/help/latest/module/FindCUDAToolkit.html)),
- `-DCMAKE_CUDA_ARCHITECTURES="75;72"` for specifying which GPU architectures
to build against (see [CUDA docs](https://developer.nvidia.com/cuda-gpus) to
determine which architecture(s) you should pick),
- `-DFAISS_ENABLE_ROCM=ON` in order to enable building GPU indices for AMD GPUs.
`-DFAISS_ENABLE_GPU` must be `ON` when using this option. (possible values are `ON` and `OFF`),
- python-related options:
- `-DPython_EXECUTABLE=/path/to/python3.7` in order to build a python
interface for a different python than the default one (see
[CMake docs](https://cmake.org/cmake/help/latest/module/FindPython.html)).
## Step 2: Invoking Make
``` shell
$ make -C build -j faiss
```
This builds the C++ library (`libfaiss.a` by default, and `libfaiss.so` if
`-DBUILD_SHARED_LIBS=ON` was passed to CMake).
The `-j` option enables parallel compilation of multiple units, leading to a
faster build, but increasing the chances of running out of memory, in which case
it is recommended to set the `-j` option to a fixed value (such as `-j4`).
If making use of optimization options, build the correct target before swigfaiss.
For AVX2:
``` shell
$ make -C build -j faiss_avx2
```
For AVX512:
``` shell
$ make -C build -j faiss_avx512
```
For AVX512 features available since Intel(R) Sapphire Rapids.
``` shell
$ make -C build -j faiss_avx512_spr
```
This will ensure the creation of necessary files when building and installing the python package.
## Step 3: Building the python bindings (optional)
``` shell
$ make -C build -j swigfaiss
$ (cd build/faiss/python && python setup.py install)
```
The first command builds the python bindings for Faiss, while the second one
generates and installs the python package.
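To verify the result, the freshly installed package can be smoke-tested from Python (a minimal sketch; the printed version and shapes are illustrative):
``` python
import faiss
import numpy as np

print(faiss.__version__)  # e.g. 1.10.0

index = faiss.IndexFlatL2(64)  # exact L2 index over 64-d vectors
index.add(np.random.rand(1000, 64).astype("float32"))
distances, ids = index.search(np.random.rand(5, 64).astype("float32"), 3)
print(ids.shape)  # (5, 3)
```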
## Step 4: Installing the C++ library and headers (optional)
``` shell
$ make -C build install
```
This will make the compiled library (either `libfaiss.a` or `libfaiss.so` on
Linux) available system-wide, as well as the C++ headers. This step is not
needed to install the python package only.
## Step 5: Testing (optional)
### Running the C++ test suite
To run the whole test suite, make sure that `cmake` was invoked with
`-DBUILD_TESTING=ON`, and run:
``` shell
$ make -C build test
```
### Running the python test suite
``` shell
$ (cd build/faiss/python && python setup.py build)
$ PYTHONPATH="$(ls -d ./build/faiss/python/build/lib*/)" pytest tests/test_*.py
```
### Basic example
A basic usage example is available in
[`demos/demo_ivfpq_indexing.cpp`](https://github.com/facebookresearch/faiss/blob/main/demos/demo_ivfpq_indexing.cpp).
It creates a small index, stores it and performs some searches. A normal runtime
is around 20s. With a fast machine and Intel MKL's BLAS it runs in 2.5s.
It can be built with
``` shell
$ make -C build demo_ivfpq_indexing
```
and subsequently run with
``` shell
$ ./build/demos/demo_ivfpq_indexing
```
### Basic GPU example
``` shell
$ make -C build demo_ivfpq_indexing_gpu
$ ./build/demos/demo_ivfpq_indexing_gpu
```
This produces the GPU code equivalent to the CPU `demo_ivfpq_indexing`. It also
shows how to translate indexes from/to a GPU.
### A real-life benchmark
A longer example runs and evaluates Faiss on the SIFT1M dataset. To run it,
please download the ANN_SIFT1M dataset from http://corpus-texmex.irisa.fr/
and unzip it to the subdirectory `sift1M` at the root of the source
directory for this repository.
Then compile and run the following (after ensuring you have installed faiss):
``` shell
$ make -C build demo_sift1M
$ ./build/demos/demo_sift1M
```
This is a demonstration of the high-level auto-tuning API. You can try
setting a different index_key to find the indexing structure that
gives the best performance.
### Real-life test
The following script extends the demo_sift1M test to several types of
indexes. This must be run from the root of the source directory for this
repository:
``` shell
$ mkdir tmp # graphs of the output will be written here
$ python demos/demo_auto_tune.py
```
It will cycle through a few types of indexes and find optimal
operating points. You can play around with the types of indexes.
### Real-life test on GPU
The example above also runs on GPU. Edit `demos/demo_auto_tune.py` at line 100
with the values
``` python
keys_to_test = keys_gpu
use_gpu = True
```
and you can run
``` shell
$ python demos/demo_auto_tune.py
```
to test the GPU code.
[^1]: The vector search and clustering algorithms in NVIDIA RAFT have been formally migrated to [NVIDIA cuVS](https://github.com/rapidsai/cuvs). This package is being renamed to `faiss-gpu-cuvs` in the next stable release, which will use these GPU implementations from the pre-compiled `libcuvs=24.12` binary.

View File

@@ -0,0 +1,21 @@
MIT License
Copyright (c) Facebook, Inc. and its affiliates.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@@ -0,0 +1,92 @@
# Faiss
Faiss is a library for efficient similarity search and clustering of dense vectors. It contains algorithms that search in sets of vectors of any size, up to ones that possibly do not fit in RAM. It also contains supporting code for evaluation and parameter tuning. Faiss is written in C++ with complete wrappers for Python/numpy. Some of the most useful algorithms are implemented on the GPU. It is developed primarily at Meta's [Fundamental AI Research](https://ai.facebook.com/) group.
## News
See [CHANGELOG.md](CHANGELOG.md) for detailed information about latest features.
## Introduction
Faiss contains several methods for similarity search. It assumes that the instances are represented as vectors and are identified by an integer, and that the vectors can be compared with L2 (Euclidean) distances or dot products. Vectors that are similar to a query vector are those that have the lowest L2 distance or the highest dot product with the query vector. It also supports cosine similarity, since this is a dot product on normalized vectors.
Some of the methods, like those based on binary vectors and compact quantization codes, solely use a compressed representation of the vectors and do not require keeping the original vectors. This generally comes at the cost of a less precise search, but these methods can scale to billions of vectors in main memory on a single server. Other methods, like HNSW and NSG, add an indexing structure on top of the raw vectors to make searching more efficient.
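As a minimal sketch of these comparison modes (assuming `faiss` and `numpy`; dimensions and data are illustrative), exact L2 search and cosine similarity via normalization look like this:
``` python
import faiss
import numpy as np

d = 128
xb = np.random.rand(10000, d).astype("float32")  # database vectors
xq = np.random.rand(5, d).astype("float32")      # query vectors

# L2 (Euclidean) search: lower distance means more similar.
index = faiss.IndexFlatL2(d)
index.add(xb)
distances, ids = index.search(xq, 10)  # 10 nearest neighbors per query

# Cosine similarity: inner product on L2-normalized vectors.
faiss.normalize_L2(xb)
faiss.normalize_L2(xq)
index_ip = faiss.IndexFlatIP(d)
index_ip.add(xb)
similarities, ids = index_ip.search(xq, 10)
```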
The GPU implementation can accept input from either CPU or GPU memory. On a server with GPUs, the GPU indexes can be used as a drop-in replacement for the CPU indexes (e.g., replace `IndexFlatL2` with `GpuIndexFlatL2`) and copies to/from GPU memory are handled automatically. Results will be faster, however, if both input and output remain resident on the GPU. Both single- and multi-GPU usage is supported.
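A minimal sketch of the drop-in GPU usage described above (assumes a faiss-gpu build and at least one CUDA device; data sizes are illustrative):
``` python
import faiss
import numpy as np

d = 128
xb = np.random.rand(10000, d).astype("float32")
xq = np.random.rand(5, d).astype("float32")

cpu_index = faiss.IndexFlatL2(d)
cpu_index.add(xb)

res = faiss.StandardGpuResources()                     # manages GPU scratch memory
gpu_index = faiss.index_cpu_to_gpu(res, 0, cpu_index)  # clone onto GPU 0
distances, ids = gpu_index.search(xq, 10)              # same API as the CPU index
```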
## Installing
Faiss comes with precompiled libraries for Anaconda in Python, see [faiss-cpu](https://anaconda.org/pytorch/faiss-cpu), [faiss-gpu](https://anaconda.org/pytorch/faiss-gpu) and [faiss-gpu-cuvs](https://anaconda.org/pytorch/faiss-gpu-cuvs). The library is mostly implemented in C++, the only dependency is a [BLAS](https://en.wikipedia.org/wiki/Basic_Linear_Algebra_Subprograms) implementation. Optional GPU support is provided via CUDA or AMD ROCm, and the Python interface is also optional. The backend GPU implementations of NVIDIA [cuVS](https://github.com/rapidsai/cuvs) can also be enabled optionally. It compiles with cmake. See [INSTALL.md](INSTALL.md) for details.
## How Faiss works
Faiss is built around an index type that stores a set of vectors, and provides a function to search in them with L2 and/or dot product vector comparison. Some index types are simple baselines, such as exact search. Most of the available indexing structures correspond to various trade-offs with respect to
- search time
- search quality
- memory used per index vector
- training time
- adding time
- need for external data for unsupervised training
The optional GPU implementation provides what is likely (as of March 2017) the fastest exact and approximate (compressed-domain) nearest neighbor search implementation for high-dimensional vectors, fastest Lloyd's k-means, and fastest small k-selection algorithm known. [The implementation is detailed here](https://arxiv.org/abs/1702.08734).
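Concretely, the trade-offs listed above are chosen by picking an index type; a compressed IVF+PQ index, for example, trades some search quality for much lower memory per vector (a minimal sketch, assuming `faiss` and `numpy`; the factory string and parameters are illustrative):
``` python
import faiss
import numpy as np

d = 128
xt = np.random.rand(50000, d).astype("float32")   # training vectors
xb = np.random.rand(100000, d).astype("float32")  # database vectors

# 1024 inverted lists + 32-byte PQ codes: low memory, approximate results,
# but requires a training pass before vectors can be added.
index = faiss.index_factory(d, "IVF1024,PQ32")
index.train(xt)
index.add(xb)

index.nprobe = 16  # search-time knob: probing more lists raises recall, costs time
distances, ids = index.search(np.random.rand(5, d).astype("float32"), 10)
```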
## Full documentation of Faiss
The following are entry points for documentation:
- the full documentation can be found on the [wiki page](http://github.com/facebookresearch/faiss/wiki), including a [tutorial](https://github.com/facebookresearch/faiss/wiki/Getting-started), a [FAQ](https://github.com/facebookresearch/faiss/wiki/FAQ) and a [troubleshooting section](https://github.com/facebookresearch/faiss/wiki/Troubleshooting)
- the [doxygen documentation](https://faiss.ai/) gives per-class information extracted from code comments
- to reproduce results from our research papers, [Polysemous codes](https://arxiv.org/abs/1609.01882) and [Billion-scale similarity search with GPUs](https://arxiv.org/abs/1702.08734), refer to the [benchmarks README](benchs/README.md). For [Link and code: Fast indexing with graphs and compact regression codes](https://arxiv.org/abs/1804.09996), see the [link_and_code README](benchs/link_and_code)
## Authors
The main authors of Faiss are:
- [Hervé Jégou](https://github.com/jegou) initiated the Faiss project and wrote its first implementation
- [Matthijs Douze](https://github.com/mdouze) implemented most of the CPU Faiss
- [Jeff Johnson](https://github.com/wickedfoo) implemented all of the GPU Faiss
- [Lucas Hosseini](https://github.com/beauby) implemented the binary indexes and the build system
- [Chengqi Deng](https://github.com/KinglittleQ) implemented NSG, NNdescent and much of the additive quantization code.
- [Alexandr Guzhva](https://github.com/alexanderguzhva) many optimizations: SIMD, memory allocation and layout, fast decoding kernels for vector codecs, etc.
- [Gergely Szilvasy](https://github.com/algoriddle) build system, benchmarking framework.
## Reference
References to cite when you use Faiss in a research paper:
```
@article{douze2024faiss,
title={The Faiss library},
author={Matthijs Douze and Alexandr Guzhva and Chengqi Deng and Jeff Johnson and Gergely Szilvasy and Pierre-Emmanuel Mazaré and Maria Lomeli and Lucas Hosseini and Hervé Jégou},
year={2024},
eprint={2401.08281},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
```
For the GPU version of Faiss, please cite:
```
@article{johnson2019billion,
title={Billion-scale similarity search with {GPUs}},
author={Johnson, Jeff and Douze, Matthijs and J{\'e}gou, Herv{\'e}},
journal={IEEE Transactions on Big Data},
volume={7},
number={3},
pages={535--547},
year={2019},
publisher={IEEE}
}
```
## Join the Faiss community
For public discussion of Faiss or for questions, visit https://github.com/facebookresearch/faiss/discussions.
We monitor the [issues page](http://github.com/facebookresearch/faiss/issues) of the repository.
You can report bugs, ask questions, etc.
## Legal
Faiss is MIT-licensed, refer to the [LICENSE file](https://github.com/facebookresearch/faiss/blob/main/LICENSE) in the top level directory.
Copyright © Meta Platforms, Inc.

View File

@@ -0,0 +1,10 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
add_executable(bench_ivf_selector EXCLUDE_FROM_ALL bench_ivf_selector.cpp)
target_link_libraries(bench_ivf_selector PRIVATE faiss)

View File

@@ -0,0 +1,361 @@
# Benchmarking scripts
This directory contains benchmarking scripts that can reproduce the
numbers reported in the two papers
```
@inproceedings{DJP16,
Author = {Douze, Matthijs and J{\'e}gou, Herv{\'e} and Perronnin, Florent},
Booktitle = "ECCV",
Organization = {Springer},
Title = {Polysemous codes},
Year = {2016}
}
```
and
```
@inproceedings{JDJ17,
Author = {Jeff Johnson and Matthijs Douze and Herv{\'e} J{\'e}gou},
journal = {arXiv:1702.08734},
Title = {Billion-scale similarity search with GPUs},
Year = {2017},
}
```
Note that the numbers (especially timings) change slightly due to changes in the implementation, different machines, etc.
The scripts are self-contained. They depend only on Faiss and external training data that should be stored in sub-directories.
## SIFT1M experiments
The script [`bench_polysemous_sift1m.py`](bench_polysemous_sift1m.py) reproduces the numbers in
Figure 3 from the "Polysemous" paper.
### Getting SIFT1M
To run it, please download the ANN_SIFT1M dataset from
http://corpus-texmex.irisa.fr/
and unzip it to the subdirectory sift1M.
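The `.fvecs` files in the archive store each vector as a little-endian `int32` dimension followed by that many `float32` components. A small NumPy helper along these lines (hypothetical, not part of the benchmark scripts; file names assume the standard archive layout) can be used to inspect the data:
``` python
import numpy as np

def read_fvecs(path):
    """Read an .fvecs file: each row is [int32 d][d float32 components]."""
    raw = np.fromfile(path, dtype="int32")
    d = raw[0]
    return raw.reshape(-1, d + 1)[:, 1:].copy().view("float32")

xb = read_fvecs("sift1M/sift_base.fvecs")   # database vectors, (1000000, 128)
xq = read_fvecs("sift1M/sift_query.fvecs")  # query vectors, (10000, 128)
```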
### Result
The output looks like:
```
PQ training on 100000 points, remains 0 points: training polysemous on centroids
add vectors to index
PQ baseline 7.517 ms per query, R@1 0.4474
Polysemous 64 9.875 ms per query, R@1 0.4474
Polysemous 62 8.358 ms per query, R@1 0.4474
Polysemous 58 5.531 ms per query, R@1 0.4474
Polysemous 54 3.420 ms per query, R@1 0.4478
Polysemous 50 2.182 ms per query, R@1 0.4475
Polysemous 46 1.621 ms per query, R@1 0.4408
Polysemous 42 1.448 ms per query, R@1 0.4174
Polysemous 38 1.331 ms per query, R@1 0.3563
Polysemous 34 1.334 ms per query, R@1 0.2661
Polysemous 30 1.272 ms per query, R@1 0.1794
```
## Experiments on 1B elements dataset
The script [`bench_polysemous_1bn.py`](bench_polysemous_1bn.py) reproduces a few experiments on
two datasets of size 1B from the "Polysemous codes" paper.
### Getting BIGANN
Download the four files of ANN_SIFT1B from
http://corpus-texmex.irisa.fr/ to subdirectory bigann/
### Getting Deep1B
The ground-truth and queries are available here
https://yadi.sk/d/11eDCm7Dsn9GA
For the learning and database vectors, use the script
https://github.com/arbabenko/GNOIMI/blob/master/downloadDeep1B.py
to download the data to subdirectory deep1b/, then concatenate the
database files to base.fvecs and the training files to learn.fvecs
### Running the experiments
These experiments are quite long. To support resuming, the script
stores the result of training to a temporary directory, `/tmp/bench_polysemous`.
The script `bench_polysemous_1bn.py` takes at least two arguments:
- the dataset name: SIFT1000M (aka SIFT1B, aka BIGANN) or Deep1B. SIFT1M, SIFT2M,... are also supported to make subsets for small experiments (note that SIFT1M as a subset of SIFT1B is not the same as the SIFT1M above)
- the type of index to build, which should be a valid [index_factory key](https://github.com/facebookresearch/faiss/wiki/High-level-interface-and-auto-tuning#index-factory) (see below for examples)
- the remaining arguments are parsed as search-time parameters.
### Experiments of Table 2
The `IMI*+PolyD+ADC` results in Table 2 can be reproduced with (for 16 bytes):
```
python bench_polysemous_1bn.py SIFT1000M IMI2x12,PQ16 nprobe=16,max_codes={10000,30000},ht={44..54}
```
Training takes about 2 minutes and adding vectors to the dataset
takes 3.1 h. These operations are multithreaded. Note that in the command
above, we use bash's [brace expansion](https://www.gnu.org/software/bash/manual/html_node/Brace-Expansion.html) to set a grid of parameters.
The search is *not* multithreaded, and the output looks like:
```
R@1 R@10 R@100 time %pass
nprobe=16,max_codes=10000,ht=44 0.1779 0.2994 0.3139 0.194 12.45
nprobe=16,max_codes=10000,ht=45 0.1859 0.3183 0.3339 0.197 14.24
nprobe=16,max_codes=10000,ht=46 0.1930 0.3366 0.3543 0.202 16.22
nprobe=16,max_codes=10000,ht=47 0.1993 0.3550 0.3745 0.209 18.39
nprobe=16,max_codes=10000,ht=48 0.2033 0.3694 0.3917 0.640 20.77
nprobe=16,max_codes=10000,ht=49 0.2070 0.3839 0.4077 0.229 23.36
nprobe=16,max_codes=10000,ht=50 0.2101 0.3949 0.4205 0.232 26.17
nprobe=16,max_codes=10000,ht=51 0.2120 0.4042 0.4310 0.239 29.21
nprobe=16,max_codes=10000,ht=52 0.2134 0.4113 0.4402 0.245 32.47
nprobe=16,max_codes=10000,ht=53 0.2157 0.4184 0.4482 0.250 35.96
nprobe=16,max_codes=10000,ht=54 0.2170 0.4240 0.4546 0.256 39.66
nprobe=16,max_codes=30000,ht=44 0.1882 0.3327 0.3555 0.226 11.29
nprobe=16,max_codes=30000,ht=45 0.1964 0.3525 0.3771 0.231 13.05
nprobe=16,max_codes=30000,ht=46 0.2039 0.3713 0.3987 0.236 15.01
nprobe=16,max_codes=30000,ht=47 0.2103 0.3907 0.4202 0.245 17.19
nprobe=16,max_codes=30000,ht=48 0.2145 0.4055 0.4384 0.251 19.60
nprobe=16,max_codes=30000,ht=49 0.2179 0.4198 0.4550 0.257 22.25
nprobe=16,max_codes=30000,ht=50 0.2208 0.4305 0.4681 0.268 25.15
nprobe=16,max_codes=30000,ht=51 0.2227 0.4402 0.4791 0.275 28.30
nprobe=16,max_codes=30000,ht=52 0.2241 0.4473 0.4884 0.284 31.70
nprobe=16,max_codes=30000,ht=53 0.2265 0.4544 0.4965 0.294 35.34
nprobe=16,max_codes=30000,ht=54 0.2278 0.4601 0.5031 0.303 39.20
```
The result reported in table 2 is the one for which the %pass (percentage of code comparisons that pass the Hamming check) is around 20%, which occurs for Hamming threshold `ht=48`.
The 8-byte results can be reproduced with the factory key `IMI2x12,PQ8`
### Experiments of the appendix
The experiments in the appendix are only in the ArXiv version of the paper (table 3).
```
python bench_polysemous_1bn.py SIFT1000M OPQ8_64,IMI2x13,PQ8 nprobe={1,2,4,8,16,32,64,128},ht={20,24,26,28,30}
R@1 R@10 R@100 time %pass
nprobe=1,ht=20 0.0351 0.0616 0.0751 0.158 19.01
...
nprobe=32,ht=28 0.1256 0.3563 0.5026 0.561 52.61
...
```
Here again the runs are not exactly the same but the original result was obtained from nprobe=32,ht=28.
For Deep1B, we used a simple version of [auto-tuning](https://github.com/facebookresearch/faiss/wiki/High-level-interface-and-auto-tuning/_edit#auto-tuning-the-runtime-parameters) to sweep through the set of operating points:
```
python bench_polysemous_1bn.py Deep1B OPQ20_80,IMI2x14,PQ20 autotune
...
Done in 4067.555 s, available OPs:
Parameters 1-R@1 time
0.0000 0.000
nprobe=1,ht=22,max_codes=256 0.0215 3.115
nprobe=1,ht=30,max_codes=256 0.0381 3.120
...
nprobe=512,ht=68,max_codes=524288 0.4478 36.903
nprobe=1024,ht=80,max_codes=131072 0.4557 46.363
nprobe=1024,ht=78,max_codes=262144 0.4616 61.939
...
```
The original results were obtained with `nprobe=1024,ht=66,max_codes=262144`.
## GPU experiments
The benchmarks below run on 1 or 4 Titan X GPUs and reproduce the results of the "GPU paper". They are also a good starting point on how to use GPU Faiss.
### Search on SIFT1M
See above on how to get SIFT1M into subdirectory sift1M/. The script [`bench_gpu_sift1m.py`](bench_gpu_sift1m.py) reproduces the "exact k-NN time" plot in the ArXiv paper, and the SIFT1M numbers.
The output is:
```
============ Exact search
add vectors to index
warmup
benchmark
k=1 0.715 s, R@1 0.9914
k=2 0.729 s, R@1 0.9935
k=4 0.731 s, R@1 0.9935
k=8 0.732 s, R@1 0.9935
k=16 0.742 s, R@1 0.9935
k=32 0.737 s, R@1 0.9935
k=64 0.753 s, R@1 0.9935
k=128 0.761 s, R@1 0.9935
k=256 0.799 s, R@1 0.9935
k=512 0.975 s, R@1 0.9935
k=1024 1.424 s, R@1 0.9935
============ Approximate search
train
WARNING clustering 100000 points to 4096 centroids: please provide at least 159744 training points
add vectors to index
WARN: increase temp memory to avoid cudaMalloc, or decrease query/add size (alloc 256000000 B, highwater 256000000 B)
warmup
benchmark
nprobe= 1 0.043 s recalls= 0.3909 0.4312 0.4312
nprobe= 2 0.040 s recalls= 0.5041 0.5636 0.5636
nprobe= 4 0.048 s recalls= 0.6048 0.6897 0.6897
nprobe= 8 0.064 s recalls= 0.6879 0.8028 0.8028
nprobe= 16 0.088 s recalls= 0.7534 0.8940 0.8940
nprobe= 32 0.134 s recalls= 0.7957 0.9549 0.9550
nprobe= 64 0.224 s recalls= 0.8125 0.9833 0.9834
nprobe= 128 0.395 s recalls= 0.8205 0.9953 0.9954
nprobe= 256 0.717 s recalls= 0.8227 0.9993 0.9994
nprobe= 512 1.348 s recalls= 0.8228 0.9999 1.0000
```
The run produces two warnings:
- the clustering complains that it does not have enough training data; there is not much we can do about this.
- the add() function complains about an inefficient memory allocation, but this is a concern only when it happens often, and we are not benchmarking the add time anyway.
To index small datasets, it is more efficient to use a `GpuIVFFlat`, which just stores the full vectors in the inverted lists. We did not mention this in the paper because it is not as scalable. To experiment with this setting, change the `index_factory` string from "IVF4096,PQ64" to "IVF16384,Flat". This gives:
```
nprobe= 1 0.025 s recalls= 0.4084 0.4105 0.4105
nprobe= 2 0.033 s recalls= 0.5235 0.5264 0.5264
nprobe= 4 0.033 s recalls= 0.6332 0.6367 0.6367
nprobe= 8 0.040 s recalls= 0.7358 0.7403 0.7403
nprobe= 16 0.049 s recalls= 0.8273 0.8324 0.8324
nprobe= 32 0.068 s recalls= 0.8957 0.9024 0.9024
nprobe= 64 0.104 s recalls= 0.9477 0.9549 0.9549
nprobe= 128 0.174 s recalls= 0.9760 0.9837 0.9837
nprobe= 256 0.299 s recalls= 0.9866 0.9944 0.9944
nprobe= 512 0.527 s recalls= 0.9907 0.9987 0.9987
```
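As a hedged sketch of this change (not the full script; assumes training vectors `xt` and database `xb` are loaded), the index construction becomes:
```
import faiss

res = faiss.StandardGpuResources()
index = faiss.index_factory(128, "IVF16384,Flat")  # instead of "IVF4096,PQ64"
index = faiss.index_cpu_to_gpu(res, 0, index)      # move to GPU 0
index.train(xt)
index.add(xb)
```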
### Clustering on MNIST8m
To get the "infinite MNIST dataset", follow the instructions on [Léon Bottou's website](http://leon.bottou.org/projects/infimnist). The script assumes the file `mnist8m-patterns-idx3-ubyte` is in subdirectory `mnist8m`
The script [`kmeans_mnist.py`](kmeans_mnist.py) produces the following output:
```
python kmeans_mnist.py 1 256
...
Clustering 8100000 points in 784D to 256 clusters, redo 1 times, 20 iterations
Preprocessing in 7.94526 s
Iteration 19 (131.697 s, search 114.78 s): objective=1.44881e+13 imbalance=1.05963 nsplit=0
final objective: 1.449e+13
total runtime: 140.615 s
```
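The same clustering can be run directly with the `faiss.Kmeans` helper; a minimal sketch (assuming `x` is an `(n, 784)` float32 array of MNIST8m patterns):
```
import faiss

km = faiss.Kmeans(d=784, k=256, niter=20, verbose=True, gpu=True)
km.train(x)
centroids = km.centroids  # (256, 784) array of cluster centers
```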
### Search on SIFT1B
The script [`bench_gpu_1bn.py`](bench_gpu_1bn.py) runs multi-gpu searches on the two 1-billion vector datasets we considered. It is more complex than the previous scripts, because it supports many search options and decomposes the dataset build process in Python to exploit the best possible CPU/GPU parallelism and GPU distribution.
Even on multiple GPUs, building the indexes for the 1B-vector datasets can take several hours. It is often a good idea to validate that everything is working fine on smaller datasets like SIFT1M, SIFT2M, etc.
The search results on SIFT1B in the "GPU paper" can be obtained with
<!-- see P57124181 -->
```
python bench_gpu_1bn.py SIFT1000M OPQ8_32,IVF262144,PQ8 -nnn 10 -ngpu 1 -tempmem $[1536*1024*1024]
...
0/10000 (0.024 s) probe=1 : 0.161 s 1-R@1: 0.0752 1-R@10: 0.1924
0/10000 (0.005 s) probe=2 : 0.150 s 1-R@1: 0.0964 1-R@10: 0.2693
0/10000 (0.005 s) probe=4 : 0.153 s 1-R@1: 0.1102 1-R@10: 0.3328
0/10000 (0.005 s) probe=8 : 0.170 s 1-R@1: 0.1220 1-R@10: 0.3827
0/10000 (0.005 s) probe=16 : 0.196 s 1-R@1: 0.1290 1-R@10: 0.4151
0/10000 (0.006 s) probe=32 : 0.244 s 1-R@1: 0.1314 1-R@10: 0.4345
0/10000 (0.006 s) probe=64 : 0.353 s 1-R@1: 0.1332 1-R@10: 0.4461
0/10000 (0.005 s) probe=128: 0.587 s 1-R@1: 0.1341 1-R@10: 0.4502
0/10000 (0.006 s) probe=256: 1.160 s 1-R@1: 0.1342 1-R@10: 0.4511
```
We use the `-tempmem` option to reduce the temporary memory allocation to 1.5 GiB; otherwise the dataset does not fit in GPU memory.
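Programmatically, the same cap corresponds to a setting on the GPU resources object; a hedged sketch:
```
import faiss

res = faiss.StandardGpuResources()
res.setTempMemory(1536 * 1024 * 1024)  # limit scratch space to 1.5 GiB
```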
### Search on Deep1B
The same script generates the GPU search results on Deep1B.
```
python bench_gpu_1bn.py Deep1B OPQ20_80,IVF262144,PQ20 -nnn 10 -R 2 -ngpu 4 -altadd -noptables -tempmem $[1024*1024*1024]
...
0/10000 (0.115 s) probe=1 : 0.239 s 1-R@1: 0.2387 1-R@10: 0.3420
0/10000 (0.006 s) probe=2 : 0.103 s 1-R@1: 0.3110 1-R@10: 0.4623
0/10000 (0.005 s) probe=4 : 0.105 s 1-R@1: 0.3772 1-R@10: 0.5862
0/10000 (0.005 s) probe=8 : 0.116 s 1-R@1: 0.4235 1-R@10: 0.6889
0/10000 (0.005 s) probe=16 : 0.133 s 1-R@1: 0.4517 1-R@10: 0.7693
0/10000 (0.005 s) probe=32 : 0.168 s 1-R@1: 0.4713 1-R@10: 0.8281
0/10000 (0.005 s) probe=64 : 0.238 s 1-R@1: 0.4841 1-R@10: 0.8649
0/10000 (0.007 s) probe=128: 0.384 s 1-R@1: 0.4900 1-R@10: 0.8816
0/10000 (0.005 s) probe=256: 0.736 s 1-R@1: 0.4933 1-R@10: 0.8912
```
Here we are a bit tight on memory, so we disable precomputed tables (`-noptables`) and restrict the amount of temporary memory. The `-altadd` option avoids GPU memory overflows during add.
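The precomputed-table setting maps onto the cloner options used when spreading an index over several GPUs; a hedged sketch (assuming `cpu_index` is the trained index):
```
import faiss

co = faiss.GpuMultipleClonerOptions()
co.usePrecomputed = False  # the equivalent of -noptables
co.shard = True            # split the database across the GPUs
gpu_index = faiss.index_cpu_to_all_gpus(cpu_index, co, ngpu=4)
```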
### KNN-graph on Deep1B
The same script generates the KNN-graph on Deep1B. Note that the inverted file from above will not be re-used because the training sets are different. For the KNN-graph, the script first does a pass over the whole dataset to compute the ground-truth k-NN for a subset of 10k nodes, for evaluation (a sketch of this ground-truth step follows the output below).
```
python bench_gpu_1bn.py Deep1B OPQ20_80,IVF262144,PQ20 -nnn 10 -altadd -knngraph -R 2 -noptables -tempmem $[1<<30] -ngpu 4
...
CPU index contains 1000000000 vectors, move to GPU
Copy CPU index to 2 sharded GPU indexes
dispatch to GPUs 0:2
IndexShards shard 0 indices 0:500000000
IndexIVFPQ size 500000000 -> GpuIndexIVFPQ indicesOptions=0 usePrecomputed=0 useFloat16=0 reserveVecs=0
IndexShards shard 1 indices 500000000:1000000000
IndexIVFPQ size 500000000 -> GpuIndexIVFPQ indicesOptions=0 usePrecomputed=0 useFloat16=0 reserveVecs=0
dispatch to GPUs 2:4
IndexShards shard 0 indices 0:500000000
IndexIVFPQ size 500000000 -> GpuIndexIVFPQ indicesOptions=0 usePrecomputed=0 useFloat16=0 reserveVecs=0
IndexShards shard 1 indices 500000000:1000000000
IndexIVFPQ size 500000000 -> GpuIndexIVFPQ indicesOptions=0 usePrecomputed=0 useFloat16=0 reserveVecs=0
move to GPU done in 151.535 s
search...
999997440/1000000000 (8389.961 s, 0.3379) probe=1 : 8389.990 s rank-10 intersection results: 0.3379
999997440/1000000000 (9205.934 s, 0.4079) probe=2 : 9205.966 s rank-10 intersection results: 0.4079
999997440/1000000000 (9741.095 s, 0.4722) probe=4 : 9741.128 s rank-10 intersection results: 0.4722
999997440/1000000000 (10830.420 s, 0.5256) probe=8 : 10830.455 s rank-10 intersection results: 0.5256
999997440/1000000000 (12531.716 s, 0.5603) probe=16 : 12531.758 s rank-10 intersection results: 0.5603
999997440/1000000000 (15922.519 s, 0.5825) probe=32 : 15922.571 s rank-10 intersection results: 0.5825
999997440/1000000000 (22774.153 s, 0.5950) probe=64 : 22774.220 s rank-10 intersection results: 0.5950
999997440/1000000000 (36717.207 s, 0.6015) probe=128: 36717.309 s rank-10 intersection results: 0.6015
999997440/1000000000 (70616.392 s, 0.6047) probe=256: 70616.581 s rank-10 intersection results: 0.6047
```
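The ground-truth pass amounts to an exact k-NN computation on the subset; as a sketch (assuming `xb` holds the database vectors):
```
import faiss

sub = xb[:10000]                      # evaluation subset
D_gt, I_gt = faiss.knn(sub, xb, 10)   # brute-force 10-NN ground truth
# note: each point's nearest neighbor is itself, since sub is drawn from xb
```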
# Additional benchmarks
This directory also contains additional benchmarks (which also serve as a source of examples of how to use the Faiss code).
Some of these tests/benchmarks might be outdated.
* bench_6bit_codec.cpp - tests vector codecs for SQ6 quantization on a synthetic dataset
* bench_cppcontrib_sa_decode.cpp - benchmarks specialized kernels for vector codecs for PQ, IVFPQ and Residual+PQ on a synthetic dataset
* bench_for_interrupt.py - evaluates the impact of the interrupt callback handler (which can be triggered from Python code)
* bench_hamming_computer.cpp - specialized implementations for Hamming distance computations
* bench_heap_replace.cpp - benchmarks different implementations of certain calls for a Heap data structure
* bench_hnsw.py - benchmarks HNSW against other index types on the SIFT1M dataset
* bench_index_flat.py - benchmarks IndexFlatL2 on a synthetic dataset
* bench_index_pq.py - benchmarks PQ on SIFT1M dataset
* bench_ivf_fastscan_single_query.py - benchmarks a single query for different nprobe levels for IVF{nlist},PQ{M}x4fs on BIGANN dataset
* bench_ivf_fastscan.py - compares IVF{nlist},PQ{M}x4fs against other indices on SIFT1M dataset
* bench_ivf_selector.cpp - checks the possible overhead when using faiss::IDSelectorAll interface
* bench_pairwise_distances.py - benchmarks pairwise distance computation between two synthetic datasets
* bench_partition.py - benchmarks partitioning functions
* bench_pq_tables.py - benchmarks ProductQuantizer.compute_inner_prod_tables() and ProductQuantizer.compute_distance_tables() calls
* bench_quantizer.py - benchmarks various quantizers for SIFT1M, Deep1B, BigANN datasets
* bench_scalar_quantizer.py - benchmarks IVF+SQ on the SIFT1M dataset
* bench_vector_ops.py - benchmarks dot product and distances computations on a synthetic dataset

View File

@@ -0,0 +1,82 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#include <omp.h>
#include <cstdio>
#include <benchmark/benchmark.h>
#include <faiss/impl/ScalarQuantizer.h>
#include <faiss/utils/distances.h>
#include <faiss/utils/random.h>
#include <faiss/utils/utils.h>
using namespace faiss;
static void bench(benchmark::State& state) {
int d = 128;
int n = 2000;
state.SetLabel(faiss::get_compile_options());
std::vector<float> x(d * n);
float_rand(x.data(), d * n, 12345);
// (the encode/decode/encode sequence below checks that encoding is idempotent)
ScalarQuantizer sq(d, ScalarQuantizer::QT_6bit);
omp_set_num_threads(1);
sq.train(n, x.data());
size_t code_size = sq.code_size;
state.counters["code_size"] = sq.code_size;
// encode
std::vector<uint8_t> codes(code_size * n);
sq.compute_codes(x.data(), codes.data(), n);
// decode
std::vector<float> x2(d * n);
sq.decode(codes.data(), x2.data(), n);
state.counters["sql2_recons_error"] =
fvec_L2sqr(x.data(), x2.data(), n * d) / n;
// encode again
std::vector<uint8_t> codes2(code_size * n);
sq.compute_codes(x2.data(), codes2.data(), n);
size_t ndiff = 0;
for (size_t i = 0; i < codes.size(); i++) {
if (codes[i] != codes2[i])
ndiff++;
}
state.counters["ndiff_for_idempotence"] = ndiff;
state.counters["code_size_two"] = codes.size();
std::unique_ptr<ScalarQuantizer::SQDistanceComputer> dc(
sq.get_distance_computer());
dc->codes = codes.data();
dc->code_size = sq.code_size;
state.counters["code_size_three"] = dc->code_size;
for (auto _ : state) {
float sum_dis = 0;
for (int i = 0; i < n; i++) {
dc->set_query(&x[i * d]);
for (int j = 0; j < n; j++) {
benchmark::DoNotOptimize(sum_dis += (*dc)(j));
}
}
}
}
// n and d could be made input arguments so the benchmark
// generalizes beyond this fixed problem size.
BENCHMARK(bench)->Iterations(20);
BENCHMARK_MAIN();

View File

@@ -0,0 +1,20 @@
# Benchmark of IVF variants
This is a benchmark of IVF index variants, looking at compression vs. speed vs. accuracy.
The results are in [this wiki chapter](https://github.com/facebookresearch/faiss/wiki/Indexing-1G-vectors)
The code is organized as:
- `datasets.py`: code to access the datafiles, compute the ground-truth and report accuracies
- `bench_all_ivf.py`: evaluate one type of inverted file
- `run_on_cluster_generic.bash`: call `bench_all_ivf.py` for all tested types of indices.
Since the number of experiments is quite large, the script is structured so that the benchmark can be run on a cluster.
- `parse_bench_all_ivf.py`: make nice tradeoff plots from all the results.
The code depends on Faiss and can use 1 to 8 GPUs to do the k-means clustering for large vocabularies.
It was run in October 2018 for the results in the wiki.
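As an illustration of the helpers in `datasets.py` (a hedged sketch using names from that file):
```
from datasets import load_dataset, sanitize

ds = load_dataset("deep1M")
xq = sanitize(ds.get_queries())   # contiguous float32 queries
gt = ds.get_groundtruth(k=100)
xb = ds.get_database()
```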

View File

@@ -0,0 +1,567 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import argparse
import os
import sys
import time
import json
import faiss
import numpy as np
try:
import datasets_fb as datasets
except ModuleNotFoundError:
import datasets_oss as datasets
sanitize = datasets.sanitize
def unwind_index_ivf(index):
if isinstance(index, faiss.IndexPreTransform):
assert index.chain.size() == 1
vt = index.chain.at(0)
index_ivf, vt2 = unwind_index_ivf(faiss.downcast_index(index.index))
assert vt2 is None
if vt is None:
vt = lambda x: x
else:
vt = faiss.downcast_VectorTransform(vt)
return index_ivf, vt
if hasattr(faiss, "IndexRefine") and isinstance(index, faiss.IndexRefine):
return unwind_index_ivf(faiss.downcast_index(index.base_index))
if isinstance(index, faiss.IndexIVF):
return index, None
else:
return None, None
def apply_AQ_options(index, args):
# if not(
# isinstance(index, faiss.IndexAdditiveQuantize) or
# isinstance(index, faiss.IndexIVFAdditiveQuantizer)):
# return
if args.RQ_train_default:
print("set default training for RQ")
index.rq.train_type  # check that the attribute exists
index.rq.train_type = faiss.ResidualQuantizer.Train_default
if args.RQ_beam_size != -1:
print("set RQ beam size to", args.RQ_beam_size)
index.rq.max_beam_size  # check that the attribute exists
index.rq.max_beam_size = args.RQ_beam_size
if args.LSQ_encode_ils_iters != -1:
print("set LSQ ils iterations to", args.LSQ_encode_ils_iters)
index.lsq.encode_ils_iters  # check that the attribute exists
index.lsq.encode_ils_iters = args.LSQ_encode_ils_iters
if args.RQ_use_beam_LUT != -1:
print("set RQ beam LUT to", args.RQ_use_beam_LUT)
index.rq.use_beam_LUT  # check that the attribute exists
index.rq.use_beam_LUT = args.RQ_use_beam_LUT
def eval_setting(index, xq, gt, k, inter, min_time):
""" evaluate searching in terms of precision vs. speed """
nq = xq.shape[0]
ivf_stats = faiss.cvar.indexIVF_stats
ivf_stats.reset()
nrun = 0
t0 = time.time()
while True:
D, I = index.search(xq, k)
nrun += 1
t1 = time.time()
if t1 - t0 > min_time:
break
ms_per_query = ((t1 - t0) * 1000.0 / nq / nrun)
res = {
"ms_per_query": ms_per_query,
"nrun": nrun
}
res["n"] = ms_per_query
if inter:
rank = k
inter_measure = faiss.eval_intersection(gt[:, :rank], I[:, :rank]) / (nq * rank)
print("%.4f" % inter_measure, end=' ')
res["inter_measure"] = inter_measure
else:
res["recalls"] = {}
for rank in 1, 10, 100:
recall = (I[:, :rank] == gt[:, :1]).sum() / float(nq)
print("%.4f" % recall, end=' ')
res["recalls"][rank] = recall
print(" %9.5f " % ms_per_query, end=' ')
print("%12d " % (ivf_stats.ndis / nrun), end=' ')
print(nrun)
res["ndis"] = ivf_stats.ndis / nrun
return res
######################################################
# Training
######################################################
def run_train(args, ds, res):
nq, d = ds.nq, ds.d
nb, d = ds.nb, ds.d
print("build index, key=", args.indexkey)
index = faiss.index_factory(
d, args.indexkey, faiss.METRIC_L2 if ds.metric == "L2" else
faiss.METRIC_INNER_PRODUCT
)
index_ivf, vec_transform = unwind_index_ivf(index)
if args.by_residual != -1:
by_residual = args.by_residual == 1
print("setting by_residual = ", by_residual)
index_ivf.by_residual # check if field exists
index_ivf.by_residual = by_residual
if index_ivf:
print("Update add-time parameters")
# adjust default parameters used at add time for quantizers
# because otherwise the assignment is inaccurate
quantizer = faiss.downcast_index(index_ivf.quantizer)
if isinstance(quantizer, faiss.IndexRefine):
print(" update quantizer k_factor=", quantizer.k_factor, end=" -> ")
quantizer.k_factor = 32 if index_ivf.nlist < 1e6 else 64
print(quantizer.k_factor)
base_index = faiss.downcast_index(quantizer.base_index)
if isinstance(base_index, faiss.IndexIVF):
print(" update quantizer nprobe=", base_index.nprobe, end=" -> ")
base_index.nprobe = (
16 if base_index.nlist < 1e5 else
32 if base_index.nlist < 4e6 else
64)
print(base_index.nprobe)
elif isinstance(quantizer, faiss.IndexHNSW):
hnsw = quantizer.hnsw
print(
f" update HNSW quantizer options, before: "
f"{hnsw.efSearch=:} {hnsw.efConstruction=:}"
)
hnsw.efSearch = 40 if index_ivf.nlist < 4e6 else 64
hnsw.efConstruction = 200
print(f" after: {hnsw.efSearch=:} {hnsw.efConstruction=:}")
apply_AQ_options(index_ivf or index, args)
if index_ivf:
index_ivf.verbose = True
index_ivf.quantizer.verbose = True
index_ivf.cp.verbose = True
else:
index.verbose = True
maxtrain = args.maxtrain
if maxtrain == 0:
if 'IMI' in args.indexkey:
maxtrain = int(256 * 2 ** (np.log2(index_ivf.nlist) / 2))
elif index_ivf:
maxtrain = 50 * index_ivf.nlist
else:
# just guess...
maxtrain = 256 * 100
maxtrain = max(maxtrain, 256 * 100)
print("setting maxtrain to %d" % maxtrain)
try:
xt2 = ds.get_train(maxtrain=maxtrain)
except NotImplementedError:
print("No training set: training on database")
xt2 = ds.get_database()[:maxtrain]
print("train, size", xt2.shape)
assert np.all(np.isfinite(xt2))
if (isinstance(vec_transform, faiss.OPQMatrix) and
isinstance(index_ivf, faiss.IndexIVFPQFastScan)):
print(" Forcing OPQ training PQ to PQ4")
ref_pq = index_ivf.pq
training_pq = faiss.ProductQuantizer(
ref_pq.d, ref_pq.M, ref_pq.nbits
)
vec_transform.pq  # check that the attribute exists
vec_transform.pq = training_pq
if args.get_centroids_from == '':
if args.clustering_niter >= 0:
print(("setting nb of clustering iterations to %d" %
args.clustering_niter))
index_ivf.cp.niter = args.clustering_niter
if args.train_on_gpu:
print("add a training index on GPU")
train_index = faiss.index_cpu_to_all_gpus(
faiss.IndexFlatL2(index_ivf.d))
index_ivf.clustering_index = train_index
else:
print("Getting centroids from", args.get_centroids_from)
src_index = faiss.read_index(args.get_centroids_from)
src_quant = faiss.downcast_index(src_index.quantizer)
centroids = src_quant.reconstruct_n()
print(" centroid table shape", centroids.shape)
if isinstance(vec_transform, faiss.VectorTransform):
print(" training vector transform")
vec_transform.train(xt2)
print(" transform centroids")
centroids = vec_transform.apply_py(centroids)
if not index_ivf.quantizer.is_trained:
print(" training quantizer")
index_ivf.quantizer.train(centroids)
print(" add centroids to quantizer")
index_ivf.quantizer.add(centroids)
del src_index
t0 = time.time()
index.train(xt2)
res.train_time = time.time() - t0
print(" train in %.3f s" % res.train_time)
return index
######################################################
# Populating index
######################################################
def run_add(args, ds, index, res):
print("adding")
t0 = time.time()
if args.add_bs == -1:
assert args.split == [1, 0], "split not supported with full batch add"
index.add(sanitize(ds.get_database()))
else:
totn = ds.nb // args.split[0] # approximate
i0 = 0
print(f"Adding in block sizes {args.add_bs} with split {args.split}")
for xblock in ds.database_iterator(bs=args.add_bs, split=args.split):
i1 = i0 + len(xblock)
print(" adding %d:%d / %d [%.3f s, RSS %d kiB] " % (
i0, i1, totn, time.time() - t0,
faiss.get_mem_usage_kb()))
index.add(xblock)
i0 = i1
res.t_add = time.time() - t0
print(f" add in {res.t_add:.3f} s index size {index.ntotal}")
######################################################
# Search
######################################################
def run_search(args, ds, index, res):
index_ivf, vec_transform = unwind_index_ivf(index)
if args.no_precomputed_tables:
if isinstance(index_ivf, faiss.IndexIVFPQ):
print("disabling precomputed table")
index_ivf.use_precomputed_table = -1
index_ivf.precomputed_table.clear()
if args.indexfile:
print("index size on disk: ", os.stat(args.indexfile).st_size)
if hasattr(index, "code_size"):
print("vector code_size", index.code_size)
if hasattr(index_ivf, "code_size"):
print("vector code_size (IVF)", index_ivf.code_size)
print("current RSS:", faiss.get_mem_usage_kb() * 1024)
precomputed_table_size = 0
if hasattr(index_ivf, 'precomputed_table'):
precomputed_table_size = index_ivf.precomputed_table.size() * 4
print("precomputed tables size:", precomputed_table_size)
# Index is ready
xq = sanitize(ds.get_queries())
nq, d = xq.shape
gt = ds.get_groundtruth(k=args.k)
if not args.accept_short_gt: # Deep1B has only a single NN per query
assert gt.shape[1] == args.k
if args.searchthreads != -1:
print("Setting nb of threads to", args.searchthreads)
faiss.omp_set_num_threads(args.searchthreads)
else:
print("nb search threads: ", faiss.omp_get_max_threads())
ps = faiss.ParameterSpace()
ps.initialize(index)
parametersets = args.searchparams
if args.inter:
header = (
'%-40s inter@%3d time(ms/q) nb distances #runs' %
("parameters", args.k)
)
else:
header = (
'%-40s R@1 R@10 R@100 time(ms/q) nb distances #runs' %
"parameters"
)
res.search_results = {}
if parametersets == ['autotune']:
ps.n_experiments = args.n_autotune
ps.min_test_duration = args.min_test_duration
for kv in args.autotune_max:
k, vmax = kv.split(':')
vmax = float(vmax)
print("limiting %s to %g" % (k, vmax))
pr = ps.add_range(k)
values = faiss.vector_to_array(pr.values)
values = np.array([v for v in values if v < vmax])
faiss.copy_array_to_vector(values, pr.values)
for kv in args.autotune_range:
k, vals = kv.split(':')
vals = np.fromstring(vals, sep=',')
print("setting %s to %s" % (k, vals))
pr = ps.add_range(k)
faiss.copy_array_to_vector(vals, pr.values)
# setup the Criterion object
if args.inter:
print("Optimize for intersection @ ", args.k)
crit = faiss.IntersectionCriterion(nq, args.k)
else:
print("Optimize for 1-recall @ 1")
crit = faiss.OneRecallAtRCriterion(nq, 1)
# by default, the criterion will request only 1 NN
crit.nnn = args.k
crit.set_groundtruth(None, gt.astype('int64'))
# then we let Faiss find the optimal parameters by itself
print("exploring operating points, %d threads" % faiss.omp_get_max_threads());
ps.display()
t0 = time.time()
op = ps.explore(index, xq, crit)
res.t_explore = time.time() - t0
print("Done in %.3f s, available OPs:" % res.t_explore)
op.display()
print("Re-running evaluation on selected OPs")
print(header)
opv = op.optimal_pts
maxw = max(max(len(opv.at(i).key) for i in range(opv.size())), 40)
for i in range(opv.size()):
opt = opv.at(i)
ps.set_index_parameters(index, opt.key)
print(opt.key.ljust(maxw), end=' ')
sys.stdout.flush()
res_i = eval_setting(index, xq, gt, args.k, args.inter, args.min_test_duration)
res.search_results[opt.key] = res_i
else:
print(header)
for param in parametersets:
print("%-40s " % param, end=' ')
sys.stdout.flush()
ps.set_index_parameters(index, param)
res_i = eval_setting(index, xq, gt, args.k, args.inter, args.min_test_duration)
res.search_results[param] = res_i
######################################################
# Driver function
######################################################
def main():
parser = argparse.ArgumentParser()
def aa(*args, **kwargs):
group.add_argument(*args, **kwargs)
group = parser.add_argument_group('general options')
aa('--nthreads', default=-1, type=int,
help='nb of threads to use at train and add time')
aa('--json', default=False, action="store_true",
help="output stats in JSON format at the end")
aa('--todo', default=["check_files"],
choices=["train", "add", "search", "check_files"],
nargs="+", help='what to do (check_files means decide depending on which index files exist)')
group = parser.add_argument_group('dataset options')
aa('--db', default='deep1M', help='dataset')
aa('--compute_gt', default=False, action='store_true',
help='compute and store the groundtruth')
aa('--force_IP', default=False, action="store_true",
help='force IP search instead of L2')
aa('--accept_short_gt', default=False, action='store_true',
help='work around a problem with Deep1B GT')
group = parser.add_argument_group('index construction')
aa('--indexkey', default='HNSW32', help='index_factory type')
aa('--trained_indexfile', default='',
help='file to read or write a trained index from')
aa('--maxtrain', default=256 * 256, type=int,
help='maximum number of training points (0 to set automatically)')
aa('--indexfile', default='', help='file to read or write index from')
aa('--split', default=[1, 0], type=int, nargs=2, help="database split")
aa('--add_bs', default=-1, type=int,
help='add elements index by batches of this size')
group = parser.add_argument_group('IVF options')
aa('--by_residual', default=-1, type=int,
help="set if index should use residuals (default=unchanged)")
aa('--no_precomputed_tables', action='store_true', default=False,
help='disable precomputed tables (uses less memory)')
aa('--get_centroids_from', default='',
help='get the centroids from this index (to speed up training)')
aa('--clustering_niter', default=-1, type=int,
help='number of clustering iterations (-1 = leave default)')
aa('--train_on_gpu', default=False, action='store_true',
help='do training on GPU')
group = parser.add_argument_group('index-specific options')
aa('--M0', default=-1, type=int, help='size of base level for HNSW')
aa('--RQ_train_default', default=False, action="store_true",
help='disable progressive dim training for RQ')
aa('--RQ_beam_size', default=-1, type=int,
help='set beam size at add time')
aa('--LSQ_encode_ils_iters', default=-1, type=int,
help='ILS iterations for LSQ')
aa('--RQ_use_beam_LUT', default=-1, type=int,
help='use beam LUT at add time')
group = parser.add_argument_group('searching')
aa('--k', default=100, type=int, help='nb of nearest neighbors')
aa('--inter', default=False, action='store_true',
help='use intersection measure instead of 1-recall as metric')
aa('--searchthreads', default=-1, type=int,
help='nb of threads to use at search time')
aa('--searchparams', nargs='+', default=['autotune'],
help="search parameters to use (can be autotune or a list of params)")
aa('--n_autotune', default=500, type=int,
help="max nb of autotune experiments")
aa('--autotune_max', default=[], nargs='*',
help='set max value for autotune variables format "var:val" (exclusive)')
aa('--autotune_range', default=[], nargs='*',
help='set complete autotune range, format "var:val1,val2,..."')
aa('--min_test_duration', default=3.0, type=float,
help='run test at least for so long to avoid jitter')
aa('--indexes_to_merge', default=[], nargs="*",
help="load these indexes to search and merge them before searching")
args = parser.parse_args()
if args.todo == ["check_files"]:
if os.path.exists(args.indexfile):
args.todo = ["search"]
elif os.path.exists(args.trained_indexfile):
args.todo = ["add", "search"]
else:
args.todo = ["train", "add", "search"]
print("setting todo to", args.todo)
print("args:", args)
os.system('echo -n "nb processors "; '
'cat /proc/cpuinfo | grep ^processor | wc -l; '
'cat /proc/cpuinfo | grep ^"model name" | tail -1')
# object to collect results
res = argparse.Namespace()
res.args = args.__dict__
res.cpu_model = [
l for l in open("/proc/cpuinfo", "r")
if "model name" in l][0]
print("Load dataset")
ds = datasets.load_dataset(
dataset=args.db, compute_gt=args.compute_gt)
if args.force_IP:
ds.metric = "IP"
print(ds)
if args.nthreads != -1:
print("Set nb of threads to", args.nthreads)
faiss.omp_set_num_threads(args.nthreads)
else:
print("nb threads: ", faiss.omp_get_max_threads())
index = None
if "train" in args.todo:
print("================== Training index")
index = run_train(args, ds, res)
if args.trained_indexfile:
print("storing trained index", args.trained_indexfile)
faiss.write_index(index, args.trained_indexfile)
if "add" in args.todo:
if not index:
assert args.trained_indexfile
print("reading trained index", args.trained_indexfile)
index = faiss.read_index(args.trained_indexfile)
print("================== Adding vectors to index")
run_add(args, ds, index, res)
if args.indexfile:
print("storing", args.indexfile)
faiss.write_index(index, args.indexfile)
if "search" in args.todo:
if not index:
if args.indexfile:
print("reading index", args.indexfile)
index = faiss.read_index(args.indexfile)
elif args.indexes_to_merge:
print(f"Merging {len(args.indexes_to_merge)} indexes")
sz = 0
for fname in args.indexes_to_merge:
print(f" reading {fname} (current size {sz})")
index_i = faiss.read_index(fname)
if index is None:
index = index_i
else:
index.merge_from(index_i, index.ntotal)
sz = index.ntotal
else:
assert False, "provide --indexfile"
print("================== Searching")
run_search(args, ds, index, res)
if args.json:
print("JSON results:", json.dumps(res.__dict__))
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,116 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import os
import numpy as np
import faiss
import argparse
import datasets
from datasets import sanitize
######################################################
# Command-line parsing
######################################################
parser = argparse.ArgumentParser()
def aa(*args, **kwargs):
group.add_argument(*args, **kwargs)
group = parser.add_argument_group('dataset options')
aa('--db', default='deep1M', help='dataset')
aa('--nt', default=65536, type=int)
aa('--nb', default=100000, type=int)
aa('--nt_sample', default=0, type=int)
group = parser.add_argument_group('kmeans options')
aa('--k', default=256, type=int)
aa('--seed', default=12345, type=int)
aa('--pcadim', default=-1, type=int, help='PCA to this dimension')
aa('--niter', default=25, type=int)
aa('--eval_freq', default=100, type=int)
args = parser.parse_args()
print("args:", args)
os.system('echo -n "nb processors "; '
'cat /proc/cpuinfo | grep ^processor | wc -l; '
'cat /proc/cpuinfo | grep ^"model name" | tail -1')
ngpu = faiss.get_num_gpus()
print("nb GPUs:", ngpu)
######################################################
# Load dataset
######################################################
xt, xb, xq, gt = datasets.load_data(dataset=args.db)
if args.nt_sample == 0:
xt_pca = xt[args.nt:args.nt + 10000]
xt = xt[:args.nt]
else:
xt_pca = xt[args.nt_sample:args.nt_sample + 10000]
rs = np.random.RandomState(args.seed)
idx = rs.choice(args.nt_sample, size=args.nt, replace=False)
xt = xt[idx]
xb = xb[:args.nb]
d = xb.shape[1]
if args.pcadim != -1:
print("training PCA: %d -> %d" % (d, args.pcadim))
pca = faiss.PCAMatrix(d, args.pcadim)
pca.train(sanitize(xt_pca))
xt = pca.apply_py(sanitize(xt))
xb = pca.apply_py(sanitize(xb))
d = xb.shape[1]
######################################################
# Run clustering
######################################################
index = faiss.IndexFlatL2(d)
if ngpu > 0:
print("moving index to GPU")
index = faiss.index_cpu_to_all_gpus(index)
clustering = faiss.Clustering(d, args.k)
clustering.verbose = True
clustering.seed = args.seed
clustering.max_points_per_centroid = 10**6
clustering.min_points_per_centroid = 1
centroids = None
for iter0 in range(0, args.niter, args.eval_freq):
iter1 = min(args.niter, iter0 + args.eval_freq)
clustering.niter = iter1 - iter0
if iter0 > 0:
faiss.copy_array_to_vector(centroids.ravel(), clustering.centroids)
clustering.train(sanitize(xt), index)
index.reset()
centroids = faiss.vector_to_array(clustering.centroids).reshape(args.k, d)
index.add(centroids)
_, I = index.search(sanitize(xb), 1)
error = ((xb - centroids[I.ravel()]) ** 2).sum()
print("iter1=%d quantization error on test: %.4f" % (iter1, error))

View File

@@ -0,0 +1,307 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import time
import sys
import os
import argparse
import numpy as np
def eval_recalls(name, I, gt, times):
k = I.shape[1]
s = "%-40s recall" % name
nq = len(gt)
for rank in 1, 10, 100, 1000:
if rank > k:
break
recall = (I[:, :rank] == gt[:, :1]).sum() / nq
s += "@%d: %.4f " % (rank, recall)
s += "time: %.4f s (± %.4f)" % (np.mean(times), np.std(times))
print(s)
def eval_inters(name, I, gt, times):
k = I.shape[1]
s = "%-40s inter" % name
nq = len(gt)
for rank in 1, 10, 100, 1000:
if rank > k:
break
ninter = 0
for i in range(nq):
ninter += np.intersect1d(I[i, :rank], gt[i, :rank]).size
inter = ninter / (nq * rank)
s += "@%d: %.4f " % (rank, inter)
s += "time: %.4f s (± %.4f)" % (np.mean(times), np.std(times))
print(s)
def main():
parser = argparse.ArgumentParser()
def aa(*args, **kwargs):
group.add_argument(*args, **kwargs)
group = parser.add_argument_group('dataset options')
aa('--db', default='deep1M', help='dataset')
aa('--measure', default="1-recall",
help="perf measure to use: 1-recall or inter")
aa('--download', default=False, action="store_true")
aa('--lib', default='faiss', help='library to use (faiss or scann)')
aa('--thenscann', default=False, action="store_true")
aa('--base_dir', default='/checkpoint/matthijs/faiss_improvements/cmp_ivf_scan_2')
group = parser.add_argument_group('searching')
aa('--k', default=10, type=int, help='nb of nearest neighbors')
aa('--pre_reorder_k', default="0,10,100,1000", help='values for reorder_k')
aa('--nprobe', default="1,2,5,10,20,50,100,200", help='values for nprobe')
aa('--nrun', default=5, type=int, help='nb of runs to perform')
args = parser.parse_args()
print("args:", args)
pre_reorder_k_tab = [int(x) for x in args.pre_reorder_k.split(',')]
nprobe_tab = [int(x) for x in args.nprobe.split(',')]
os.system('echo -n "nb processors "; '
'cat /proc/cpuinfo | grep ^processor | wc -l; '
'cat /proc/cpuinfo | grep ^"model name" | tail -1')
cache_dir = args.base_dir + "/" + args.db + "/"
k = args.k
nrun = args.nrun
if not os.path.exists(cache_dir + "xb.npy"):
# prepare cache
from datasets import load_dataset
ds = load_dataset(args.db, download=args.download)
print(ds)
# store for SCANN
os.system(f"rm -rf {cache_dir}; mkdir -p {cache_dir}")
tosave = dict(
xb = ds.get_database(),
xq = ds.get_queries(),
gt = ds.get_groundtruth()
)
for name, v in tosave.items():
fname = cache_dir + "/" + name + ".npy"
print("save", fname)
np.save(fname, v)
open(cache_dir + "metric", "w").write(ds.metric)
dataset = {}
for kn in "xb xq gt".split():
fname = cache_dir + "/" + kn + ".npy"
print("load", fname)
dataset[kn] = np.load(fname)
xb = dataset["xb"]
xq = dataset["xq"]
gt = dataset["gt"]
distance_measure = open(cache_dir + "metric").read()
if args.lib == "faiss":
import faiss
name1_to_metric = {
"IP": faiss.METRIC_INNER_PRODUCT,
"L2": faiss.METRIC_L2
}
index_fname = cache_dir + "index.faiss"
if not os.path.exists(index_fname):
index = faiss_make_index(
xb, name1_to_metric[distance_measure], index_fname)
else:
index = faiss.read_index(index_fname)
faiss_eval_search(
index, xq, xb, nprobe_tab, pre_reorder_k_tab, k, gt,
nrun, args.measure
)
if args.lib == "scann":
from scann.scann_ops.py import scann_ops_pybind
name1_to_name2 = {
"IP": "dot_product",
"L2": "squared_l2"
}
scann_dir = cache_dir + "/scann1.1.1_serialized"
if os.path.exists(scann_dir + "/scann_config.pb"):
searcher = scann_ops_pybind.load_searcher(scann_dir)
else:
searcher = scann_make_index(xb, name1_to_name2[distance_measure], scann_dir, 0)
scann_dir = cache_dir + "/scann1.1.1_serialized_reorder"
if os.path.exists(scann_dir + "/scann_config.pb"):
searcher_reo = scann_ops_pybind.load_searcher(scann_dir)
else:
searcher_reo = scann_make_index(xb, name1_to_name2[distance_measure], scann_dir, 100)
scann_eval_search(
searcher, searcher_reo,
xq, xb, nprobe_tab, pre_reorder_k_tab, k, gt,
nrun, args.measure
)
if args.lib != "scann" and args.thenscann:
# just append --lib scann, that will override the previous cmdline
# options
cmdline = " ".join(sys.argv) + " --lib scann"
cmdline = (
". ~/anaconda3/etc/profile.d/conda.sh ; " +
"conda activate scann_1.1.1; "
"python -u " + cmdline)
print("running", cmdline)
os.system(cmdline)
###############################################################
# SCANN
###############################################################
def scann_make_index(xb, distance_measure, scann_dir, reorder_k):
import scann
print("build index")
if distance_measure == "dot_product":
thr = 0.2
else:
thr = 0
k = 10
sb = scann.scann_ops_pybind.builder(xb, k, distance_measure)
sb = sb.tree(num_leaves=2000, num_leaves_to_search=100, training_sample_size=250000)
sb = sb.score_ah(2, anisotropic_quantization_threshold=thr)
if reorder_k > 0:
sb = sb.reorder(reorder_k)
searcher = sb.build()
print("done")
print("write index to", scann_dir)
os.system(f"rm -rf {scann_dir}; mkdir -p {scann_dir}")
# os.mkdir(scann_dir)
searcher.serialize(scann_dir)
return searcher
def scann_eval_search(
searcher, searcher_reo,
xq, xb, nprobe_tab, pre_reorder_k_tab, k, gt,
nrun, measure):
# warmup
for _run in range(5):
searcher.search_batched(xq)
for nprobe in nprobe_tab:
for pre_reorder_k in pre_reorder_k_tab:
times = []
for _run in range(nrun):
if pre_reorder_k == 0:
t0 = time.time()
I, D = searcher.search_batched(
xq, leaves_to_search=nprobe, final_num_neighbors=k
)
t1 = time.time()
else:
t0 = time.time()
I, D = searcher_reo.search_batched(
xq, leaves_to_search=nprobe, final_num_neighbors=k,
pre_reorder_num_neighbors=pre_reorder_k
)
t1 = time.time()
times.append(t1 - t0)
header = "SCANN nprobe=%4d reo=%4d" % (nprobe, pre_reorder_k)
if measure == "1-recall":
eval_recalls(header, I, gt, times)
else:
eval_inters(header, I, gt, times)
###############################################################
# Faiss
###############################################################
def faiss_make_index(xb, metric_type, fname):
import faiss
d = xb.shape[1]
M = d // 2
index = faiss.index_factory(d, f"IVF2000,PQ{M}x4fs", metric_type)
# if not by_residual:
# print("setting no residual")
# index.by_residual = False
print("train")
index.train(xb[:250000])
print("add")
index.add(xb)
print("write index", fname)
faiss.write_index(index, fname)
return index
def faiss_eval_search(
index, xq, xb, nprobe_tab, pre_reorder_k_tab,
k, gt, nrun, measure
):
import faiss
print("use precomputed table=", index.use_precomputed_table,
"by residual=", index.by_residual)
print("adding a refine index")
index_refine = faiss.IndexRefineFlat(index, faiss.swig_ptr(xb))
print("set single thread")
faiss.omp_set_num_threads(1)
print("warmup")
for _run in range(5):
index.search(xq, k)
print("run timing")
for nprobe in nprobe_tab:
for pre_reorder_k in pre_reorder_k_tab:
index.nprobe = nprobe
times = []
for _run in range(nrun):
if pre_reorder_k == 0:
t0 = time.time()
D, I = index.search(xq, k)
t1 = time.time()
else:
index_refine.k_factor = pre_reorder_k / k
t0 = time.time()
D, I = index_refine.search(xq, k)
t1 = time.time()
times.append(t1 - t0)
header = "Faiss nprobe=%4d reo=%4d" % (nprobe, pre_reorder_k)
if measure == "1-recall":
eval_recalls(header, I, gt, times)
else:
eval_inters(header, I, gt, times)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,136 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
"""
Common functions to load datasets and compute their ground-truth
"""
import time
import numpy as np
import faiss
from faiss.contrib import datasets as faiss_datasets
print("path:", faiss_datasets.__file__)
faiss_datasets.dataset_basedir = '/checkpoint/matthijs/simsearch/'
def sanitize(x):
return np.ascontiguousarray(x, dtype='float32')
#################################################################
# Dataset
#################################################################
class DatasetCentroids(faiss_datasets.Dataset):
def __init__(self, ds, indexfile):
self.d = ds.d
self.metric = ds.metric
self.nq = ds.nq
self.xq = ds.get_queries()
# get the xb set
src_index = faiss.read_index(indexfile)
src_quant = faiss.downcast_index(src_index.quantizer)
centroids = faiss.vector_to_array(src_quant.xb)
self.xb = centroids.reshape(-1, self.d)
self.nb = self.nt = len(self.xb)
def get_queries(self):
return self.xq
def get_database(self):
return self.xb
def get_train(self, maxtrain=None):
return self.xb
def get_groundtruth(self, k=100):
return faiss.knn(
self.xq, self.xb, k,
faiss.METRIC_L2 if self.metric == 'L2' else faiss.METRIC_INNER_PRODUCT
)[1]
def load_dataset(dataset='deep1M', compute_gt=False, download=False):
print("load data", dataset)
if dataset == 'sift1M':
return faiss_datasets.DatasetSIFT1M()
elif dataset.startswith('bigann'):
dbsize = 1000 if dataset == "bigann1B" else int(dataset[6:-1])
return faiss_datasets.DatasetBigANN(nb_M=dbsize)
elif dataset.startswith("deep_centroids_"):
ncent = int(dataset[len("deep_centroids_"):])
centdir = "/checkpoint/matthijs/bench_all_ivf/precomputed_clusters"
return DatasetCentroids(
faiss_datasets.DatasetDeep1B(nb=1000000),
f"{centdir}/clustering.dbdeep1M.IVF{ncent}.faissindex"
)
elif dataset.startswith("deep"):
szsuf = dataset[4:]
if szsuf[-1] == 'M':
dbsize = 10 ** 6 * int(szsuf[:-1])
elif szsuf == '1B':
dbsize = 10 ** 9
elif szsuf[-1] == 'k':
dbsize = 1000 * int(szsuf[:-1])
else:
assert False, "did not recognize suffix " + szsuf
return faiss_datasets.DatasetDeep1B(nb=dbsize)
elif dataset == "music-100":
return faiss_datasets.DatasetMusic100()
elif dataset == "glove":
return faiss_datasets.DatasetGlove(download=download)
else:
assert False
#################################################################
# Evaluation
#################################################################
def evaluate_DI(D, I, gt):
nq = gt.shape[0]
k = I.shape[1]
rank = 1
while rank <= k:
recall = (I[:, :rank] == gt[:, :1]).sum() / float(nq)
print("R@%d: %.4f" % (rank, recall), end=' ')
rank *= 10
def evaluate(xq, gt, index, k=100, endl=True):
t0 = time.time()
D, I = index.search(xq, k)
t1 = time.time()
nq = xq.shape[0]
print("\t %8.4f ms per query, " % (
(t1 - t0) * 1000.0 / nq), end=' ')
rank = 1
while rank <= k:
recall = (I[:, :rank] == gt[:, :1]).sum() / float(nq)
print("R@%d: %.4f" % (rank, recall), end=' ')
rank *= 10
if endl:
print()
return D, I

View File

@@ -0,0 +1,27 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import logging
# https://stackoverflow.com/questions/7016056/python-logging-not-outputting-anything
logging.basicConfig()
logger = logging.getLogger('faiss.contrib.exhaustive_search')
logger.setLevel(logging.INFO)
from faiss.contrib import datasets
from faiss.contrib.exhaustive_search import knn_ground_truth
from faiss.contrib import vecs_io
ds = datasets.DatasetDeep1B(nb=int(1e9))
print("computing GT matches for", ds)
D, I = knn_ground_truth(
ds.get_queries(),
ds.database_iterator(bs=65536),
k=100
)
vecs_io.ivecs_write("/tmp/tt.ivecs", I)

View File

@@ -0,0 +1,502 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import os
import numpy as np
from collections import defaultdict
from matplotlib import pyplot
import re
from argparse import Namespace
from faiss.contrib.factory_tools import get_code_size as unitsize
def dbsize_from_name(dbname):
sufs = {
'1B': 10**9,
'100M': 10**8,
'10M': 10**7,
'1M': 10**6,
}
for s in sufs:
if dbname.endswith(s):
return sufs[s]
else:
assert False
def keep_latest_stdout(fnames):
fnames = [fname for fname in fnames if fname.endswith('.stdout')]
fnames.sort()
n = len(fnames)
fnames2 = []
for i, fname in enumerate(fnames):
if i + 1 < n and fnames[i + 1][:-8] == fname[:-8]:
continue
fnames2.append(fname)
return fnames2
def parse_result_file(fname):
# print fname
st = 0
res = []
keys = []
stats = {}
stats['run_version'] = fname[-8]
indexkey = None
for l in open(fname):
if l.startswith("srun:"):
# looks like a crash...
if indexkey is None:
raise RuntimeError("instant crash")
break
elif st == 0:
if l.startswith("dataset in dimension"):
fi = l.split()
stats["d"] = int(fi[3][:-1])
stats["nq"] = int(fi[9])
stats["nb"] = int(fi[11])
stats["nt"] = int(fi[13])
if l.startswith('index size on disk:'):
stats['index_size'] = int(l.split()[-1])
if l.startswith('current RSS:'):
stats['RSS'] = int(l.split()[-1])
if l.startswith('precomputed tables size:'):
stats['tables_size'] = int(l.split()[-1])
if l.startswith('Setting nb of threads to'):
stats['n_threads'] = int(l.split()[-1])
if l.startswith(' add in'):
stats['add_time'] = float(l.split()[-2])
if l.startswith("vector code_size"):
stats['code_size'] = float(l.split()[-1])
if l.startswith('args:'):
args = eval(l[l.find(' '):])
indexkey = args.indexkey
elif "time(ms/q)" in l:
# result header
if 'R@1 R@10 R@100' in l:
stats["measure"] = "recall"
stats["ranks"] = [1, 10, 100]
elif 'I@1 I@10 I@100' in l:
stats["measure"] = "inter"
stats["ranks"] = [1, 10, 100]
elif 'inter@' in l:
stats["measure"] = "inter"
fi = l.split()
if fi[1] == "inter@":
rank = int(fi[2])
else:
rank = int(fi[1][len("inter@"):])
stats["ranks"] = [rank]
else:
assert False
st = 1
elif 'index size on disk:' in l:
stats["index_size"] = int(l.split()[-1])
elif st == 1:
st = 2
elif st == 2:
fi = l.split()
if l[0] == " ":
# means there are 0 parameters
fi = [""] + fi
keys.append(fi[0])
res.append([float(x) for x in fi[1:]])
return indexkey, np.array(res), keys, stats
# the directory used in run_on_cluster.bash
basedir = "/checkpoint/matthijs/bench_all_ivf/"
logdir = basedir + 'logs/'
def collect_results_for(db='deep1M', prefix="autotune."):
# run parsing
allres = {}
allstats = {}
missing = []
fnames = keep_latest_stdout(os.listdir(logdir))
# print fnames
# filenames are in the form <key>.x.stdout
# where x is a version number (from a to z)
# keep only latest version of each name
for fname in fnames:
if not (
'db' + db in fname and
fname.startswith(prefix) and
fname.endswith('.stdout')
):
continue
print("parse", fname, end=" ", flush=True)
try:
indexkey, res, _, stats = parse_result_file(logdir + fname)
except RuntimeError as e:
print("FAIL %s" % e)
res = np.zeros((2, 0))
except Exception as e:
print("PARSE ERROR " + e)
res = np.zeros((2, 0))
else:
print(len(res), "results")
if res.size == 0:
missing.append(fname)
else:
if indexkey in allres:
if allstats[indexkey]['run_version'] > stats['run_version']:
# don't use this run
continue
allres[indexkey] = res
allstats[indexkey] = stats
return allres, allstats
def extract_pareto_optimal(allres, keys, recall_idx=0, times_idx=3):
bigtab = []
for i, k in enumerate(keys):
v = allres[k]
perf = v[:, recall_idx]
times = v[:, times_idx]
bigtab.append(
np.vstack((
np.ones(times.size) * i,
perf, times
))
)
if bigtab == []:
return [], np.zeros((3, 0))
bigtab = np.hstack(bigtab)
# sort by perf
perm = np.argsort(bigtab[1, :])
bigtab_sorted = bigtab[:, perm]
best_times = np.minimum.accumulate(bigtab_sorted[2, ::-1])[::-1]
selection, = np.where(bigtab_sorted[2, :] == best_times)
selected_keys = [
keys[i] for i in
np.unique(bigtab_sorted[0, selection].astype(int))
]
ops = bigtab_sorted[:, selection]
return selected_keys, ops
def plot_subset(
allres, allstats, selected_methods, recall_idx, times_idx=3,
report=["overhead", "build time"]):
# important methods
for k in selected_methods:
v = allres[k]
stats = allstats[k]
d = stats["d"]
dbsize = stats["nb"]
if "index_size" in stats and "tables_size" in stats:
tot_size = stats['index_size'] + stats['tables_size']
else:
tot_size = -1
id_size = 8 # 64 bit
addt = ''
if 'add_time' in stats:
add_time = stats['add_time']
if add_time > 7200:
add_min = add_time / 60
addt = ', %dh%02d' % (add_min / 60, add_min % 60)
else:
add_sec = int(add_time)
addt = ', %dm%02d' % (add_sec / 60, add_sec % 60)
code_size = unitsize(d, k)
label = k
if "code_size" in report:
label += " %d bytes" % code_size
tight_size = (code_size + id_size) * dbsize
if tot_size < 0 or "overhead" not in report:
pass # don't know what the index size is
elif tot_size > 10 * tight_size:
label += " overhead x%.1f" % (tot_size / tight_size)
else:
label += " overhead+%.1f%%" % (
tot_size / tight_size * 100 - 100)
if "build time" in report:
label += " " + addt
linestyle = (':' if 'Refine' in k or 'RFlat' in k else
'-.' if 'SQ' in k else
'-' if '4fs' in k else
'-')
print(k, linestyle)
pyplot.semilogy(v[:, recall_idx], 1000 / v[:, times_idx], label=label,
linestyle=linestyle,
marker='o' if '4fs' in k else '+')
recall_rank = stats["ranks"][recall_idx]
if stats["measure"] == "recall":
pyplot.xlabel('1-recall at %d' % recall_rank)
elif stats["measure"] == "inter":
pyplot.xlabel('inter @ %d' % recall_rank)
else:
assert False
pyplot.ylabel('QPS (%d threads)' % stats["n_threads"])
def plot_tradeoffs(db, allres, allstats, code_size, recall_rank):
stat0 = next(iter(allstats.values()))
d = stat0["d"]
n_threads = stat0["n_threads"]
recall_idx = stat0["ranks"].index(recall_rank)
# times come after the perf measure
times_idx = len(stat0["ranks"])
if type(code_size) == int:
if code_size == 0:
code_size = [0, 1e50]
code_size_name = "any code size"
else:
code_size_name = "code_size=%d" % code_size
code_size = [code_size, code_size]
elif type(code_size) == tuple:
code_size_name = "code_size in [%d, %d]" % code_size
else:
assert False
names_maxperf = []
for k in sorted(allres):
v = allres[k]
if v.ndim != 2: continue
us = unitsize(d, k)
if not code_size[0] <= us <= code_size[1]: continue
names_maxperf.append((v[-1, recall_idx], k))
# sort from lowest to highest topline accuracy
names_maxperf.sort()
names = [name for mp, name in names_maxperf]
selected_methods, optimal_points = \
extract_pareto_optimal(allres, names, recall_idx, times_idx)
not_selected = list(set(names) - set(selected_methods))
print("methods without an optimal OP: ", not_selected)
pyplot.title('database ' + db + ' ' + code_size_name)
# grayed out lines
for k in not_selected:
v = allres[k]
if v.ndim != 2: continue
us = unitsize(d, k)
if not code_size[0] <= us <= code_size[1]: continue
linestyle = (':' if 'PQ' in k else
'-.' if 'SQ4' in k else
'--' if 'SQ8' in k else '-')
pyplot.semilogy(v[:, recall_idx], 1000 / v[:, times_idx], label=None,
linestyle=linestyle,
marker='o' if 'HNSW' in k else '+',
color='#cccccc', linewidth=0.2)
plot_subset(allres, allstats, selected_methods, recall_idx, times_idx)
if len(not_selected) == 0:
om = ''
else:
om = '\nomitted:'
nc = len(om)
for m in not_selected:
if nc > 80:
om += '\n'
nc = 0
om += ' ' + m
nc += len(m) + 1
# pyplot.semilogy(optimal_points[1, :], optimal_points[2, :], marker="s")
# print(optimal_points[0, :])
pyplot.xlabel('1-recall at %d %s' % (recall_rank, om) )
pyplot.ylabel('QPS (%d threads)' % n_threads)
pyplot.legend()
pyplot.grid()
return selected_methods, not_selected
if __name__ == "__main__xx":
# tests on centroids indexing (v1)
for k in 1, 32, 128:
pyplot.gcf().set_size_inches(15, 10)
i = 1
for ncent in 65536, 262144, 1048576, 4194304:
db = f'deep_centroids_{ncent}.k{k}.'
allres, allstats = collect_results_for(
db=db, prefix="cent_index.")
pyplot.subplot(2, 2, i)
plot_subset(
allres, allstats, list(allres.keys()),
recall_idx=0,
times_idx=1,
report=["code_size"]
)
i += 1
pyplot.title(f"{ncent} centroids")
pyplot.legend()
pyplot.xlim([0.95, 1])
pyplot.grid()
pyplot.savefig('figs/deep1B_centroids_k%d.png' % k)
if __name__ == "__main__xx":
# centroids plot per k
pyplot.gcf().set_size_inches(15, 10)
i=1
for ncent in 65536, 262144, 1048576, 4194304:
xyd = defaultdict(list)
for k in 1, 4, 8, 16, 32, 64, 128, 256:
db = f'deep_centroids_{ncent}.k{k}.'
allres, allstats = collect_results_for(db=db, prefix="cent_index.")
for indexkey, res in allres.items():
idx, = np.where(res[:, 0] >= 0.99)
if idx.size > 0:
xyd[indexkey].append((k, 1000 / res[idx[0], 1]))
pyplot.subplot(2, 2, i)
i += 1
for indexkey, xy in xyd.items():
xy = np.array(xy)
pyplot.loglog(xy[:, 0], xy[:, 1], 'o-', label=indexkey)
pyplot.title(f"{ncent} centroids")
pyplot.xlabel("k")
xt = 2**np.arange(9)
pyplot.xticks(xt, ["%d" % x for x in xt])
pyplot.ylabel("QPS (32 threads)")
pyplot.legend()
pyplot.grid()
pyplot.savefig('../plots/deep1B_centroids_min99.png')
if __name__ == "__main__xx":
# main indexing plots
i = 0
for db in 'bigann10M', 'deep10M', 'bigann100M', 'deep100M', 'deep1B', 'bigann1B':
allres, allstats = collect_results_for(
db=db, prefix="autotune.")
for cs in 8, 16, 32, 64:
pyplot.figure(i)
i += 1
pyplot.gcf().set_size_inches(15, 10)
cs_range = (
(0, 8) if cs == 8 else (cs // 2 + 1, cs)
)
plot_tradeoffs(
db, allres, allstats, code_size=cs_range, recall_rank=1)
pyplot.savefig('../plots/tradeoffs_%s_cs%d_r1.png' % (
db, cs))
if __name__ == "__main__":
# 1M indexes
i = 0
for db in "glove", "music-100":
pyplot.figure(i)
pyplot.gcf().set_size_inches(15, 10)
i += 1
allres, allstats = collect_results_for(db=db, prefix="autotune.")
plot_tradeoffs(db, allres, allstats, code_size=0, recall_rank=1)
pyplot.savefig('../plots/1M_tradeoffs_' + db + ".png")
for db in "sift1M", "deep1M":
allres, allstats = collect_results_for(db=db, prefix="autotune.")
pyplot.figure(i)
pyplot.gcf().set_size_inches(15, 10)
i += 1
plot_tradeoffs(db, allres, allstats, code_size=(0, 64), recall_rank=1)
pyplot.savefig('../plots/1M_tradeoffs_' + db + "_small.png")
pyplot.figure(i)
pyplot.gcf().set_size_inches(15, 10)
i += 1
plot_tradeoffs(db, allres, allstats, code_size=(65, 10000), recall_rank=1)
pyplot.savefig('../plots/1M_tradeoffs_' + db + "_large.png")
if __name__ == "__main__xx":
db = 'sift1M'
allres, allstats = collect_results_for(db=db, prefix="autotune.")
pyplot.gcf().set_size_inches(15, 10)
keys = [
"IVF1024,PQ32x8",
"IVF1024,PQ64x4",
"IVF1024,PQ64x4fs",
"IVF1024,PQ64x4fsr",
"IVF1024,SQ4",
"IVF1024,SQ8"
]
plot_subset(allres, allstats, keys, recall_idx=0, report=["code_size"])
pyplot.legend()
pyplot.title(db)
pyplot.xlabel("1-recall@1")
pyplot.ylabel("QPS (32 threads)")
pyplot.grid()
pyplot.savefig('../plots/ivf1024_variants.png')
pyplot.figure(2)
pyplot.gcf().set_size_inches(15, 10)
keys = [
"HNSW32",
"IVF1024,PQ64x4fs",
"IVF1024,PQ64x4fsr",
"IVF1024,PQ64x4fs,RFlat",
"IVF1024,PQ64x4fs,Refine(SQfp16)",
"IVF1024,PQ64x4fs,Refine(SQ8)",
]
plot_subset(allres, allstats, keys, recall_idx=0, report=["code_size"])
pyplot.legend()
pyplot.title(db)
pyplot.xlabel("1-recall@1")
pyplot.ylabel("QPS (32 threads)")
pyplot.grid()
pyplot.savefig('../plots/ivf1024_rerank.png')

View File

@@ -0,0 +1,603 @@
set -e
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
# @nolint
# This script launches the experiments on a cluster
# It assumes two shell functions are defined:
#
# run_on_1machine: runs a command on one (full) machine on a cluster
#
# run_on_8gpu: runs a command on one machine with 8 GPUs
#
# the two functions are called as:
#
# run_on_1machine <name> <command>
#
# the stdout of the command should be stored in $logdir/<name>.stdout
function run_on ()
{
sys="$1"
shift
name="$1"
shift
script="$logdir/$name.sh"
if [ -e "$script" ]; then
echo script "$script" exists
return
fi
# srun handles special characters fine, but the shell interpreter
# does not
escaped_cmd=$( printf "%q " "$@" )
cat > $script <<EOF
#! /bin/bash
srun $escaped_cmd
EOF
echo -n "$logdir/$name.stdout "
sbatch -n1 -J "$name" \
$sys \
--comment='priority is the only one that works' \
--output="$logdir/$name.stdout" \
"$script"
}
function run_on_1machine {
run_on "--cpus-per-task=80 --gres=gpu:0 --mem=500G --time=70:00:00 --partition=priority" "$@"
}
function run_on_1machine_1h {
run_on "--cpus-per-task=80 --gres=gpu:2 --mem=100G --time=1:00:00 --partition=priority" "$@"
}
function run_on_1machine_3h {
run_on "--cpus-per-task=80 --gres=gpu:2 --mem=100G --time=3:00:00 --partition=priority" "$@"
}
function run_on_4gpu_3h {
run_on "--cpus-per-task=40 --gres=gpu:4 --mem=100G --time=3:00:00 --partition=priority" "$@"
}
function run_on_8gpu () {
run_on "--cpus-per-task=80 --gres=gpu:8 --mem=100G --time=70:00:00 --partition=priority" "$@"
}
# prepare output directories
# set to some directory where all indexes, can be written.
basedir=/checkpoint/matthijs/bench_all_ivf
logdir=$basedir/logs
indexdir=$basedir/indexes
centdir=$basedir/precomputed_clusters
mkdir -p $logdir $indexdir
# adds an option to use a pretrained quantizer
function add_precomputed_quantizer () {
local db="$1"
local coarse="$2"
case $db in
bigann*) rname=bigann ;;
deep*) rname=deep ;;
sift1M) return;;
music-100) return ;;
glove) return ;;
*) echo "bad db"; exit 1;;
esac
case $coarse in
IVF65536*)
cname=clustering.db${rname}1M.IVF65536.faissindex
copt="--get_centroids_from $centdir/$cname"
;;
IVF262144*)
cname=clustering.db${rname}1M.IVF262144.faissindex
copt="--get_centroids_from $centdir/$cname"
;;
IVF1048576*)
cname=clustering.db${rname}1M.IVF1048576.faissindex
copt="--get_centroids_from $centdir/$cname"
;;
IVF4194304*)
cname=clustering.db${rname}1M.IVF4194304.faissindex
copt="--get_centroids_from $centdir/$cname"
;;
*)
copt="" ;;
esac
echo $copt
}
function get_db_dim () {
local db="$1"
case $db in
sift1M) dim=128;;
bigann*) dim=128;;
deep*) dim=96;;
music-100) dim=100;;
glove) dim=100;;
*) echo "bad db"; exit 1;;
esac
echo $dim
}
# replace the PQHD placeholder (HD = "half dim") with the PQ size matching each
# coarse quantizer variant; note that bash variables are global by default, so
# the coarse* variables set here are visible to the caller
function replace_coarse_PQHD () {
local coarse="$1"
local dim=$2
coarseD=${coarse//PQHD/PQ$((dim/2))}
coarse16=${coarse//PQHD/PQ8}
coarse32=${coarse//PQHD/PQ16}
coarse64=${coarse//PQHD/PQ32}
coarse128=${coarse//PQHD/PQ64}
coarse256=${coarse//PQHD/PQ128}
coarse112=${coarse//PQHD/PQ56}
}
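# Worked example (illustrative): with coarse="IVF262144(IVF512,PQHDx4fs,RFlat)"
# and dim=96, replace_coarse_PQHD sets, among others,
#   coarseD  -> IVF262144(IVF512,PQ48x4fs,RFlat)
#   coarse64 -> IVF262144(IVF512,PQ32x4fs,RFlat)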
if false; then
###############################################
# comparison with SCANN
for db in sift1M deep1M glove music-100
do
opt=""
if [ $db == glove ]; then
opt="--measure inter"
fi
run_on_1machine_1h cmp_with_scann.$db.c \
python -u cmp_with_scann.py --db $db \
--lib faiss $opt --thenscann
done
############################### Preliminary SIFT1M experiment
for db in sift1M ; do
for coarse in IVF1024
do
indexkeys="
HNSW32
$coarse,SQfp16
$coarse,SQ4
$coarse,SQ8
$coarse,PQ32x8
$coarse,PQ64x4
$coarse,PQ64x4fs
$coarse,PQ64x4fs,RFlat
$coarse,PQ64x4fs,Refine(SQfp16)
$coarse,PQ64x4fs,Refine(SQ8)
OPQ64,$coarse,PQ64x4fs
OPQ64,$coarse,PQ64x4fs,RFlat
"
indexkeys="
$coarse,PQ64x4fsr
$coarse,PQ64x4fsr,RFlat
"
# OPQ actually degrades the results on SIFT1M, so let's ignore it
for indexkey in $indexkeys
do
# escape nasty characters
key="autotune.db$db.${indexkey//,/_}"
key="${key//(/_}"
key="${key//)/_}"
run_on_1machine_1h $key.a \
python -u bench_all_ivf.py \
--db $db \
--indexkey "$indexkey" \
--maxtrain 0 \
--indexfile $indexdir/$key.faissindex \
--searchthreads 32
done
done
done
############################### 1M experiments
fi
# for db in sift1M deep1M music-100 glove; do
for db in glove music-100; do
dim=$( get_db_dim $db )
for coarse in IVF1024 IVF4096_HNSW32
do
replace_coarse_PQHD "$coarse" $dim
indexkeys="
$coarseD,PQ$((dim/2))x4fs
$coarseD,PQ$((dim/2))x4fsr
OPQ8_64,$coarse64,PQ8
PCAR16,$coarse16,SQ4
OPQ16_64,$coarse64,PQ16x4fs
OPQ16_64,$coarse64,PQ16x4fsr
OPQ16_64,$coarse64,PQ16
PCAR16,$coarse16,SQ8
PCAR32,$coarse32,SQ4
OPQ32_64,$coarse64,PQ32x4fs
OPQ32_64,$coarse64,PQ32x4fsr
OPQ32_128,$coarse128,PQ32
PCAR32,$coarse32,SQ8
PCAR64,$coarse64,SQ4
PCAR16,$coarse16,SQfp16
OPQ64_128,$coarse128,PQ64x4fs
OPQ64_128,$coarse128,PQ64x4fsr
OPQ64_128,$coarse128,PQ64
PCAR64,$coarse64,SQ8
PCAR32,$coarse32,SQfp16
PCAR128,$coarse128,SQ4
OPQ128_256,$coarse256,PQ128x4fs
OPQ128_256,$coarse256,PQ128x4fsr
OPQ16_64,$coarse64,PQ16x4fs,Refine(OPQ56_112,PQ56)
OPQ16_64,$coarse64,PQ16x4fs,Refine(PCAR72,SQ6)
OPQ32_64,$coarse64,PQ16x4fs,Refine(PCAR64,SQ6)
OPQ32_64,$coarse64,PQ32x4fs,Refine(OPQ48_96,PQ48)
OPQ64_128,$coarse,PQ64x12
OPQ64_128,$coarse,PQ64x4fs,RFlat
OPQ64_128,$coarse,PQ64x4fs,Refine(SQfp16)
OPQ64_128,$coarse,PQ64x4fs,Refine(SQ8)
OPQ64_128,$coarse,PQ64x4fs,Refine(SQ6)
OPQ64_128,$coarse,PQ64x4fs,Refine(SQ4)
OPQ32_64,$coarse,PQ32x4fs,Refine(SQfp16)
OPQ32_64,$coarse,PQ32x4fs,Refine(SQ8)
OPQ32_64,$coarse,PQ32x4fs,Refine(SQ6)
OPQ32_64,$coarse,PQ32x4fs,Refine(SQ4)
"
indexkeys="
$coarseD,PQ$((dim/2))x4fs
$coarseD,PQ$((dim/2))x4fsr
$coarseD,PQ$((dim/2))x4fsr,RFlat
$coarseD,PQ$((dim/2))x4fsr,Refine(SQfp16)
$coarseD,PQ$((dim/2))x4fsr,Refine(SQ8)
$coarseD,PQ$((dim/4))x4fs
$coarseD,PQ$((dim/4))x4fsr
$coarseD,PQ$((dim/4))x4fsr,RFlat
$coarseD,PQ$((dim/4))x4fsr,Refine(SQfp16)
$coarseD,PQ$((dim/4))x4fsr,Refine(SQ8)
$coarseD,PQ$((dim/2))
$coarseD,PQ$((dim/4))
HNSW32,Flat
"
indexkeys="HNSW32,Flat"
for indexkey in $indexkeys
do
key=autotune.db$db.${indexkey//,/_}
key="${key//(/_}"
key="${key//)/_}"
run_on_1machine_3h $key.q \
python -u bench_all_ivf.py \
--db $db \
--indexkey "$indexkey" \
--maxtrain 0 \
--indexfile "$indexdir/$key.faissindex" \
$( add_precomputed_quantizer $db $coarse ) \
--searchthreads 32 \
--min_test_duration 3
done
done
done
if false; then
############################################
# precompute centroids on GPU for large vocabularies
for db in deep1M bigann1M; do
for ncent in 262144 65536 1048576 4194304; do
key=clustering.db$db.IVF$ncent
run_on_4gpu_3h $key.e \
python -u bench_all_ivf.py \
--db $db \
--indexkey IVF$ncent,SQ8 \
--maxtrain 100000000 \
--indexfile $centdir/$key.faissindex \
--searchthreads 32 \
--min_test_duration 3 \
--add_bs 1000000 \
--train_on_gpu
done
done
###############################
## coarse quantizer experiments on the centroids of deep1B
for k in 4 8 16 64 256; do
for ncent in 65536 262144 1048576 4194304; do
db=deep_centroids_$ncent
# compute the square root of ncent: increase ls until 4^ls >= ncent,
# so sncent = 2^ls is sqrt(ncent) rounded up to a power of two
for(( ls=0; ncent > (1 << (2 * ls)); ls++)); do
echo -n
done
sncent=$(( 1 << ls ))
indexkeys="
IVF$((sncent/2)),PQ48x4fs,RFlat
IVF$((sncent*2)),PQ48x4fs,RFlat
HNSW32
PQ48x4fs
PQ48x4fs,RFlat
IVF$sncent,PQ48x4fs,RFlat
"
for indexkey in $indexkeys; do
key="cent_index.db$db.k$k.$indexkey"
run_on_1machine_1h "$key.b" \
python -u bench_all_ivf.py \
--db $db \
--indexkey "$indexkey" \
--maxtrain 0 \
--inter \
--searchthreads 32 \
--k $k
done
done
done
############################### 10M experiments
for db in deep10M bigann10M; do
coarses="
IVF65536(IVF256,PQHDx4fs,RFlat)
IVF16384_HNSW32
IVF65536_HNSW32
IVF262144_HNSW32
IVF262144(IVF512,PQHDx4fs,RFlat)
"
dim=$( get_db_dim $db )
for coarse in $coarses
do
replace_coarse_PQHD "$coarse" $dim
indexkeys="
$coarseD,PQ$((dim/2))x4fs
OPQ8_64,$coarse64,PQ8
PCAR16,$coarse16,SQ4
OPQ16_64,$coarse64,PQ16x4fs
OPQ16_64,$coarse64,PQ16x4fsr
OPQ16_64,$coarse64,PQ16
PCAR16,$coarse16,SQ8
PCAR32,$coarse32,SQ4
OPQ32_64,$coarse64,PQ32x4fs
OPQ32_64,$coarse64,PQ32x4fsr
OPQ32_128,$coarse128,PQ32
PCAR32,$coarse32,SQ8
PCAR64,$coarse64,SQ4
PCAR16,$coarse16,SQfp16
OPQ64_128,$coarse128,PQ64x4fs
OPQ64_128,$coarse128,PQ64x4fsr
OPQ64_128,$coarse128,PQ64
PCAR64,$coarse64,SQ8
PCAR32,$coarse32,SQfp16
PCAR128,$coarse128,SQ4
OPQ128_256,$coarse256,PQ128x4fs
OPQ128_256,$coarse256,PQ128x4fsr
OPQ56_112,$coarse112,PQ7+56
OPQ16_64,$coarse64,PQ16x4fs,Refine(OPQ56_112,PQ56)
OPQ16_64,$coarse64,PQ16x4fs,Refine(PCAR72,SQ6)
OPQ32_64,$coarse64,PQ16x4fs,Refine(PCAR64,SQ6)
OPQ32_64,$coarse64,PQ32x4fs,Refine(OPQ48_96,PQ48)
"
indexkeys="
OPQ16_64,$coarse64,PQ16x4fsr
OPQ32_64,$coarse64,PQ32x4fsr
OPQ64_128,$coarse128,PQ64x4fsr
OPQ128_256,$coarse256,PQ128x4fsr
"
for indexkey in $indexkeys
do
key=autotune.db$db.${indexkey//,/_}
key="${key//(/_}"
key="${key//)/_}"
run_on_1machine_3h $key.l \
python -u bench_all_ivf.py \
--db $db \
--indexkey "$indexkey" \
--maxtrain 0 \
--indexfile "$indexdir/$key.faissindex" \
$( add_precomputed_quantizer $db $coarse ) \
--searchthreads 32 \
--min_test_duration 3 \
--autotune_max nprobe:2000
done
done
done
############################### 100M experiments
for db in deep100M bigann100M; do
coarses="
IVF65536_HNSW32
IVF262144_HNSW32
IVF262144(IVF512,PQHDx4fs,RFlat)
IVF1048576_HNSW32
IVF1048576(IVF1024,PQHDx4fs,RFlat)
"
dim=$( get_db_dim $db )
for coarse in $coarses
do
replace_coarse_PQHD "$coarse" $dim
indexkeys="
OPQ8_64,$coarse64,PQ8
OPQ16_64,$coarse64,PQ16x4fs
PCAR32,$coarse32,SQ4
OPQ16_64,$coarse64,PQ16
OPQ32_64,$coarse64,PQ32x4fs
OPQ32_128,$coarse128,PQ32
PCAR64,$coarse64,SQ4
PCAR32,$coarse32,SQ8
OPQ64_128,$coarse128,PQ64x4fs
PCAR128,$coarse128,SQ4
OPQ64_128,$coarse128,PQ64
PCAR32,$coarse32,SQfp16
PCAR64,$coarse64,SQ8
OPQ128_256,$coarse256,PQ128x4fs
OPQ56_112,$coarse112,PQ7+56
OPQ16_64,$coarse64,PQ16x4fs,Refine(OPQ56_112,PQ56)
$coarseD,PQ$((dim/2))x4fs
"
indexkeys="
OPQ128_256,$coarse256,PQ128x4fsr
OPQ64_128,$coarse128,PQ64x4fsr
OPQ32_64,$coarse64,PQ32x4fsr
OPQ16_64,$coarse64,PQ16x4fsr
OPQ16_64,$coarse64,PQ16x4fsr,Refine(OPQ56_112,PQ56)
"
for indexkey in $indexkeys
do
key=autotune.db$db.${indexkey//,/_}
key="${key//(/_}"
key="${key//)/_}"
run_on_1machine $key.e \
python -u bench_all_ivf.py \
--db $db \
--indexkey "$indexkey" \
--maxtrain 0 \
--indexfile $indexdir/$key.faissindex \
--searchthreads 32 \
--min_test_duration 3 \
$( add_precomputed_quantizer $db $coarse ) \
--add_bs 1000000 \
--autotune_max nprobe:2000
done
done
done
#################################
# 1B-scale experiment
for db in deep1B bigann1B; do
coarses="
IVF1048576_HNSW32
IVF4194304_HNSW32
IVF4194304(IVF1024,PQHDx4fs,RFlat)
"
dim=$( get_db_dim $db )
for coarse in $coarses; do
replace_coarse_PQHD "$coarse" $dim
indexkeys="
OPQ8_64,$coarse64,PQ8
OPQ16_64,$coarse64,PQ16x4fsr
OPQ16_64,$coarse64,PQ16
OPQ32_64,$coarse64,PQ32x4fsr
OPQ32_128,$coarse128,PQ32
OPQ64_128,$coarse128,PQ64x4fsr
OPQ64_128,$coarse128,PQ64
OPQ128_256,$coarse256,PQ128x4fsr
OPQ56_112,$coarse112,PQ7+56
OPQ16_64,$coarse64,PQ16x4fs,Refine(OPQ56_112,PQ56)
$coarseD,PQ$((dim/2))x4fs
"
for indexkey in $indexkeys
do
key=autotune.db$db.${indexkey//,/_}
key="${key//(/_}"
key="${key//)/_}"
run_on_1machine $key.d \
python -u bench_all_ivf.py \
--db $db \
--indexkey "$indexkey" \
--maxtrain 0 \
--indexfile $indexdir/$key.faissindex \
--searchthreads 32 \
--min_test_duration 3 \
$( add_precomputed_quantizer $db $coarse ) \
--add_bs 1000000 \
--autotune_max nprobe:3000
done
done
done
fi

View File

@@ -0,0 +1,109 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import argparse
import time
import faiss
import numpy as np
from faiss.contrib.datasets import SyntheticDataset
from faiss.contrib.big_batch_search import big_batch_search
parser = argparse.ArgumentParser()
def aa(*args, **kwargs):
group.add_argument(*args, **kwargs)
group = parser.add_argument_group('dataset options')
aa('--dim', type=int, default=64)
aa('--size', default="S")
group = parser.add_argument_group('index options')
aa('--nlist', type=int, default=100)
aa('--factory_string', default="", help="overrides nlist")
aa('--k', type=int, default=10)
aa('--nprobe', type=int, default=5)
aa('--nt', type=int, default=-1, help="nb search threads")
aa('--method', default="pairwise_distances", help="")
args = parser.parse_args()
print("args:", args)
if args.size == "S":
ds = SyntheticDataset(32, 2000, 4000, 1000)
elif args.size == "M":
ds = SyntheticDataset(32, 20000, 40000, 10000)
elif args.size == "L":
ds = SyntheticDataset(32, 200000, 400000, 100000)
else:
raise RuntimeError(f"dataset size {args.size} not supported")
nlist = args.nlist
nprobe = args.nprobe
k = args.k
def tic(name):
global tictoc
tictoc = (name, time.time())
print(name, end="\r", flush=True)
def toc():
global tictoc
name, t0 = tictoc
dt = time.time() - t0
print(f"{name}: {dt:.3f} s")
return dt
print(f"dataset {ds}, {nlist=:} {nprobe=:} {k=:}")
if args.factory_string == "":
factory_string = f"IVF{nlist},Flat"
else:
factory_string = args.factory_string
print(f"instantiate {factory_string}")
index = faiss.index_factory(ds.d, factory_string)
if args.factory_string != "":
nlist = index.nlist
print("nlist", nlist)
tic("train")
index.train(ds.get_train())
toc()
tic("add")
index.add(ds.get_database())
toc()
if args.nt != -1:
print("setting nb of threads to", args.nt)
faiss.omp_set_num_threads(args.nt)
tic("reference search")
index.nprobe = nprobe
Dref, Iref = index.search(ds.get_queries(), k)
t_ref = toc()
tic("block search")
Dnew, Inew = big_batch_search(
index, ds.get_queries(),
k, method=args.method, verbose=10
)
t_tot = toc()
assert (Inew != Iref).sum() / Iref.size < 1e-4
np.testing.assert_almost_equal(Dnew, Dref, decimal=4)
print(f"total block search time {t_tot:.3f} s, speedup {t_ref / t_tot:.3f}x")

View File

File diff suppressed because it is too large

View File

@@ -0,0 +1,154 @@
#! /usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from __future__ import print_function
import numpy as np
import faiss
import time
import os
import argparse
parser = argparse.ArgumentParser()
def aa(*args, **kwargs):
group.add_argument(*args, **kwargs)
group = parser.add_argument_group('dataset options')
aa('--dim', type=int, default=64)
aa('--nb', type=int, default=int(1e6))
aa('--subset_len', type=int, default=int(1e5))
aa('--key', default='IVF1000,Flat')
aa('--nprobe', type=int, default=640)
aa('--no_intcallback', default=False, action='store_true')
aa('--twostage', default=False, action='store_true')
aa('--nt', type=int, default=-1)
args = parser.parse_args()
print("args:", args)
d = args.dim # dimension
nb = args.nb # database size
nq = 1000 # nb of queries
nt = 100000
subset_len = args.subset_len
np.random.seed(1234) # make reproducible
xb = np.random.random((nb, d)).astype('float32')
xq = np.random.random((nq, d)).astype('float32')
xt = np.random.random((nt, d)).astype('float32')
k = 100
if args.no_intcallback:
faiss.InterruptCallback.clear_instance()
if args.nt != -1:
faiss.omp_set_num_threads(args.nt)
nprobe = args.nprobe
key = args.key
#key = 'IVF1000,Flat'
# key = 'IVF1000,PQ64'
# key = 'IVF100_HNSW32,PQ64'
# faiss.omp_set_num_threads(1)
pf = 'dim%d_' % d
if d == 64:
pf = ''
basename = '/tmp/base%s%s.index' % (pf, key)
if os.path.exists(basename):
print('load', basename)
index_1 = faiss.read_index(basename)
else:
print('train + write', basename)
index_1 = faiss.index_factory(d, key)
index_1.train(xt)
faiss.write_index(index_1, basename)
print('add')
index_1.add(xb)
print('set nprobe=', nprobe)
faiss.ParameterSpace().set_index_parameter(index_1, 'nprobe', nprobe)
class ResultHeap:
""" Combine query results from a sliced dataset """
def __init__(self, nq, k):
" nq: number of query vectors, k: number of results per query "
self.I = np.zeros((nq, k), dtype='int64')
self.D = np.zeros((nq, k), dtype='float32')
self.nq, self.k = nq, k
heaps = faiss.float_maxheap_array_t()
heaps.k = k
heaps.nh = nq
heaps.val = faiss.swig_ptr(self.D)
heaps.ids = faiss.swig_ptr(self.I)
heaps.heapify()
self.heaps = heaps
def add_batch_result(self, D, I, i0):
assert D.shape == (self.nq, self.k)
assert I.shape == (self.nq, self.k)
I += i0
self.heaps.addn_with_ids(
self.k, faiss.swig_ptr(D),
faiss.swig_ptr(I), self.k)
def finalize(self):
self.heaps.reorder()
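# Minimal usage sketch for ResultHeap (illustrative; D0/I0 and D1/I1 are
# hypothetical search results from two database slices, the first of size n0):
#   rh = ResultHeap(nq, k)
#   rh.add_batch_result(D0, I0, 0)
#   rh.add_batch_result(D1, I1, n0)
#   rh.finalize()  # rh.D, rh.I now hold the merged global top-k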
stats = faiss.cvar.indexIVF_stats
stats.reset()
print('index size', index_1.ntotal,
'imbalance', index_1.invlists.imbalance_factor())
start = time.time()
Dref, Iref = index_1.search(xq, k)
print('time of searching: %.3f s (quantization %.3f + search %.3f ms)' % (
    time.time() - start, stats.quantization_time, stats.search_time))
indexes = {}
if args.twostage:
for i in range(0, nb, subset_len):
index = faiss.read_index(basename)
faiss.ParameterSpace().set_index_parameter(index, 'nprobe', nprobe)
print("add %d:%d" %(i, i+subset_len))
index.add(xb[i:i + subset_len])
indexes[i] = index
rh = ResultHeap(nq, k)
sum_time = tq = ts = 0
for i in range(0, nb, subset_len):
if not args.twostage:
index = faiss.read_index(basename)
faiss.ParameterSpace().set_index_parameter(index, 'nprobe', nprobe)
print("add %d:%d" %(i, i+subset_len))
index.add(xb[i:i + subset_len])
else:
index = indexes[i]
stats.reset()
start = time.time()
Di, Ii = index.search(xq, k)
sum_time = sum_time + time.time() - start
tq += stats.quantization_time
ts += stats.search_time
rh.add_batch_result(Di, Ii, i)
print('time of searching separately: %.3f s (quantization %.3f + search %.3f ms)' %
      (sum_time, tq, ts))
rh.finalize()
print('diffs: %d / %d' % ((Iref != rh.I).sum(), Iref.size))

View File

View File

File diff suppressed because it is too large

View File

@@ -0,0 +1,272 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import hashlib
import io
import json
import logging
import os
import pickle
from dataclasses import dataclass
from typing import Any, Dict, List, Optional
from zipfile import ZipFile
import faiss # @manual=//faiss/python:pyfaiss
import numpy as np
import submitit
from faiss.contrib.datasets import ( # @manual=//faiss/contrib:faiss_contrib
dataset_from_name,
)
logger = logging.getLogger(__name__)
# merge RCQ coarse quantizer and ITQ encoder to one Faiss index
def merge_rcq_itq(
# pyre-ignore[11]: `faiss.ResidualCoarseQuantizer` is not defined as a type
rcq_coarse_quantizer: faiss.ResidualCoarseQuantizer,
itq_encoder: faiss.IndexPreTransform,
# pyre-ignore[11]: `faiss.IndexIVFSpectralHash` is not defined as a type.
) -> faiss.IndexIVFSpectralHash:
# pyre-ignore[16]: `faiss` has no attribute `IndexIVFSpectralHash`.
index = faiss.IndexIVFSpectralHash(
rcq_coarse_quantizer,
rcq_coarse_quantizer.d,
rcq_coarse_quantizer.ntotal,
itq_encoder.sa_code_size() * 8,
1000000, # larger than the magnitude of the vectors
)
index.replace_vt(itq_encoder)
return index
@dataclass
class BenchmarkIO:
path: str # local path
def __init__(self, path: str):
self.path = path
self.cached_ds: Dict[Any, Any] = {}
def clone(self):
return BenchmarkIO(path=self.path)
def get_local_filepath(self, filename):
if len(filename) > 184:
fn, ext = os.path.splitext(filename)
filename = (
fn[:184] + hashlib.sha256(filename.encode()).hexdigest() + ext
)
return os.path.join(self.path, filename)
def get_remote_filepath(self, filename) -> Optional[str]:
return None
def download_file_from_blobstore(
self,
filename: str,
bucket: Optional[str] = None,
path: Optional[str] = None,
):
return self.get_local_filepath(filename)
def upload_file_to_blobstore(
self,
filename: str,
bucket: Optional[str] = None,
path: Optional[str] = None,
overwrite: bool = False,
):
pass
def file_exist(self, filename: str):
fn = self.get_local_filepath(filename)
exists = os.path.exists(fn)
logger.info(f"{filename} {exists=}")
return exists
def read_file(self, filename: str, keys: List[str]):
fn = self.download_file_from_blobstore(filename)
logger.info(f"Loading file {fn}")
results = []
with ZipFile(fn, "r") as zip_file:
for key in keys:
with zip_file.open(key, "r") as f:
if key in ["D", "I", "R", "lims"]:
results.append(np.load(f))
elif key in ["P"]:
t = io.TextIOWrapper(f)
results.append(json.load(t))
else:
raise AssertionError()
return results
def write_file(
self,
filename: str,
keys: List[str],
values: List[Any],
overwrite: bool = False,
):
fn = self.get_local_filepath(filename)
with ZipFile(fn, "w") as zip_file:
for key, value in zip(keys, values, strict=True):
with zip_file.open(key, "w", force_zip64=True) as f:
if key in ["D", "I", "R", "lims"]:
np.save(f, value)
elif key in ["P"]:
t = io.TextIOWrapper(f, write_through=True)
json.dump(value, t)
else:
raise AssertionError()
self.upload_file_to_blobstore(filename, overwrite=overwrite)
def get_dataset(self, dataset):
if dataset not in self.cached_ds:
if (
dataset.namespace is not None
and dataset.namespace[:4] == "std_"
):
if dataset.tablename not in self.cached_ds:
self.cached_ds[dataset.tablename] = dataset_from_name(
dataset.tablename,
)
p = dataset.namespace[4]
if p == "t":
self.cached_ds[dataset] = self.cached_ds[
dataset.tablename
].get_train(dataset.num_vectors)
elif p == "d":
self.cached_ds[dataset] = self.cached_ds[
dataset.tablename
].get_database()
elif p == "q":
self.cached_ds[dataset] = self.cached_ds[
dataset.tablename
].get_queries()
else:
raise ValueError
elif dataset.namespace == "syn":
d, seed = dataset.tablename.split("_")
d = int(d)
seed = int(seed)
n = dataset.num_vectors
# based on faiss.contrib.datasets.SyntheticDataset
d1 = 10
rs = np.random.RandomState(seed)
x = rs.normal(size=(n, d1))
x = np.dot(x, rs.rand(d1, d))
x = x * (rs.rand(d) * 4 + 0.1)
x = np.sin(x)
x = x.astype(np.float32)
self.cached_ds[dataset] = x
else:
self.cached_ds[dataset] = self.read_nparray(
os.path.join(self.path, dataset.tablename),
mmap_mode="r",
)[: dataset.num_vectors].copy()
return self.cached_ds[dataset]
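    # e.g. (illustrative): DatasetDescriptor(namespace="std_q", tablename="bigann1M")
    # resolves to dataset_from_name("bigann1M").get_queries(), while
    # namespace="syn" with tablename="128_1234" generates synthetic
    # 128-dimensional vectors from seed 1234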
def read_nparray(
self,
filename: str,
mmap_mode: Optional[str] = None,
):
fn = self.download_file_from_blobstore(filename)
logger.info(f"Loading nparray from {fn}")
nparray = np.load(fn, mmap_mode=mmap_mode)
logger.info(f"Loaded nparray {nparray.shape} from {fn}")
return nparray
def write_nparray(
self,
nparray: np.ndarray,
filename: str,
):
fn = self.get_local_filepath(filename)
logger.info(f"Saving nparray {nparray.shape} to {fn}")
np.save(fn, nparray)
self.upload_file_to_blobstore(filename)
def read_json(
self,
filename: str,
):
fn = self.download_file_from_blobstore(filename)
logger.info(f"Loading json {fn}")
with open(fn, "r") as fp:
json_dict = json.load(fp)
logger.info(f"Loaded json {json_dict} from {fn}")
return json_dict
def write_json(
self,
json_dict: dict[str, Any],
filename: str,
overwrite: bool = False,
):
fn = self.get_local_filepath(filename)
logger.info(f"Saving json {json_dict} to {fn}")
with open(fn, "w") as fp:
json.dump(json_dict, fp)
self.upload_file_to_blobstore(filename, overwrite=overwrite)
def read_index(
self,
filename: str,
bucket: Optional[str] = None,
path: Optional[str] = None,
):
fn = self.download_file_from_blobstore(filename, bucket, path)
logger.info(f"Loading index {fn}")
ext = os.path.splitext(fn)[1]
if ext in [".faiss", ".codec", ".index"]:
index = faiss.read_index(fn)
elif ext == ".pkl":
with open(fn, "rb") as model_file:
model = pickle.load(model_file)
rcq_coarse_quantizer, itq_encoder = model["model"]
index = merge_rcq_itq(rcq_coarse_quantizer, itq_encoder)
logger.info(f"Loaded index from {fn}")
return index
def write_index(
self,
index: faiss.Index,
filename: str,
):
fn = self.get_local_filepath(filename)
logger.info(f"Saving index to {fn}")
faiss.write_index(index, fn)
self.upload_file_to_blobstore(filename)
assert os.path.exists(fn)
return os.path.getsize(fn)
def launch_jobs(self, func, params, local=True):
if local:
results = [func(p) for p in params]
return results
logger.info(f"launching {len(params)} jobs")
executor = submitit.AutoExecutor(folder="/checkpoint/gsz/jobs")
executor.update_parameters(
nodes=1,
gpus_per_node=8,
cpus_per_task=80,
# mem_gb=640,
tasks_per_node=1,
name="faiss_benchmark",
slurm_array_parallelism=512,
slurm_partition="scavenge",
slurm_time=4 * 60,
slurm_constraint="bldg1",
)
jobs = executor.map_array(func, params)
logger.info(f"launched {len(jobs)} jobs")
for job, param in zip(jobs, params):
logger.info(f"{job.job_id=} {param[0]=}")
results = [job.result() for job in jobs]
print(f"received {len(results)} results")
return results

View File

@@ -0,0 +1,379 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import logging
import os
from dataclasses import dataclass
from typing import Any, Dict, List, Optional
import faiss # @manual=//faiss/python:pyfaiss
from .benchmark_io import BenchmarkIO
from .utils import timer
logger = logging.getLogger(__name__)
# Important: filenames end with "." and carry no extension (npy, codec, index);
# when writing files you must append the extension yourself, e.g. filename + "npy"
@dataclass
class IndexDescriptorClassic:
bucket: Optional[str] = None
# either path or factory should be set,
# but not both at the same time.
path: Optional[str] = None
factory: Optional[str] = None
codec_alias: Optional[str] = None
construction_params: Optional[List[Dict[str, int]]] = None
search_params: Optional[Dict[str, int]] = None
# range metric definitions
# key: name
# value: one of the following:
#
# radius
# [0..radius) -> 1
# [radius..inf) -> 0
#
# [[radius1, score1], ...]
# [0..radius1) -> score1
# [radius1..radius2) -> score2
#
# [[radius1_from, radius1_to, score1], ...]
# [radius1_from, radius1_to) -> score1,
# [radius2_from, radius2_to) -> score2
range_metrics: Optional[Dict[str, Any]] = None
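    # e.g. (hypothetical): range_metrics={"weighted": [[0.5, 1.0], [1.0, 0.5]]}
    # scores distances in [0, 0.5) as 1.0 and distances in [0.5, 1.0) as 0.5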
radius: Optional[float] = None
training_size: Optional[int] = None
def __hash__(self):
return hash(str(self))
@dataclass
class DatasetDescriptor:
# namespace possible values:
# 1. a hive namespace
# 2. 'std_t', 'std_d', 'std_q' for the standard datasets
# via faiss.contrib.datasets.dataset_from_name()
# t - training, d - database, q - queries
# e.g. "std_t"
# 3. 'syn' for synthetic data
# 4. None for local files
namespace: Optional[str] = None
# tablename possible values, corresponding to the
# namespace value above:
# 1. a hive table name
# 2. name of the standard dataset as recognized
# by faiss.contrib.datasets.dataset_from_name()
# e.g. "bigann1M"
# 3. d_seed, e.g. 128_1234 for 128-dimensional vectors
# with seed 1234
# 4. a local file name (relative to benchmark_io.path)
tablename: Optional[str] = None
# partition names and values for hive
# e.g. ["ds=2021-09-01"]
partitions: Optional[List[str]] = None
# number of vectors to load from the dataset
num_vectors: Optional[int] = None
embedding_column: Optional[str] = None
# only when the embedding column is a map
embedding_column_key: Optional[Any] = None
embedding_id_column: Optional[str] = None
# filters on the dataset where each filter is a
# string rep of a filter expression
filters: Optional[List[str]] = None
# unused in open-source
splits_distribution: Optional[List[List[bytes]]] = None
# unused in open-source
splits: Optional[List[bytes]] = None
# unused in open-source
serialized_df: Optional[str] = None
sampling_rate: Optional[float] = None
# sampling column for xdb
sampling_column: Optional[str] = None
# blob store
bucket: Optional[str] = None
path: Optional[str] = None
# desc_name
desc_name: Optional[str] = None
normalize_L2: bool = False
def __hash__(self):
return hash(self.get_filename())
def get_filename(
self,
prefix: Optional[str] = None,
) -> str:
if self.desc_name is not None:
return self.desc_name
filename = ""
if prefix is not None:
filename += prefix + "_"
if self.namespace is not None:
filename += self.namespace + "_"
assert self.tablename is not None
filename += self.tablename
if self.partitions is not None:
filename += "_" + "_".join(
self.partitions
).replace("=", "_").replace("/", "_")
if self.num_vectors is not None:
filename += f"_{self.num_vectors}"
filename += "."
self.desc_name = filename
return self.desc_name
def get_kmeans_filename(self, k):
return f"{self.get_filename()}kmeans_{k}."
def k_means(self, io, k, dry_run):
logger.info(f"k_means {k} {self}")
kmeans_vectors = DatasetDescriptor(
tablename=f"{self.get_filename()}kmeans_{k}"
)
kmeans_filename = kmeans_vectors.get_filename() + "npy"
meta_filename = kmeans_vectors.get_filename() + "json"
if not io.file_exist(kmeans_filename) or not io.file_exist(
meta_filename
):
if dry_run:
return None, None, kmeans_filename
x = io.get_dataset(self)
kmeans = faiss.Kmeans(d=x.shape[1], k=k, gpu=True)
_, t, _ = timer("k_means", lambda: kmeans.train(x))
io.write_nparray(kmeans.centroids, kmeans_filename)
io.write_json({"k_means_time": t}, meta_filename)
else:
t = io.read_json(meta_filename)["k_means_time"]
return kmeans_vectors, t, None
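    # Illustrative call (hypothetical descriptor): for a training-set descriptor,
    # k_means(io, 1024, dry_run=False) trains 1024 centroids on GPU, writes
    # "<filename>kmeans_1024.npy" plus a json with the training time, and
    # returns (centroids_descriptor, k_means_time, None)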
@dataclass
class IndexBaseDescriptor:
d: int
metric: str
desc_name: Optional[str] = None
flat_desc_name: Optional[str] = None
bucket: Optional[str] = None
path: Optional[str] = None
num_threads: int = 1
def get_name(self) -> str:
raise NotImplementedError()
def get_path(self, benchmark_io: BenchmarkIO) -> Optional[str]:
if self.path is not None:
return self.path
self.path = benchmark_io.get_remote_filepath(self.desc_name)
return self.path
@staticmethod
def param_dict_list_to_name(param_dict_list):
if not param_dict_list:
return ""
l = 0
n = ""
for param_dict in param_dict_list:
n += IndexBaseDescriptor.param_dict_to_name(param_dict, f"cp{l}")
l += 1
return n
@staticmethod
def param_dict_to_name(param_dict, prefix="sp"):
if not param_dict:
return ""
n = prefix
for name, val in param_dict.items():
if name == "snap":
continue
if name == "lsq_gpu" and val == 0:
continue
if name == "use_beam_LUT" and val == 0:
continue
n += f"_{name}_{val}"
if n == prefix:
return ""
n += "."
return n
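    # e.g. param_dict_to_name({"nprobe": 64}) -> "sp_nprobe_64." and
    # param_dict_list_to_name([{"efConstruction": 128}]) -> "cp0_efConstruction_128."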
@dataclass
class CodecDescriptor(IndexBaseDescriptor):
# either path or factory should be set,
# but not both at the same time.
factory: Optional[str] = None
construction_params: Optional[List[Dict[str, int]]] = None
training_vectors: Optional[DatasetDescriptor] = None
FILENAME_PREFIX: str = "xt"
def __post_init__(self):
self.get_name()
def is_trained(self):
return self.factory is None and self.path is not None
def is_valid(self):
return self.factory is not None or self.path is not None
def get_name(self) -> str:
if self.desc_name is not None:
return self.desc_name
if self.factory is not None:
self.desc_name = self.name_from_factory()
return self.desc_name
if self.path is not None:
self.desc_name = self.name_from_path()
return self.desc_name
raise ValueError("name, factory or path must be set")
def flat_name(self) -> str:
if self.flat_desc_name is not None:
return self.flat_desc_name
self.flat_desc_name = f"Flat.d_{self.d}.{self.metric.upper()}."
return self.flat_desc_name
    def get_codec_path(self, benchmark_io) -> str:
        # renamed from `path`, which shadowed the inherited `path` dataclass field
        if self.path is not None:
            return self.path
        return benchmark_io.get_remote_filepath(self.get_name())
def name_from_factory(self) -> str:
assert self.factory is not None
name = f"{self.factory.replace(',', '_')}."
assert self.d is not None
assert self.metric is not None
name += f"d_{self.d}.{self.metric.upper()}."
if self.factory != "Flat":
assert self.training_vectors is not None
name += self.training_vectors.get_filename(CodecDescriptor.FILENAME_PREFIX)
name += IndexBaseDescriptor.param_dict_list_to_name(self.construction_params)
return name
def name_from_path(self):
assert self.path is not None
filename = os.path.basename(self.path)
ext = filename.split(".")[-1]
if filename.endswith(ext):
name = filename[:-len(ext)]
        else:  # unreachable by construction; raising a ValueError would be more appropriate
name = filename
return name
def alias(self, benchmark_io: BenchmarkIO):
if hasattr(benchmark_io, "bucket"):
return CodecDescriptor(desc_name=self.get_name(), bucket=benchmark_io.bucket, path=self.get_path(benchmark_io), d=self.d, metric=self.metric)
return CodecDescriptor(desc_name=self.get_name(), d=self.d, metric=self.metric)
@dataclass
class IndexDescriptor(IndexBaseDescriptor):
codec_desc: Optional[CodecDescriptor] = None
database_desc: Optional[DatasetDescriptor] = None
FILENAME_PREFIX: str = "xb"
def __hash__(self):
return hash(str(self))
def __post_init__(self):
self.get_name()
def is_built(self):
return self.codec_desc is None and self.database_desc is None
def get_name(self) -> str:
if self.desc_name is None:
self.desc_name = self.codec_desc.get_name() + self.database_desc.get_filename(prefix=IndexDescriptor.FILENAME_PREFIX)
return self.desc_name
def flat_name(self):
if self.flat_desc_name is not None:
return self.flat_desc_name
self.flat_desc_name = self.codec_desc.flat_name() + self.database_desc.get_filename(prefix=IndexDescriptor.FILENAME_PREFIX)
return self.flat_desc_name
    # alias is used to refer to the index after it has been uploaded to the blob store and is referenced again
def alias(self, benchmark_io: BenchmarkIO):
if hasattr(benchmark_io, "bucket"):
return IndexDescriptor(desc_name=self.get_name(), bucket=benchmark_io.bucket, path=self.get_path(benchmark_io), d=self.d, metric=self.metric)
return IndexDescriptor(desc_name=self.get_name(), d=self.d, metric=self.metric)
@dataclass
class KnnDescriptor(IndexBaseDescriptor):
index_desc: Optional[IndexDescriptor] = None
gt_index_desc: Optional[IndexDescriptor] = None
query_dataset: Optional[DatasetDescriptor] = None
search_params: Optional[Dict[str, int]] = None
reconstruct: bool = False
FILENAME_PREFIX: str = "q"
# range metric definitions
# key: name
# value: one of the following:
#
# radius
# [0..radius) -> 1
# [radius..inf) -> 0
#
# [[radius1, score1], ...]
# [0..radius1) -> score1
# [radius1..radius2) -> score2
#
# [[radius1_from, radius1_to, score1], ...]
# [radius1_from, radius1_to) -> score1,
# [radius2_from, radius2_to) -> score2
range_metrics: Optional[Dict[str, Any]] = None
radius: Optional[float] = None
k: int = 1
range_ref_index_desc: Optional[str] = None
def __hash__(self):
return hash(str(self))
def get_name(self):
if self.desc_name is not None:
return self.desc_name
name = self.index_desc.get_name()
name += IndexBaseDescriptor.param_dict_to_name(self.search_params)
name += self.query_dataset.get_filename(KnnDescriptor.FILENAME_PREFIX)
name += f"k_{self.k}."
name += f"t_{self.num_threads}."
if self.reconstruct:
name += "rec."
else:
name += "knn."
self.desc_name = name
return name
def flat_name(self):
if self.flat_desc_name is not None:
return self.flat_desc_name
name = self.index_desc.flat_name()
name += self.query_dataset.get_filename(KnnDescriptor.FILENAME_PREFIX)
name += f"k_{self.k}."
name += f"t_{self.num_threads}."
if self.reconstruct:
name += "rec."
else:
name += "knn."
self.flat_desc_name = name
return name

View File

File diff suppressed because it is too large

View File

@@ -0,0 +1,335 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import logging
from dataclasses import dataclass
from typing import Dict, List, Tuple
import faiss # @manual=//faiss/python:pyfaiss
# from faiss.contrib.evaluation import ( # @manual=//faiss/contrib:faiss_contrib
# OperatingPoints,
# )
from .benchmark import Benchmark
from .descriptors import DatasetDescriptor, IndexDescriptorClassic
from .utils import dict_merge, filter_results, ParetoMetric, ParetoMode
logger = logging.getLogger(__name__)
@dataclass
class Optimizer:
distance_metric: str = "L2"
num_threads: int = 32
run_local: bool = True
def __post_init__(self):
self.cached_benchmark = None
if self.distance_metric == "IP":
self.distance_metric_type = faiss.METRIC_INNER_PRODUCT
elif self.distance_metric == "L2":
self.distance_metric_type = faiss.METRIC_L2
else:
raise ValueError
def set_io(self, benchmark_io):
self.io = benchmark_io
self.io.distance_metric = self.distance_metric
self.io.distance_metric_type = self.distance_metric_type
def benchmark_and_filter_candidates(
self,
index_descs,
training_vectors,
database_vectors,
query_vectors,
result_file,
include_flat,
min_accuracy,
pareto_metric,
):
benchmark = Benchmark(
num_threads=self.num_threads,
training_vectors=training_vectors,
database_vectors=database_vectors,
query_vectors=query_vectors,
index_descs=index_descs,
k=10,
distance_metric=self.distance_metric,
)
benchmark.set_io(self.io)
results = benchmark.benchmark(
result_file=result_file, local=self.run_local, train=True, knn=True
)
assert results
filtered = filter_results(
results=results,
evaluation="knn",
accuracy_metric="knn_intersection",
min_accuracy=min_accuracy,
name_filter=None
if include_flat
else (lambda n: not n.startswith("Flat")),
pareto_mode=ParetoMode.GLOBAL,
pareto_metric=pareto_metric,
)
assert filtered
index_descs = [
IndexDescriptorClassic(
factory=v["factory"],
construction_params=v["construction_params"],
search_params=v["search_params"],
)
for _, _, _, _, v in filtered
]
return index_descs, filtered
def optimize_quantizer(
self,
training_vectors: DatasetDescriptor,
query_vectors: DatasetDescriptor,
nlists: List[int],
min_accuracy: float,
):
quantizer_descs = {}
for nlist in nlists:
# cluster
centroids, _, _ = training_vectors.k_means(
self.io,
nlist,
dry_run=False,
)
descs = [IndexDescriptorClassic(factory="Flat"),] + [
IndexDescriptorClassic(
factory="HNSW32",
construction_params=[{"efConstruction": 2**i}],
)
for i in range(6, 11)
]
descs, _ = self.benchmark_and_filter_candidates(
descs,
training_vectors=centroids,
database_vectors=centroids,
query_vectors=query_vectors,
result_file=f"result_{centroids.get_filename()}json",
include_flat=True,
min_accuracy=min_accuracy,
pareto_metric=ParetoMetric.TIME,
)
quantizer_descs[nlist] = descs
return quantizer_descs
def optimize_ivf(
self,
result_file: str,
training_vectors: DatasetDescriptor,
database_vectors: DatasetDescriptor,
query_vectors: DatasetDescriptor,
quantizers: Dict[int, List[IndexDescriptorClassic]],
codecs: List[Tuple[str, str]],
min_accuracy: float,
):
ivf_descs = []
for nlist, quantizer_descs in quantizers.items():
# build IVF index
for quantizer_desc in quantizer_descs:
for pretransform, fine_ivf in codecs:
if pretransform is None:
pretransform = ""
else:
pretransform = pretransform + ","
if quantizer_desc.construction_params is None:
construction_params = [
None,
quantizer_desc.search_params,
]
else:
construction_params = [
None
] + quantizer_desc.construction_params
if quantizer_desc.search_params is not None:
dict_merge(
construction_params[1],
quantizer_desc.search_params,
)
ivf_descs.append(
IndexDescriptorClassic(
factory=f"{pretransform}IVF{nlist}({quantizer_desc.factory}),{fine_ivf}",
construction_params=construction_params,
)
)
return self.benchmark_and_filter_candidates(
ivf_descs,
training_vectors,
database_vectors,
query_vectors,
result_file,
include_flat=False,
min_accuracy=min_accuracy,
pareto_metric=ParetoMetric.TIME_SPACE,
)
# train an IVFFlat index
# find the nprobe required for the given accuracy
def ivf_flat_nprobe_required_for_accuracy(
self,
result_file: str,
training_vectors: DatasetDescriptor,
database_vectors: DatasetDescriptor,
query_vectors: DatasetDescriptor,
nlist,
accuracy,
):
_, results = self.benchmark_and_filter_candidates(
index_descs=[
IndexDescriptorClassic(factory=f"IVF{nlist}(Flat),Flat"),
],
training_vectors=training_vectors,
database_vectors=database_vectors,
query_vectors=query_vectors,
result_file=result_file,
include_flat=False,
min_accuracy=accuracy,
pareto_metric=ParetoMetric.TIME,
)
nprobe = nlist // 2
for _, _, _, k, v in results:
if (
".knn" in k
and "nprobe" in v["search_params"]
and v["knn_intersection"] >= accuracy
):
nprobe = min(nprobe, v["search_params"]["nprobe"])
return nprobe
# train candidate IVF codecs
# benchmark them at the same nprobe
# keep only the space _and_ time Pareto optimal
def optimize_codec(
self,
result_file: str,
d: int,
training_vectors: DatasetDescriptor,
database_vectors: DatasetDescriptor,
query_vectors: DatasetDescriptor,
nlist: int,
nprobe: int,
min_accuracy: float,
):
codecs = (
[
(None, "Flat"),
(None, "SQfp16"),
(None, "SQbf16"),
(None, "SQ8"),
(None, "SQ8_direct_signed"),
] + [
(f"OPQ{M}_{M * dim}", f"PQ{M}x{b}")
for M in [8, 12, 16, 32, 48, 64, 96, 128, 192, 256]
if d % M == 0
for dim in range(2, 18, 2)
if M * dim <= d
for b in range(4, 14, 2)
if M * b < d * 8 # smaller than SQ8
] + [
(None, f"PQ{M}x{b}")
for M in [8, 12, 16, 32, 48, 64, 96, 128, 192, 256]
if d % M == 0
for b in range(8, 14, 2)
if M * b < d * 8 # smaller than SQ8
]
)
factory = {}
for opq, pq in codecs:
factory[
f"IVF{nlist},{pq}" if opq is None else f"{opq},IVF{nlist},{pq}"
] = (
opq,
pq,
)
_, filtered = self.benchmark_and_filter_candidates(
index_descs=[
IndexDescriptorClassic(
factory=f"IVF{nlist},{pq}"
if opq is None
else f"{opq},IVF{nlist},{pq}",
search_params={
"nprobe": nprobe,
},
)
for opq, pq in codecs
],
training_vectors=training_vectors,
database_vectors=database_vectors,
query_vectors=query_vectors,
result_file=result_file,
include_flat=False,
min_accuracy=min_accuracy,
pareto_metric=ParetoMetric.TIME_SPACE,
)
results = [
factory[r] for r in set(v["factory"] for _, _, _, k, v in filtered)
]
return results
def optimize(
self,
d: int,
training_vectors: DatasetDescriptor,
database_vectors_list: List[DatasetDescriptor],
query_vectors: DatasetDescriptor,
min_accuracy: float,
):
# train an IVFFlat index
# find the nprobe required for near perfect accuracy
nlist = 4096
nprobe_at_95 = self.ivf_flat_nprobe_required_for_accuracy(
result_file=f"result_ivf{nlist}_flat.json",
training_vectors=training_vectors,
database_vectors=database_vectors_list[0],
query_vectors=query_vectors,
nlist=nlist,
accuracy=0.95,
)
# train candidate IVF codecs
# benchmark them at the same nprobe
# keep only the space and time Pareto optima
codecs = self.optimize_codec(
result_file=f"result_ivf{nlist}_codec.json",
d=d,
training_vectors=training_vectors,
database_vectors=database_vectors_list[0],
query_vectors=query_vectors,
nlist=nlist,
nprobe=nprobe_at_95,
min_accuracy=min_accuracy,
)
# optimize coarse quantizers
quantizers = self.optimize_quantizer(
training_vectors=training_vectors,
query_vectors=query_vectors,
nlists=[4096, 8192, 16384, 32768],
min_accuracy=0.7,
)
# combine them with the codecs
# test them at different scales
for database_vectors in database_vectors_list:
self.optimize_ivf(
result_file=f"result_{database_vectors.get_filename()}json",
training_vectors=training_vectors,
database_vectors=database_vectors,
query_vectors=query_vectors,
quantizers=quantizers,
codecs=codecs,
min_accuracy=min_accuracy,
)
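# Usage sketch (hypothetical descriptors and path; run_local=True assumed):
#   opt = Optimizer(distance_metric="L2", num_threads=32, run_local=True)
#   opt.set_io(BenchmarkIO(path="/tmp/bench"))
#   opt.optimize(d=96, training_vectors=xt_desc,
#                database_vectors_list=[xb_desc], query_vectors=xq_desc,
#                min_accuracy=0.9)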

View File

@@ -0,0 +1,248 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import functools
import logging
from enum import Enum
from multiprocessing.pool import ThreadPool
from time import perf_counter
import faiss # @manual=//faiss/python:pyfaiss
import numpy as np
from faiss.contrib.evaluation import ( # @manual=//faiss/contrib:faiss_contrib
OperatingPoints,
)
logger = logging.getLogger(__name__)
def timer(name, func, once=False) -> tuple:  # returns (result, time_per_run, repeat)
logger.info(f"Measuring {name}")
t1 = perf_counter()
res = func()
t2 = perf_counter()
t = t2 - t1
repeat = 1
if not once and t < 1.0:
repeat = int(2.0 // t)
logger.info(
f"Time for {name}: {t:.3f} seconds, repeating {repeat} times"
)
t1 = perf_counter()
for _ in range(repeat):
res = func()
t2 = perf_counter()
t = (t2 - t1) / repeat
logger.info(f"Time for {name}: {t:.3f} seconds")
return res, t, repeat
def refine_distances_knn(
xq: np.ndarray,
xb: np.ndarray,
I: np.ndarray,
metric,
):
"""Recompute distances between xq[i] and xb[I[i, :]]"""
nq, k = I.shape
xq = np.ascontiguousarray(xq, dtype="float32")
nq2, d = xq.shape
xb = np.ascontiguousarray(xb, dtype="float32")
nb, d2 = xb.shape
I = np.ascontiguousarray(I, dtype="int64")
assert nq2 == nq
assert d2 == d
D = np.empty(I.shape, dtype="float32")
D[:] = np.inf
if metric == faiss.METRIC_L2:
faiss.fvec_L2sqr_by_idx(
faiss.swig_ptr(D),
faiss.swig_ptr(xq),
faiss.swig_ptr(xb),
faiss.swig_ptr(I),
d,
nq,
k,
)
else:
faiss.fvec_inner_products_by_idx(
faiss.swig_ptr(D),
faiss.swig_ptr(xq),
faiss.swig_ptr(xb),
faiss.swig_ptr(I),
d,
nq,
k,
)
return D
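# e.g. (illustrative): after an approximate search that returned ids I,
#   D_exact = refine_distances_knn(xq, xb, I, faiss.METRIC_L2)
# recomputes exact distances for the retrieved ids only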
def refine_distances_range(
lims: np.ndarray,
D: np.ndarray,
I: np.ndarray,
xq: np.ndarray,
xb: np.ndarray,
metric,
):
with ThreadPool(32) as pool:
R = pool.map(
lambda i: (
np.sum(np.square(xq[i] - xb[I[lims[i] : lims[i + 1]]]), axis=1)
if metric == faiss.METRIC_L2
else np.tensordot(
xq[i], xb[I[lims[i] : lims[i + 1]]], axes=(0, 1)
)
)
if lims[i + 1] > lims[i]
else [],
range(len(lims) - 1),
)
return np.hstack(R)
def distance_ratio_measure(I, R, D_GT, metric):
sum_of_R = np.sum(np.where(I >= 0, R, 0))
sum_of_D_GT = np.sum(np.where(I >= 0, D_GT, 0))
if metric == faiss.METRIC_INNER_PRODUCT:
return (sum_of_R / sum_of_D_GT).item()
elif metric == faiss.METRIC_L2:
return (sum_of_D_GT / sum_of_R).item()
else:
raise RuntimeError(f"unknown metric {metric}")
@functools.cache
def get_cpu_info():
return [l for l in open("/proc/cpuinfo", "r") if "model name" in l][0][
13:
].strip()
def dict_merge(target, source):
for k, v in source.items():
if isinstance(v, dict) and k in target:
dict_merge(target[k], v)
else:
target[k] = v
class Cost:
def __init__(self, values):
self.values = values
def __le__(self, other):
return all(
v1 <= v2 for v1, v2 in zip(self.values, other.values, strict=True)
)
def __lt__(self, other):
return all(
v1 < v2 for v1, v2 in zip(self.values, other.values, strict=True)
)
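# Cost defines a componentwise partial order, which is what OperatingPoints
# needs for multi-objective Pareto filtering with Cost([time, space]); e.g.
#   Cost([1.0, 2.0]) < Cost([2.0, 3.0])  # True: better on both axes
#   Cost([1.0, 3.0]) < Cost([2.0, 2.0])  # False: not comparable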
class ParetoMode(Enum):
DISABLE = 1 # no Pareto filtering
INDEX = 2 # index-local optima
GLOBAL = 3 # global optima
class ParetoMetric(Enum):
TIME = 0 # time vs accuracy
SPACE = 1 # space vs accuracy
TIME_SPACE = 2 # (time, space) vs accuracy
def range_search_recall_at_precision(experiment, precision):
return round(
max(
r
for r, p in zip(
experiment["range_search_pr"]["recall"],
experiment["range_search_pr"]["precision"],
)
if p > precision
),
6,
)
def filter_results(
results,
evaluation,
accuracy_metric, # str or func
time_metric=None, # func or None -> use default
space_metric=None, # func or None -> use default
min_accuracy=0,
max_space=0,
max_time=0,
scaling_factor=1.0,
name_filter=None, # func
pareto_mode=ParetoMode.DISABLE,
pareto_metric=ParetoMetric.TIME,
):
if isinstance(accuracy_metric, str):
accuracy_key = accuracy_metric
accuracy_metric = lambda v: v[accuracy_key]
if time_metric is None:
time_metric = lambda v: v["time"] * scaling_factor + (
v["quantizer"]["time"] if "quantizer" in v else 0
)
if space_metric is None:
space_metric = lambda v: results["indices"][v["codec"]]["code_size"]
fe = []
ops = {}
if pareto_mode == ParetoMode.GLOBAL:
op = OperatingPoints()
ops["global"] = op
for k, v in results["experiments"].items():
if f".{evaluation}" in k:
accuracy = accuracy_metric(v)
if min_accuracy > 0 and accuracy < min_accuracy:
continue
space = space_metric(v)
if space is None:
space = 0
if max_space > 0 and space > max_space:
continue
time = time_metric(v)
if max_time > 0 and time > max_time:
continue
idx_name = v["index"] + (
"snap"
if "search_params" in v and v["search_params"]["snap"] == 1
else ""
)
if name_filter is not None and not name_filter(idx_name):
continue
experiment = (accuracy, space, time, k, v)
if pareto_mode == ParetoMode.DISABLE:
fe.append(experiment)
continue
if pareto_mode == ParetoMode.INDEX:
if idx_name not in ops:
ops[idx_name] = OperatingPoints()
op = ops[idx_name]
if pareto_metric == ParetoMetric.TIME:
op.add_operating_point(experiment, accuracy, time)
elif pareto_metric == ParetoMetric.SPACE:
op.add_operating_point(experiment, accuracy, space)
else:
op.add_operating_point(
experiment, accuracy, Cost([time, space])
)
if ops:
for op in ops.values():
for v, _, _ in op.operating_points:
fe.append(v)
fe.sort()
return fe

View File

@@ -0,0 +1,146 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import logging
import argparse
import os
from faiss.benchs.bench_fw.benchmark import Benchmark
from faiss.benchs.bench_fw.benchmark_io import BenchmarkIO
from faiss.benchs.bench_fw.descriptors import DatasetDescriptor, IndexDescriptorClassic
from faiss.benchs.bench_fw.index import IndexFromFactory
logging.basicConfig(level=logging.INFO)
def factory_factory(d):
return [
("SQ4", None, 256 * (2 ** 10), None),
("SQ8", None, 256 * (2 ** 10), None),
("SQfp16", None, 256 * (2 ** 10), None),
("ITQ64,LSH", None, 256 * (2 ** 10), None),
("Pad128,ITQ128,LSH", None, 256 * (2 ** 10), None),
("Pad256,ITQ256,LSH", None, 256 * (2 ** 10), None),
] + [
(f"OPQ32_128,Residual2x14,PQ32x{b}", None, 256 * (2 ** 14), None)
for b in range(8, 16, 2)
] + [
(f"PCAR{2 ** d_out},SQ{b}", None, 256 * (2 ** 10), None)
for d_out in range(6, 11)
if 2 ** d_out <= d
for b in [4, 8]
] + [
(f"OPQ{M}_{M * dim},PQ{M}x{b}", None, 256 * (2 ** b), None)
for M in [8, 12, 16, 32, 64, 128]
for dim in [2, 4, 6, 8, 12, 16]
if M * dim <= d
for b in range(8, 16, 2)
] + [
(f"RQ{cs // b}x{b}", [{"max_beam_size": 32}], 256 * (2 ** b), {"max_beam_size": bs, "use_beam_LUT": bl})
for cs in [64, 128, 256, 512]
for b in [6, 8, 10, 12]
for bs in [1, 2, 4, 8, 16, 32]
for bl in [0, 1]
if cs // b > 1
if cs // b < 65
if cs < d * 8 * 2
] + [
(f"LSQ{cs // b}x{b}", [{"encode_ils_iters": 16}], 256 * (2 ** b), {"encode_ils_iters": eii, "lsq_gpu": lg})
for cs in [64, 128, 256, 512]
for b in [6, 8, 10, 12]
for eii in [2, 4, 8, 16]
for lg in [0, 1]
if cs // b > 1
if cs // b < 65
if cs < d * 8 * 2
] + [
(f"PRQ{sub}x{cs // sub // b}x{b}", [{"max_beam_size": 32}], 256 * (2 ** b), {"max_beam_size": bs, "use_beam_LUT": bl})
for sub in [2, 3, 4, 8, 16, 32]
for cs in [64, 96, 128, 192, 256, 384, 512, 768, 1024, 2048]
for b in [6, 8, 10, 12]
for bs in [1, 2, 4, 8, 16, 32]
for bl in [0, 1]
if cs // sub // b > 1
if cs // sub // b < 65
if cs < d * 8 * 2
if d % sub == 0
] + [
(f"PLSQ{sub}x{cs // sub // b}x{b}", [{"encode_ils_iters": 16}], 256 * (2 ** b), {"encode_ils_iters": eii, "lsq_gpu": lg})
for sub in [2, 3, 4, 8, 16, 32]
for cs in [64, 128, 256, 512, 1024, 2048]
for b in [6, 8, 10, 12]
for eii in [2, 4, 8, 16]
for lg in [0, 1]
if cs // sub // b > 1
if cs // sub // b < 65
if cs < d * 8 * 2
if d % sub == 0
]
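# Each factory_factory entry is a tuple:
#   (factory string, construction_params, training_size, search_params)
# e.g. ("SQ8", None, 256 * (2 ** 10), None) trains an 8-bit scalar quantizer
# on 256K vectors with no extra construction or search parameters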
def run_local(rp):
bio, d, tablename, distance_metric = rp
if tablename == "contriever":
training_vectors=DatasetDescriptor(
tablename="training_set.npy"
)
database_vectors=DatasetDescriptor(
tablename="database1M.npy",
)
query_vectors=DatasetDescriptor(
tablename="queries.npy",
)
else:
training_vectors=DatasetDescriptor(
namespace="std_t", tablename=tablename,
)
database_vectors=DatasetDescriptor(
namespace="std_d", tablename=tablename,
)
query_vectors=DatasetDescriptor(
namespace="std_q", tablename=tablename,
)
benchmark = Benchmark(
num_threads=32,
training_vectors=training_vectors,
database_vectors=database_vectors,
query_vectors=query_vectors,
index_descs=[
IndexDescriptorClassic(
factory=factory,
construction_params=construction_params,
training_size=training_size,
search_params=search_params,
)
for factory, construction_params, training_size, search_params in factory_factory(d)
],
k=1,
distance_metric=distance_metric,
)
benchmark.set_io(bio)
benchmark.benchmark(result_file="result.json", train=True, reconstruct=False, knn=False, range=False)
def run(bio, d, tablename, distance_metric):
bio.launch_jobs(run_local, [(bio, d, tablename, distance_metric)], local=True)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('experiment')
parser.add_argument('path')
args = parser.parse_args()
assert os.path.exists(args.path)
path = os.path.join(args.path, args.experiment)
if not os.path.exists(path):
os.mkdir(path)
bio = BenchmarkIO(
path=path,
)
if args.experiment == "sift1M":
run(bio, 128, "sift1M", "L2")
elif args.experiment == "bigann":
run(bio, 128, "bigann1M", "L2")
elif args.experiment == "deep1b":
run(bio, 96, "deep1M", "L2")
elif args.experiment == "contriever":
run(bio, 768, "contriever", "IP")

View File

@@ -0,0 +1,125 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import argparse
import logging
import os
from faiss.benchs.bench_fw.benchmark import Benchmark
from faiss.benchs.bench_fw.benchmark_io import BenchmarkIO
from faiss.benchs.bench_fw.descriptors import (
DatasetDescriptor,
IndexDescriptorClassic,
)
logging.basicConfig(level=logging.INFO)
def sift1M(bio):
benchmark = Benchmark(
num_threads=32,
training_vectors=DatasetDescriptor(
namespace="std_d", tablename="sift1M"
),
database_vectors=DatasetDescriptor(
namespace="std_d", tablename="sift1M"
),
query_vectors=DatasetDescriptor(
namespace="std_q", tablename="sift1M"
),
index_descs=[
IndexDescriptorClassic(
factory=f"IVF{2 ** nlist},Flat",
)
for nlist in range(8, 15)
],
k=1,
distance_metric="L2",
)
    benchmark.set_io(bio)
benchmark.benchmark(result_file="result.json", local=True, train=True, reconstruct=False, knn=True, range=False)
def bigann(bio):
for scale in [1, 2, 5, 10, 20, 50]:
benchmark = Benchmark(
num_threads=32,
training_vectors=DatasetDescriptor(
namespace="std_t", tablename="bigann1M"
),
database_vectors=DatasetDescriptor(
namespace="std_d", tablename=f"bigann{scale}M"
),
query_vectors=DatasetDescriptor(
namespace="std_q", tablename="bigann1M"
),
index_descs=[
IndexDescriptorClassic(
factory=f"IVF{2 ** nlist},Flat",
) for nlist in range(11, 19)
] + [
IndexDescriptorClassic(
factory=f"IVF{2 ** nlist}_HNSW32,Flat",
construction_params=[None, {"efConstruction": 200, "efSearch": 40}],
) for nlist in range(11, 19)
],
k=1,
distance_metric="L2",
)
benchmark.set_io(bio)
benchmark.benchmark(f"result{scale}.json", local=False, train=True, reconstruct=False, knn=True, range=False)
def ssnpp(bio):
benchmark = Benchmark(
num_threads=32,
training_vectors=DatasetDescriptor(
tablename="ssnpp_training_5M.npy"
),
database_vectors=DatasetDescriptor(
tablename="ssnpp_database_5M.npy"
),
query_vectors=DatasetDescriptor(
tablename="ssnpp_queries_10K.npy"
),
index_descs=[
IndexDescriptorClassic(
factory=f"IVF{2 ** nlist},PQ256x4fs,Refine(SQfp16)",
) for nlist in range(9, 16)
] + [
IndexDescriptorClassic(
factory=f"IVF{2 ** nlist},Flat",
) for nlist in range(9, 16)
] + [
            IndexDescriptorClassic(
                factory="PQ256x4fs,Refine(SQfp16)",
            ),
            IndexDescriptorClassic(
                factory="HNSW32",
            ),
],
k=1,
distance_metric="L2",
)
benchmark.set_io(bio)
benchmark.benchmark("result.json", local=False, train=True, reconstruct=False, knn=True, range=False)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('experiment')
parser.add_argument('path')
args = parser.parse_args()
assert os.path.exists(args.path)
path = os.path.join(args.path, args.experiment)
if not os.path.exists(path):
os.mkdir(path)
bio = BenchmarkIO(
path=path,
)
if args.experiment == "sift1M":
sift1M(bio)
elif args.experiment == "bigann":
bigann(bio)
elif args.experiment == "ssnpp":
ssnpp(bio)

View File

@@ -0,0 +1,532 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"id": "be081589-e1b2-4569-acb7-44203e273899",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"import itertools\n",
"from faiss.contrib.evaluation import OperatingPoints\n",
"from enum import Enum\n",
"from faiss.benchs.bench_fw.benchmark_io import BenchmarkIO as BIO\n",
"from faiss.benchs.bench_fw.utils import filter_results, ParetoMode, ParetoMetric\n",
"from copy import copy\n",
"import numpy as np\n",
"import datetime\n",
"import glob\n",
"import io\n",
"import json\n",
"from zipfile import ZipFile\n",
"import tabulate"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a6492e95-24c7-4425-bf0a-27e10e879ca6",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"import getpass\n",
"username = getpass.getuser()\n",
"root = f\"/home/{username}/simsearch/data/ivf/results/sift1M\"\n",
"results = BIO(root).read_json(\"result.json\")\n",
"results.keys()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0875d269-aef4-426d-83dd-866970f43777",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"results['experiments']"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "f080a6e2-1565-418b-8732-4adeff03a099",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"def plot_metric(experiments, accuracy_title, cost_title, plot_space=False, plot=None):\n",
" if plot is None:\n",
" plot = plt.subplot()\n",
" x = {}\n",
" y = {}\n",
" for accuracy, space, time, k, v in experiments:\n",
" idx_name = v['index'] + (\"snap\" if 'search_params' in v and v['search_params'][\"snap\"] == 1 else \"\")\n",
" if idx_name not in x:\n",
" x[idx_name] = []\n",
" y[idx_name] = []\n",
" x[idx_name].append(accuracy)\n",
" if plot_space:\n",
" y[idx_name].append(space)\n",
" else:\n",
" y[idx_name].append(time)\n",
"\n",
" #plt.figure(figsize=(10,6))\n",
" #plt.title(accuracy_title)\n",
" plot.set_xlabel(accuracy_title)\n",
" plot.set_ylabel(cost_title)\n",
" plot.set_yscale(\"log\")\n",
" marker = itertools.cycle((\"o\", \"v\", \"^\", \"<\", \">\", \"s\", \"p\", \"P\", \"*\", \"h\", \"X\", \"D\")) \n",
" for index in x.keys():\n",
" plot.plot(x[index], y[index], marker=next(marker), label=index, linewidth=0)\n",
" plot.legend(bbox_to_anchor=(1, 1), loc='upper left')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "61007155-5edc-449e-835e-c141a01a2ae5",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# index local optima\n",
"accuracy_metric = \"knn_intersection\"\n",
"fr = filter_results(results, evaluation=\"knn\", accuracy_metric=accuracy_metric, pareto_mode=ParetoMode.INDEX, pareto_metric=ParetoMetric.TIME, scaling_factor=1, min_accuracy=0.95)\n",
"plot_metric(fr, accuracy_title=\"knn intersection\", cost_title=\"time (seconds, 32 cores)\", plot_space=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f9f94dcc-5abe-4cad-9619-f5d1d24fb8c1",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# global optima\n",
"accuracy_metric = \"knn_intersection\"\n",
"fr = filter_results(results, evaluation=\"knn\", accuracy_metric=accuracy_metric, min_accuracy=0.25, name_filter=lambda n: not n.startswith(\"Flat\"), pareto_mode=ParetoMode.GLOBAL, pareto_metric=ParetoMetric.TIME, scaling_factor=1)\n",
"#fr = filter_results(results, evaluation=\"knn\", accuracy_metric=accuracy_metric, min_accuracy=0.90, max_space=64, max_time=0, name_filter=lambda n: not n.startswith(\"Flat\"), pareto_mode=ParetoMode.GLOBAL, pareto_metric=ParetoMetric.TIME, scaling_factor=1)\n",
"plot_metric(fr, accuracy_title=\"knn intersection\", cost_title=\"time (seconds, 32 cores)\", plot_space=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0c10f587-26ef-49ec-83a9-88f6a2a433e8",
"metadata": {},
"outputs": [],
"source": [
"def pretty_params(p):\n",
" p = copy(p)\n",
" if 'snap' in p and p['snap'] == 0:\n",
" del p['snap']\n",
" return p\n",
" \n",
"tabulate.tabulate([(accuracy, space, time, v['factory'], pretty_params(v['construction_params'][1]), pretty_params(v['search_params'])) \n",
" for accuracy, space, time, k, v in fr],\n",
" tablefmt=\"html\",\n",
" headers=[\"accuracy\",\"space\", \"time\", \"factory\", \"quantizer cfg\", \"search cfg\"])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "36e82084-18f6-4546-a717-163eb0224ee8",
"metadata": {},
"outputs": [],
"source": [
"# index local optima @ precision 0.8\n",
"precision = 0.8\n",
"accuracy_metric = lambda exp: range_search_recall_at_precision(exp, precision)\n",
"fr = filter_results(results, evaluation=\"weighted\", accuracy_metric=accuracy_metric, pareto_mode=ParetoMode.INDEX, pareto_metric=ParetoMetric.TIME, scaling_factor=1)\n",
"plot_metric(fr, accuracy_title=f\"range recall @ precision {precision}\", cost_title=\"time (seconds, 16 cores)\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "aff79376-39f7-47c0-8b83-1efe5192bb7e",
"metadata": {},
"outputs": [],
"source": [
"# index local optima @ precision 0.2\n",
"precision = 0.2\n",
"accuracy_metric = lambda exp: range_search_recall_at_precision(exp, precision)\n",
"fr = filter_results(results, evaluation=\"weighted\", accuracy_metric=accuracy_metric, pareto_mode=ParetoMode.INDEX, pareto_metric=ParetoMetric.TIME, scaling_factor=1)\n",
"plot_metric(fr, accuracy_title=f\"range recall @ precision {precision}\", cost_title=\"time (seconds, 16 cores)\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b4834f1f-bbbe-4cae-9aa0-a459b0c842d1",
"metadata": {},
"outputs": [],
"source": [
"# global optima @ precision 0.8\n",
"precision = 0.8\n",
"accuracy_metric = lambda exp: range_search_recall_at_precision(exp, precision)\n",
"fr = filter_results(results, evaluation=\"weighted\", accuracy_metric=accuracy_metric, pareto_mode=ParetoMode.GLOBAL, pareto_metric=ParetoMetric.TIME, scaling_factor=1)\n",
"plot_metric(fr, accuracy_title=f\"range recall @ precision {precision}\", cost_title=\"time (seconds, 16 cores)\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9aead830-6209-4956-b7ea-4a5e0029d616",
"metadata": {},
"outputs": [],
"source": [
"def plot_range_search_pr_curves(experiments):\n",
" x = {}\n",
" y = {}\n",
" show = {\n",
" 'Flat': None,\n",
" }\n",
" for _, _, _, k, v in fr:\n",
" if \".weighted\" in k: # and v['index'] in show:\n",
" x[k] = v['range_search_pr']['recall']\n",
" y[k] = v['range_search_pr']['precision']\n",
" \n",
" plt.title(\"range search recall\")\n",
" plt.xlabel(\"recall\")\n",
" plt.ylabel(\"precision\")\n",
" for index in x.keys():\n",
" plt.plot(x[index], y[index], '.', label=index)\n",
" plt.legend(bbox_to_anchor=(1.0, 1.0), loc='upper left')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "92e45502-7a31-4a15-90df-fa3032d7d350",
"metadata": {},
"outputs": [],
"source": [
"precision = 0.8\n",
"accuracy_metric = lambda exp: range_search_recall_at_precision(exp, precision)\n",
"fr = filter_results(results, evaluation=\"weighted\", accuracy_metric=accuracy_metric, pareto_mode=ParetoMode.GLOBAL, pareto_metric=ParetoMetric.TIME_SPACE, scaling_factor=1)\n",
"plot_range_search_pr_curves(fr)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fdf8148a-0da6-4c5e-8d60-f8f85314574c",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"root = \"/checkpoint/gsz/bench_fw/ivf/bigann\"\n",
"scales = [1, 2, 5, 10, 20, 50]\n",
"fig, plots = plt.subplots(len(scales), sharex=True, figsize=(5,25))\n",
"fig.tight_layout()\n",
"for plot, scale in zip(plots, scales, strict=True):\n",
" results = BIO(root).read_json(f\"result{scale}.json\")\n",
" accuracy_metric = \"knn_intersection\"\n",
" fr = filter_results(results, evaluation=\"knn\", accuracy_metric=accuracy_metric, min_accuracy=0.9, pareto_mode=ParetoMode.GLOBAL, pareto_metric=ParetoMetric.TIME, scaling_factor=1)\n",
" plot_metric(fr, accuracy_title=\"knn intersection\", cost_title=\"time (seconds, 64 cores)\", plot=plot)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e503828c-ee61-45f7-814b-cce6461109bc",
"metadata": {},
"outputs": [],
"source": [
"x = {}\n",
"y = {}\n",
"accuracy=0.9\n",
"root = \"/checkpoint/gsz/bench_fw/ivf/bigann\"\n",
"scales = [1, 2, 5, 10, 20, 50]\n",
"#fig, plots = plt.subplots(len(scales), sharex=True, figsize=(5,25))\n",
"#fig.tight_layout()\n",
"for scale in scales:\n",
" results = BIO(root).read_json(f\"result{scale}.json\")\n",
" scale *= 1_000_000\n",
" accuracy_metric = \"knn_intersection\"\n",
" fr = filter_results(results, evaluation=\"knn\", accuracy_metric=accuracy_metric, min_accuracy=accuracy, pareto_mode=ParetoMode.INDEX, pareto_metric=ParetoMetric.TIME, scaling_factor=1)\n",
" seen = set()\n",
" print(scale)\n",
" for _, _, _, _, exp in fr:\n",
" fact = exp[\"factory\"]\n",
" # \"HNSW\" in fact or \n",
" if fact in seen or fact in [\"Flat\", \"IVF512,Flat\", \"IVF1024,Flat\", \"IVF2048,Flat\"]:\n",
" continue\n",
" seen.add(fact)\n",
" if fact not in x:\n",
" x[fact] = []\n",
" y[fact] = []\n",
" x[fact].append(scale)\n",
" y[fact].append(exp[\"time\"] + exp[\"quantizer\"][\"time\"])\n",
" if (exp[\"knn_intersection\"] > 0.92):\n",
" print(fact)\n",
" print(exp[\"search_params\"])\n",
" print(exp[\"knn_intersection\"])\n",
"\n",
" #plot_metric(fr, accuracy_title=\"knn intersection\", cost_title=\"time (seconds, 64 cores)\", plot=plot)\n",
" \n",
"plt.title(f\"recall @ 1 = {accuracy*100}%\")\n",
"plt.xlabel(\"database size\")\n",
"plt.ylabel(\"time\")\n",
"plt.xscale(\"log\")\n",
"plt.yscale(\"log\")\n",
"\n",
"marker = itertools.cycle((\"o\", \"v\", \"^\", \"<\", \">\", \"s\", \"p\", \"P\", \"*\", \"h\", \"X\", \"D\")) \n",
"for index in x.keys():\n",
" if \"HNSW\" in index:\n",
" plt.plot(x[index], y[index], label=index, linewidth=1, marker=next(marker), linestyle=\"dashed\")\n",
" else:\n",
" plt.plot(x[index], y[index], label=index, linewidth=1, marker=next(marker))\n",
"plt.legend(bbox_to_anchor=(1.0, 1.0), loc='upper left')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "37a99bb2-f998-461b-a345-7cc6e702cb3a",
"metadata": {},
"outputs": [],
"source": [
"# global optima\n",
"accuracy_metric = \"sym_recall\"\n",
"fr = filter_results(results, evaluation=\"rec\", accuracy_metric=accuracy_metric, time_metric=lambda e:e['encode_time'], min_accuracy=0.9, pareto_mode=ParetoMode.GLOBAL, pareto_metric=ParetoMetric.SPACE, scaling_factor=1)\n",
"plot_metric(fr, accuracy_title=\"knn intersection\", cost_title=\"space\", plot_space=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c973ce4e-3566-4f02-bd93-f113e3e0c791",
"metadata": {},
"outputs": [],
"source": [
"def pretty_time(s):\n",
" if s is None:\n",
" return \"None\"\n",
" s = int(s * 1000) / 1000\n",
" m, s = divmod(s, 60)\n",
" h, m = divmod(m, 60)\n",
" d, h = divmod(h, 24)\n",
" r = \"\"\n",
" if d > 0:\n",
" r += f\"{int(d)}d \"\n",
" if h > 0:\n",
" r += f\"{int(h)}h \"\n",
" if m > 0:\n",
" r += f\"{int(m)}m \"\n",
" if s > 0 or len(r) == 0:\n",
" r += f\"{s:.3f}s\"\n",
" return r\n",
"\n",
"def pretty_size(s):\n",
" if s > 1024 * 1024:\n",
" return f\"{s / 1024 / 1024:.1f}\".rstrip('0').rstrip('.') + \"MB\"\n",
" if s > 1024:\n",
" return f\"{s / 1024:.1f}\".rstrip('0').rstrip('.') + \"KB\"\n",
" return f\"{s}\"\n",
"\n",
"def pretty_mse(m):\n",
" if m is None:\n",
" return \"None\"\n",
" else:\n",
" return f\"{m:.6f}\""
]
},
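{
"cell_type": "code",
"execution_count": null,
"id": "7c4d2e0a-pretty-printer-sanity",
"metadata": {},
"outputs": [],
"source": [
"# quick sanity check of the pretty-printers above\n",
"# expected: ('1h 1m 1.500s', '3MB', '0.012346')\n",
"pretty_time(3661.5), pretty_size(3 * 1024 * 1024), pretty_mse(0.0123456)"
]
},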
{
"cell_type": "code",
"execution_count": null,
"id": "1ddcf226-fb97-4a59-9fc3-3ed8f7d5e703",
"metadata": {},
"outputs": [],
"source": [
"data = {}\n",
"root = \"/checkpoint/gsz/bench_fw/bigann\"\n",
"scales = [1, 2, 5, 10, 20, 50]\n",
"for scale in scales:\n",
" results = BIO(root).read_json(f\"result{scale}.json\")\n",
" accuracy_metric = \"knn_intersection\"\n",
" fr = filter_results(results, evaluation=\"knn\", accuracy_metric=accuracy_metric, min_accuracy=0, pareto_mode=ParetoMode.INDEX, pareto_metric=ParetoMetric.TIME, scaling_factor=1)\n",
" d = {}\n",
" data[f\"{scale}M\"] = d\n",
" for _, _, _, _, exp in fr:\n",
" fact = exp[\"factory\"]\n",
" # \"HNSW\" in fact or \n",
" if fact in [\"Flat\", \"IVF512,Flat\", \"IVF1024,Flat\", \"IVF2048,Flat\"]:\n",
" continue\n",
" if fact not in d:\n",
" d[fact] = []\n",
" d[fact].append({\n",
" \"nprobe\": exp[\"search_params\"][\"nprobe\"],\n",
" \"recall\": exp[\"knn_intersection\"],\n",
" \"time\": exp[\"time\"] + exp[\"quantizer\"][\"time\"],\n",
" })\n",
"data\n",
"# with open(\"/checkpoint/gsz/bench_fw/codecs.json\", \"w\") as f:\n",
"# json.dump(data, f)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e54eebb6-0a9f-4a72-84d2-f12c5bd44510",
"metadata": {},
"outputs": [],
"source": [
"ds = \"deep1b\"\n",
"data = []\n",
"jss = []\n",
"root = f\"/checkpoint/gsz/bench_fw/codecs/{ds}\"\n",
"results = BIO(root).read_json(f\"result.json\")\n",
"for k, e in results[\"experiments\"].items():\n",
" if \"rec\" in k and e['factory'] != 'Flat': # and e['sym_recall'] > 0.0: # and \"PRQ\" in e['factory'] and e['sym_recall'] > 0.0:\n",
" code_size = results['indices'][e['codec']]['sa_code_size']\n",
" codec_size = results['indices'][e['codec']]['codec_size']\n",
" training_time = results['indices'][e['codec']]['training_time']\n",
" # training_size = results['indices'][e['codec']]['training_size']\n",
" cpu = e['cpu'] if 'cpu' in e else \"\"\n",
" ps = ', '.join([f\"{k}={v}\" for k,v in e['construction_params'][0].items()]) if e['construction_params'] else \" \"\n",
" eps = ', '.join([f\"{k}={v}\" for k,v in e['reconstruct_params'].items() if k != \"snap\"]) if e['reconstruct_params'] else \" \"\n",
" data.append((code_size, f\"|{e['factory']}|{ps}|{eps}|{code_size}|{pretty_size(codec_size)}|{pretty_time(training_time)}|{training_size}|{pretty_mse(e['mse'])}|{e['sym_recall']}|{e['asym_recall']}|{pretty_time(e['encode_time'])}|{pretty_time(e['decode_time'])}|{cpu}|\"))\n",
" jss.append({\n",
" 'factory': e['factory'],\n",
" 'parameters': e['construction_params'][0] if e['construction_params'] else \"\",\n",
" 'evaluation_params': e['reconstruct_params'],\n",
" 'code_size': code_size,\n",
" 'codec_size': codec_size,\n",
" 'training_time': training_time,\n",
" 'training_size': training_size,\n",
" 'mse': e['mse'],\n",
" 'sym_recall': e['sym_recall'],\n",
" 'asym_recall': e['asym_recall'],\n",
" 'encode_time': e['encode_time'],\n",
" 'decode_time': e['decode_time'],\n",
" 'cpu': cpu,\n",
" })\n",
"\n",
"print(\"|factory key|construction parameters|evaluation parameters|code size|codec size|training time|training size|mean squared error|sym recall @ 1|asym recall @ 1|encode time|decode time|cpu|\")\n",
"print(\"|-|-|-|-|-|-|-|-|-|\")\n",
"data.sort()\n",
"for d in data:\n",
" print(d[1])\n",
"\n",
"with open(f\"/checkpoint/gsz/bench_fw/codecs_{ds}_test.json\", \"w\") as f:\n",
" json.dump(jss, f)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d1216733-9670-407c-b3d2-5f87bce0321c",
"metadata": {},
"outputs": [],
"source": [
"def read_file(filename: str, keys):\n",
" results = []\n",
" with ZipFile(filename, \"r\") as zip_file:\n",
" for key in keys:\n",
" with zip_file.open(key, \"r\") as f:\n",
" if key in [\"D\", \"I\", \"R\", \"lims\"]:\n",
" results.append(np.load(f))\n",
" elif key in [\"P\"]:\n",
" t = io.TextIOWrapper(f)\n",
" results.append(json.load(t))\n",
" else:\n",
" raise AssertionError()\n",
" return results"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "56de051e-22db-4bef-b242-1ddabc9e0bb9",
"metadata": {},
"outputs": [],
"source": [
"ds = \"contriever\"\n",
"data = []\n",
"jss = []\n",
"root = f\"/checkpoint/gsz/bench_fw/codecs/{ds}\"\n",
"for lf in glob.glob(root + '/*rec*.zip'):\n",
" e, = read_file(lf, ['P'])\n",
" if e['factory'] != 'Flat': # and e['sym_recall'] > 0.0: # and \"PRQ\" in e['factory'] and e['sym_recall'] > 0.0:\n",
" code_size = e['codec_meta']['sa_code_size']\n",
" codec_size = e['codec_meta']['codec_size']\n",
" training_time = e['codec_meta']['training_time']\n",
" training_size = None # e['codec_meta']['training_size']\n",
" cpu = e['cpu'] if 'cpu' in e else \"\"\n",
" ps = ', '.join([f\"{k}={v}\" for k,v in e['construction_params'][0].items()]) if e['construction_params'] else \" \"\n",
" eps = ', '.join([f\"{k}={v}\" for k,v in e['reconstruct_params'].items() if k != \"snap\"]) if e['reconstruct_params'] else \" \"\n",
" if eps in ps and eps != \"encode_ils_iters=16\" and eps != \"max_beam_size=32\":\n",
" eps = \" \"\n",
" data.append((code_size, f\"|{e['factory']}|{ps}|{eps}|{code_size}|{pretty_size(codec_size)}|{pretty_time(training_time)}|{pretty_mse(e['mse'])}|{e['sym_recall']}|{e['asym_recall']}|{pretty_time(e['encode_time'])}|{pretty_time(e['decode_time'])}|{cpu}|\"))\n",
" eps = e['reconstruct_params']\n",
" del eps['snap']\n",
" params = copy(e['construction_params'][0]) if e['construction_params'] else {}\n",
" for k, v in e['reconstruct_params'].items():\n",
" params[k] = v\n",
" jss.append({\n",
" 'factory': e['factory'],\n",
" 'params': params,\n",
" 'construction_params': e['construction_params'][0] if e['construction_params'] else {},\n",
" 'evaluation_params': e['reconstruct_params'],\n",
" 'code_size': code_size,\n",
" 'codec_size': codec_size,\n",
" 'training_time': training_time,\n",
" # 'training_size': training_size,\n",
" 'mse': e['mse'],\n",
" 'sym_recall': e['sym_recall'],\n",
" 'asym_recall': e['asym_recall'],\n",
" 'encode_time': e['encode_time'],\n",
" 'decode_time': e['decode_time'],\n",
" 'cpu': cpu,\n",
" })\n",
"\n",
"print(\"|factory key|construction parameters|encode/decode parameters|code size|codec size|training time|mean squared error|sym recall @ 1|asym recall @ 1|encode time|decode time|cpu|\")\n",
"print(\"|-|-|-|-|-|-|-|-|-|\")\n",
"data.sort()\n",
"# for d in data:\n",
"# print(d[1])\n",
"\n",
"print(len(data))\n",
"\n",
"with open(f\"/checkpoint/gsz/bench_fw/codecs_{ds}_5.json\", \"w\") as f:\n",
" json.dump(jss, f)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "faiss_binary (local)",
"language": "python",
"name": "faiss_binary_local"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -0,0 +1,58 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import argparse
import logging
import os
from faiss.benchs.bench_fw.benchmark_io import BenchmarkIO
from faiss.benchs.bench_fw.descriptors import DatasetDescriptor
from faiss.benchs.bench_fw.optimize import Optimizer
logging.basicConfig(level=logging.INFO)
def bigann(bio):
optimizer = Optimizer(
distance_metric="L2",
num_threads=32,
run_local=False,
)
optimizer.set_io(bio)
query_vectors = DatasetDescriptor(namespace="std_q", tablename="bigann1M")
xt = bio.get_dataset(query_vectors)
optimizer.optimize(
d=xt.shape[1],
training_vectors=DatasetDescriptor(
namespace="std_t",
tablename="bigann1M",
num_vectors=2_000_000,
),
database_vectors_list=[
DatasetDescriptor(
namespace="std_d",
tablename="bigann1M",
),
DatasetDescriptor(namespace="std_d", tablename="bigann10M"),
],
query_vectors=query_vectors,
min_accuracy=0.85,
)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("experiment")
parser.add_argument("path")
args = parser.parse_args()
assert os.path.exists(args.path)
path = os.path.join(args.path, args.experiment)
if not os.path.exists(path):
os.mkdir(path)
bio = BenchmarkIO(
path=path,
)
if args.experiment == "bigann":
bigann(bio)
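# Usage sketch (hedged, illustrative paths and script name): results are written
# under <path>/<experiment>, and the std_q/std_t/std_d bigann tables are assumed
# to be resolvable by BenchmarkIO at that path:
#
#   python this_script.py bigann /tmp/bench_fw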

View File

@@ -0,0 +1,85 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import argparse
import logging
import os
from faiss.benchs.bench_fw.benchmark import Benchmark
from faiss.benchs.bench_fw.benchmark_io import BenchmarkIO
from faiss.benchs.bench_fw.descriptors import DatasetDescriptor, IndexDescriptorClassic
logging.basicConfig(level=logging.INFO)
def ssnpp(bio):
benchmark = Benchmark(
num_threads=32,
training_vectors=DatasetDescriptor(
tablename="training.npy",
),
database_vectors=DatasetDescriptor(
tablename="database.npy",
),
query_vectors=DatasetDescriptor(tablename="query.npy"),
index_descs=[
IndexDescriptorClassic(
factory="Flat",
range_metrics={
"weighted": [
[0.05, 0.971],
[0.1, 0.956],
[0.15, 0.923],
[0.2, 0.887],
[0.25, 0.801],
[0.3, 0.729],
[0.35, 0.651],
[0.4, 0.55],
[0.45, 0.459],
[0.5, 0.372],
[0.55, 0.283],
[0.6, 0.189],
[0.65, 0.143],
[0.7, 0.106],
[0.75, 0.116],
[0.8, 0.088],
[0.85, 0.064],
[0.9, 0.05],
[0.95, 0.04],
[1.0, 0.028],
[1.05, 0.02],
[1.1, 0.013],
[1.15, 0.007],
[1.2, 0.004],
[1.3, 0],
]
},
),
IndexDescriptorClassic(
factory="IVF262144(PQ256x4fs),PQ32",
),
],
k=10,
distance_metric="L2",
range_ref_index_desc="Flat",
)
benchmark.set_io(bio)
benchmark.benchmark("result.json", local=False, train=True, reconstruct=False, knn=False, range=True)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('experiment')
parser.add_argument('path')
args = parser.parse_args()
assert os.path.exists(args.path)
path = os.path.join(args.path, args.experiment)
if not os.path.exists(path):
os.mkdir(path)
bio = BenchmarkIO(
path=path,
)
if args.experiment == "ssnpp":
ssnpp(bio)

View File

@@ -0,0 +1,746 @@
#!/usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from __future__ import print_function
import numpy as np
import time
import os
import sys
import faiss
import re
from multiprocessing.pool import ThreadPool
from datasets import ivecs_read
####################################################################
# Parse command line
####################################################################
def usage():
print("""
Usage: bench_gpu_1bn.py dataset indextype [options]
dataset: set of vectors to operate on.
Supported: SIFT1M, SIFT2M, ..., SIFT1000M or Deep1B
indextype: any index type supported by index_factory that runs on GPU.
General options
-ngpu ngpu nb of GPUs to use (default = all)
-tempmem N use N bytes of temporary GPU memory
-nocache do not read or write intermediate files
-float16 use 16-bit floats on the GPU side
Add options
-abs N split adds in blocks of no more than N vectors
-max_add N copy sharded dataset to CPU each max_add additions
(to avoid memory overflows with geometric reallocations)
-altadd Alternative add function, where the index is not stored
on GPU during add. Slightly faster for big datasets on
slow GPUs
Search options
-R R: nb of replicas of the same dataset (the dataset
will be copied across ngpu/R, default R=1)
-noptables do not use precomputed tables in IVFPQ.
-qbs N split queries in blocks of no more than N vectors
-nnn N search N neighbors for each query
-nprobe 4,16,64 try this number of probes
-knngraph instead of the standard setup for the dataset,
compute a k-nn graph with nnn neighbors per element
-oI xx%d.npy output the search result indices to this numpy file,
%d will be replaced with the nprobe
-oD xx%d.npy output the search result distances to this file
""", file=sys.stderr)
sys.exit(1)
# default values
dbname = None
index_key = None
ngpu = faiss.get_num_gpus()
replicas = 1 # nb of replicas of sharded dataset
add_batch_size = 32768
query_batch_size = 16384
nprobes = [1 << l for l in range(9)]
knngraph = False
use_precomputed_tables = True
tempmem = -1 # if -1, use system default
max_add = -1
use_float16 = False
use_cache = True
nnn = 10
altadd = False
I_fname = None
D_fname = None
args = sys.argv[1:]
while args:
a = args.pop(0)
if a == '-h': usage()
elif a == '-ngpu': ngpu = int(args.pop(0))
elif a == '-R': replicas = int(args.pop(0))
elif a == '-noptables': use_precomputed_tables = False
elif a == '-abs': add_batch_size = int(args.pop(0))
elif a == '-qbs': query_batch_size = int(args.pop(0))
elif a == '-nnn': nnn = int(args.pop(0))
elif a == '-tempmem': tempmem = int(args.pop(0))
elif a == '-nocache': use_cache = False
elif a == '-knngraph': knngraph = True
elif a == '-altadd': altadd = True
elif a == '-float16': use_float16 = True
elif a == '-nprobe': nprobes = [int(x) for x in args.pop(0).split(',')]
elif a == '-max_add': max_add = int(args.pop(0))
elif not dbname: dbname = a
elif not index_key: index_key = a
else:
print("argument %s unknown" % a, file=sys.stderr)
sys.exit(1)
cacheroot = '/tmp/bench_gpu_1bn'
if not os.path.isdir(cacheroot):
print("%s does not exist, creating it" % cacheroot)
os.mkdir(cacheroot)
#################################################################
# Small Utility Functions
#################################################################
# we mem-map the biggest files to avoid having them in memory all at
# once
def mmap_fvecs(fname):
x = np.memmap(fname, dtype='int32', mode='r')
d = x[0]
return x.view('float32').reshape(-1, d + 1)[:, 1:]
def mmap_bvecs(fname):
x = np.memmap(fname, dtype='uint8', mode='r')
d = x[:4].view('int32')[0]
return x.reshape(-1, d + 4)[:, 4:]
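# On-disk layout (standard TEXMEX .fvecs/.bvecs format): every vector is stored
# as a 4-byte little-endian int32 giving the dimension d, followed by its d
# components (float32 for .fvecs, uint8 for .bvecs); the slicing above drops
# the size prefix from each row after reshaping.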
def rate_limited_imap(f, l):
"""A threaded imap that does not produce elements faster than they
are consumed"""
pool = ThreadPool(1)
res = None
for i in l:
res_next = pool.apply_async(f, (i, ))
if res:
yield res.get()
res = res_next
yield res.get()
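# Minimal usage sketch (illustrative): while the consumer handles block i, the
# single worker thread is already preparing block i + 1, but never runs further
# ahead than one block:
#
#   for xs in rate_limited_imap(sanitize,
#                               (xb[i:i + 4096] for i in range(0, xb.shape[0], 4096))):
#       index.add(xs)   # hypothetical consumer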
class IdentPreproc:
"""a pre-processor is either a faiss.VectorTransform or an IndentPreproc"""
def __init__(self, d):
self.d_in = self.d_out = d
def apply_py(self, x):
return x
def sanitize(x):
""" convert array to a c-contiguous float array """
return np.ascontiguousarray(x.astype('float32'))
def dataset_iterator(x, preproc, bs):
""" iterate over the lines of x in blocks of size bs"""
nb = x.shape[0]
block_ranges = [(i0, min(nb, i0 + bs))
for i0 in range(0, nb, bs)]
def prepare_block(i01):
i0, i1 = i01
xb = sanitize(x[i0:i1])
return i0, preproc.apply_py(xb)
return rate_limited_imap(prepare_block, block_ranges)
def eval_intersection_measure(gt_I, I):
""" measure intersection measure (used for knngraph)"""
inter = 0
rank = I.shape[1]
assert gt_I.shape[1] >= rank
for q in range(nq_gt):
inter += faiss.ranklist_intersection_size(
rank, faiss.swig_ptr(gt_I[q, :]),
rank, faiss.swig_ptr(I[q, :].astype('int64')))
return inter / float(rank * nq_gt)
#################################################################
# Prepare dataset
#################################################################
print("Preparing dataset", dbname)
if dbname.startswith('SIFT'):
# SIFT1M to SIFT1000M
dbsize = int(dbname[4:-1])
xb = mmap_bvecs('bigann/bigann_base.bvecs')
xq = mmap_bvecs('bigann/bigann_query.bvecs')
xt = mmap_bvecs('bigann/bigann_learn.bvecs')
# trim xb to correct size
xb = xb[:dbsize * 1000 * 1000]
gt_I = ivecs_read('bigann/gnd/idx_%dM.ivecs' % dbsize)
elif dbname == 'Deep1B':
xb = mmap_fvecs('deep1b/base.fvecs')
xq = mmap_fvecs('deep1b/deep1B_queries.fvecs')
xt = mmap_fvecs('deep1b/learn.fvecs')
    # deep1B's train set is outrageously big
xt = xt[:10 * 1000 * 1000]
gt_I = ivecs_read('deep1b/deep1B_groundtruth.ivecs')
else:
print('unknown dataset', dbname, file=sys.stderr)
sys.exit(1)
if knngraph:
# convert to knn-graph dataset
xq = xb
xt = xb
# we compute the ground-truth on this number of queries for validation
nq_gt = 10000
gt_sl = 100
# ground truth will be computed below
gt_I = None
print("sizes: B %s Q %s T %s gt %s" % (
xb.shape, xq.shape, xt.shape,
gt_I.shape if gt_I is not None else None))
#################################################################
# Parse index_key and set cache files
#
# The index_key is a valid factory key that would work, but we
# decompose the training to do it faster
#################################################################
pat = re.compile('(OPQ[0-9]+(_[0-9]+)?,|PCAR[0-9]+,)?' +
'(IVF[0-9]+),' +
'(PQ[0-9]+|Flat)')
matchobject = pat.match(index_key)
assert matchobject, 'could not parse ' + index_key
mog = matchobject.groups()
preproc_str = mog[0]
ivf_str = mog[2]
pqflat_str = mog[3]
ncent = int(ivf_str[3:])
prefix = ''
if knngraph:
gt_cachefile = '%s/BK_gt_%s.npy' % (cacheroot, dbname)
prefix = 'BK_'
# files must be kept distinct because the training set is not the
# same for the knngraph
if preproc_str:
preproc_cachefile = '%s/%spreproc_%s_%s.vectrans' % (
cacheroot, prefix, dbname, preproc_str[:-1])
else:
preproc_cachefile = None
preproc_str = ''
cent_cachefile = '%s/%scent_%s_%s%s.npy' % (
cacheroot, prefix, dbname, preproc_str, ivf_str)
index_cachefile = '%s/%s%s_%s%s,%s.index' % (
cacheroot, prefix, dbname, preproc_str, ivf_str, pqflat_str)
if not use_cache:
preproc_cachefile = None
cent_cachefile = None
index_cachefile = None
print("cachefiles:")
print(preproc_cachefile)
print(cent_cachefile)
print(index_cachefile)
#################################################################
# Wake up GPUs
#################################################################
print("preparing resources for %d GPUs" % ngpu)
gpu_resources = []
for i in range(ngpu):
res = faiss.StandardGpuResources()
if tempmem >= 0:
res.setTempMemory(tempmem)
gpu_resources.append(res)
def make_vres_vdev(i0=0, i1=-1):
" return vectors of device ids and resources useful for gpu_multiple"
vres = faiss.GpuResourcesVector()
vdev = faiss.IntVector()
if i1 == -1:
i1 = ngpu
for i in range(i0, i1):
vdev.push_back(i)
vres.push_back(gpu_resources[i])
return vres, vdev
#################################################################
# Prepare ground truth (for the knngraph)
#################################################################
def compute_GT():
print("compute GT")
t0 = time.time()
gt_I = np.zeros((nq_gt, gt_sl), dtype='int64')
gt_D = np.zeros((nq_gt, gt_sl), dtype='float32')
heaps = faiss.float_maxheap_array_t()
heaps.k = gt_sl
heaps.nh = nq_gt
heaps.val = faiss.swig_ptr(gt_D)
heaps.ids = faiss.swig_ptr(gt_I)
heaps.heapify()
bs = 10 ** 5
n, d = xb.shape
xqs = sanitize(xq[:nq_gt])
db_gt = faiss.IndexFlatL2(d)
vres, vdev = make_vres_vdev()
db_gt_gpu = faiss.index_cpu_to_gpu_multiple(
vres, vdev, db_gt)
# compute ground-truth by blocks of bs, and add to heaps
for i0, xsl in dataset_iterator(xb, IdentPreproc(d), bs):
db_gt_gpu.add(xsl)
D, I = db_gt_gpu.search(xqs, gt_sl)
I += i0
heaps.addn_with_ids(
gt_sl, faiss.swig_ptr(D), faiss.swig_ptr(I), gt_sl)
db_gt_gpu.reset()
print("\r %d/%d, %.3f s" % (i0, n, time.time() - t0), end=' ')
print()
heaps.reorder()
print("GT time: %.3f s" % (time.time() - t0))
return gt_I
if knngraph:
if gt_cachefile and os.path.exists(gt_cachefile):
print("load GT", gt_cachefile)
gt_I = np.load(gt_cachefile)
else:
gt_I = compute_GT()
if gt_cachefile:
print("store GT", gt_cachefile)
np.save(gt_cachefile, gt_I)
#################################################################
# Prepare the vector transformation object (pure CPU)
#################################################################
def train_preprocessor():
print("train preproc", preproc_str)
d = xt.shape[1]
t0 = time.time()
if preproc_str.startswith('OPQ'):
fi = preproc_str[3:-1].split('_')
m = int(fi[0])
dout = int(fi[1]) if len(fi) == 2 else d
preproc = faiss.OPQMatrix(d, m, dout)
elif preproc_str.startswith('PCAR'):
dout = int(preproc_str[4:-1])
preproc = faiss.PCAMatrix(d, dout, 0, True)
else:
assert False
preproc.train(sanitize(xt[:1000000]))
print("preproc train done in %.3f s" % (time.time() - t0))
return preproc
def get_preprocessor():
if preproc_str:
if not preproc_cachefile or not os.path.exists(preproc_cachefile):
preproc = train_preprocessor()
if preproc_cachefile:
print("store", preproc_cachefile)
faiss.write_VectorTransform(preproc, preproc_cachefile)
else:
print("load", preproc_cachefile)
preproc = faiss.read_VectorTransform(preproc_cachefile)
else:
d = xb.shape[1]
preproc = IdentPreproc(d)
return preproc
#################################################################
# Prepare the coarse quantizer
#################################################################
def train_coarse_quantizer(x, k, preproc):
d = preproc.d_out
clus = faiss.Clustering(d, k)
clus.verbose = True
# clus.niter = 2
clus.max_points_per_centroid = 10000000
print("apply preproc on shape", x.shape, 'k=', k)
t0 = time.time()
x = preproc.apply_py(sanitize(x))
print(" preproc %.3f s output shape %s" % (
time.time() - t0, x.shape))
vres, vdev = make_vres_vdev()
index = faiss.index_cpu_to_gpu_multiple(
vres, vdev, faiss.IndexFlatL2(d))
clus.train(x, index)
centroids = faiss.vector_float_to_array(clus.centroids)
return centroids.reshape(k, d)
def prepare_coarse_quantizer(preproc):
if cent_cachefile and os.path.exists(cent_cachefile):
print("load centroids", cent_cachefile)
centroids = np.load(cent_cachefile)
else:
nt = max(1000000, 256 * ncent)
print("train coarse quantizer...")
t0 = time.time()
centroids = train_coarse_quantizer(xt[:nt], ncent, preproc)
print("Coarse train time: %.3f s" % (time.time() - t0))
if cent_cachefile:
print("store centroids", cent_cachefile)
np.save(cent_cachefile, centroids)
coarse_quantizer = faiss.IndexFlatL2(preproc.d_out)
coarse_quantizer.add(centroids)
return coarse_quantizer
#################################################################
# Make index and add elements to it
#################################################################
def prepare_trained_index(preproc):
coarse_quantizer = prepare_coarse_quantizer(preproc)
d = preproc.d_out
if pqflat_str == 'Flat':
print("making an IVFFlat index")
idx_model = faiss.IndexIVFFlat(coarse_quantizer, d, ncent,
faiss.METRIC_L2)
else:
m = int(pqflat_str[2:])
assert m < 56 or use_float16, "PQ%d will work only with -float16" % m
print("making an IVFPQ index, m = ", m)
idx_model = faiss.IndexIVFPQ(coarse_quantizer, d, ncent, m, 8)
coarse_quantizer.this.disown()
idx_model.own_fields = True
# finish training on CPU
t0 = time.time()
print("Training vector codes")
x = preproc.apply_py(sanitize(xt[:1000000]))
idx_model.train(x)
print(" done %.3f s" % (time.time() - t0))
return idx_model
def compute_populated_index(preproc):
"""Add elements to a sharded index. Return the index and if available
a sharded gpu_index that contains the same data. """
indexall = prepare_trained_index(preproc)
co = faiss.GpuMultipleClonerOptions()
co.useFloat16 = use_float16
co.useFloat16CoarseQuantizer = False
co.usePrecomputed = use_precomputed_tables
co.indicesOptions = faiss.INDICES_CPU
co.verbose = True
co.reserveVecs = max_add if max_add > 0 else xb.shape[0]
co.shard = True
assert co.shard_type in (0, 1, 2)
vres, vdev = make_vres_vdev()
gpu_index = faiss.index_cpu_to_gpu_multiple(
vres, vdev, indexall, co)
print("add...")
t0 = time.time()
nb = xb.shape[0]
for i0, xs in dataset_iterator(xb, preproc, add_batch_size):
i1 = i0 + xs.shape[0]
gpu_index.add_with_ids(xs, np.arange(i0, i1))
if max_add > 0 and gpu_index.ntotal > max_add:
print("Flush indexes to CPU")
for i in range(ngpu):
index_src_gpu = faiss.downcast_index(gpu_index.at(i))
index_src = faiss.index_gpu_to_cpu(index_src_gpu)
print(" index %d size %d" % (i, index_src.ntotal))
index_src.copy_subset_to(indexall, 0, 0, nb)
index_src_gpu.reset()
index_src_gpu.reserveMemory(max_add)
gpu_index.sync_with_shard_indexes()
print('\r%d/%d (%.3f s) ' % (
i0, nb, time.time() - t0), end=' ')
sys.stdout.flush()
print("Add time: %.3f s" % (time.time() - t0))
print("Aggregate indexes to CPU")
t0 = time.time()
if hasattr(gpu_index, 'at'):
# it is a sharded index
for i in range(ngpu):
index_src = faiss.index_gpu_to_cpu(gpu_index.at(i))
print(" index %d size %d" % (i, index_src.ntotal))
index_src.copy_subset_to(indexall, 0, 0, nb)
else:
# simple index
index_src = faiss.index_gpu_to_cpu(gpu_index)
index_src.copy_subset_to(indexall, 0, 0, nb)
print(" done in %.3f s" % (time.time() - t0))
if max_add > 0:
# it does not contain all the vectors
gpu_index = None
return gpu_index, indexall
def compute_populated_index_2(preproc):
indexall = prepare_trained_index(preproc)
# set up a 3-stage pipeline that does:
# - stage 1: load + preproc
# - stage 2: assign on GPU
# - stage 3: add to index
stage1 = dataset_iterator(xb, preproc, add_batch_size)
vres, vdev = make_vres_vdev()
coarse_quantizer_gpu = faiss.index_cpu_to_gpu_multiple(
vres, vdev, indexall.quantizer)
def quantize(args):
(i0, xs) = args
_, assign = coarse_quantizer_gpu.search(xs, 1)
return i0, xs, assign.ravel()
stage2 = rate_limited_imap(quantize, stage1)
print("add...")
t0 = time.time()
nb = xb.shape[0]
for i0, xs, assign in stage2:
i1 = i0 + xs.shape[0]
if indexall.__class__ == faiss.IndexIVFPQ:
indexall.add_core_o(i1 - i0, faiss.swig_ptr(xs),
None, None, faiss.swig_ptr(assign))
elif indexall.__class__ == faiss.IndexIVFFlat:
indexall.add_core(i1 - i0, faiss.swig_ptr(xs), None,
faiss.swig_ptr(assign))
else:
assert False
print('\r%d/%d (%.3f s) ' % (
i0, nb, time.time() - t0), end=' ')
sys.stdout.flush()
print("Add time: %.3f s" % (time.time() - t0))
return None, indexall
def get_populated_index(preproc):
if not index_cachefile or not os.path.exists(index_cachefile):
if not altadd:
gpu_index, indexall = compute_populated_index(preproc)
else:
gpu_index, indexall = compute_populated_index_2(preproc)
if index_cachefile:
print("store", index_cachefile)
faiss.write_index(indexall, index_cachefile)
else:
print("load", index_cachefile)
indexall = faiss.read_index(index_cachefile)
gpu_index = None
co = faiss.GpuMultipleClonerOptions()
co.useFloat16 = use_float16
co.useFloat16CoarseQuantizer = False
co.usePrecomputed = use_precomputed_tables
co.indicesOptions = 0
co.verbose = True
co.shard = True # the replicas will be made "manually"
t0 = time.time()
print("CPU index contains %d vectors, move to GPU" % indexall.ntotal)
if replicas == 1:
if not gpu_index:
print("copying loaded index to GPUs")
vres, vdev = make_vres_vdev()
index = faiss.index_cpu_to_gpu_multiple(
vres, vdev, indexall, co)
else:
index = gpu_index
else:
del gpu_index # We override the GPU index
print("Copy CPU index to %d sharded GPU indexes" % replicas)
index = faiss.IndexReplicas()
for i in range(replicas):
            # integer division keeps the GPU range bounds as ints (py3-safe)
            gpu0 = ngpu * i // replicas
            gpu1 = ngpu * (i + 1) // replicas
vres, vdev = make_vres_vdev(gpu0, gpu1)
print(" dispatch to GPUs %d:%d" % (gpu0, gpu1))
index1 = faiss.index_cpu_to_gpu_multiple(
vres, vdev, indexall, co)
index1.this.disown()
index.addIndex(index1)
index.own_fields = True
del indexall
print("move to GPU done in %.3f s" % (time.time() - t0))
return index
#################################################################
# Perform search
#################################################################
def eval_dataset(index, preproc):
ps = faiss.GpuParameterSpace()
ps.initialize(index)
nq_gt = gt_I.shape[0]
print("search...")
sl = query_batch_size
nq = xq.shape[0]
for nprobe in nprobes:
ps.set_index_parameter(index, 'nprobe', nprobe)
t0 = time.time()
if sl == 0:
D, I = index.search(preproc.apply_py(sanitize(xq)), nnn)
else:
I = np.empty((nq, nnn), dtype='int32')
D = np.empty((nq, nnn), dtype='float32')
inter_res = ''
for i0, xs in dataset_iterator(xq, preproc, sl):
print('\r%d/%d (%.3f s%s) ' % (
i0, nq, time.time() - t0, inter_res), end=' ')
sys.stdout.flush()
i1 = i0 + xs.shape[0]
Di, Ii = index.search(xs, nnn)
I[i0:i1] = Ii
D[i0:i1] = Di
if knngraph and not inter_res and i1 >= nq_gt:
ires = eval_intersection_measure(
gt_I[:, :nnn], I[:nq_gt])
inter_res = ', %.4f' % ires
t1 = time.time()
if knngraph:
ires = eval_intersection_measure(gt_I[:, :nnn], I[:nq_gt])
print(" probe=%-3d: %.3f s rank-%d intersection results: %.4f" % (
nprobe, t1 - t0, nnn, ires))
else:
print(" probe=%-3d: %.3f s" % (nprobe, t1 - t0), end=' ')
gtc = gt_I[:, :1]
nq = xq.shape[0]
for rank in 1, 10, 100:
if rank > nnn: continue
nok = (I[:, :rank] == gtc).sum()
print("1-R@%d: %.4f" % (rank, nok / float(nq)), end=' ')
print()
if I_fname:
            I_fname_i = I_fname % nprobe
            print("storing", I_fname_i)
            np.save(I_fname_i, I)
        if D_fname:
            D_fname_i = D_fname % nprobe
            print("storing", D_fname_i)
            np.save(D_fname_i, D)
#################################################################
# Driver
#################################################################
preproc = get_preprocessor()
index = get_populated_index(preproc)
eval_dataset(index, preproc)
# make sure index is deleted before the resources
del index

View File

@@ -0,0 +1,92 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import os
import time
import numpy as np
import faiss
from datasets import load_sift1M, evaluate
print("load data")
xb, xq, xt, gt = load_sift1M()
nq, d = xq.shape
# we need only a StandardGpuResources per GPU
res = faiss.StandardGpuResources()
#################################################################
# Exact search experiment
#################################################################
print("============ Exact search")
flat_config = faiss.GpuIndexFlatConfig()
flat_config.device = 0
index = faiss.GpuIndexFlatL2(res, d, flat_config)
print("add vectors to index")
index.add(xb)
print("warmup")
index.search(xq, 123)
print("benchmark")
for lk in range(11):
k = 1 << lk
t, r = evaluate(index, xq, gt, k)
# the recall should be 1 at all times
print("k=%d %.3f ms, R@1 %.4f" % (k, t, r[1]))
#################################################################
# Approximate search experiment
#################################################################
print("============ Approximate search")
index = faiss.index_factory(d, "IVF4096,PQ64")
# faster, uses more memory
# index = faiss.index_factory(d, "IVF16384,Flat")
co = faiss.GpuClonerOptions()
# here we are using a 64-byte PQ, so we must set the lookup tables to
# 16 bit float (this is due to the limited temporary memory).
co.useFloat16 = True
index = faiss.index_cpu_to_gpu(res, 0, index, co)
print("train")
index.train(xt)
print("add vectors to index")
index.add(xb)
print("warmup")
index.search(xq, 123)
print("benchmark")
for lnprobe in range(10):
nprobe = 1 << lnprobe
    # read the attribute first: a mistyped name raises immediately, whereas
    # assigning to a missing SWIG attribute would silently create a new one
    index.nprobe
    index.nprobe = nprobe
t, r = evaluate(index, xq, gt, 100)
print("nprobe=%4d %.3f ms recalls= %.4f %.4f %.4f" % (nprobe, t, r[1], r[10], r[100]))

View File

@@ -0,0 +1,314 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#include <cstdio>
#include <vector>
#include <cinttypes>
#include <faiss/impl/FaissAssert.h>
#include <faiss/utils/hamming.h>
#include <faiss/utils/random.h>
#include <faiss/utils/utils.h>
using namespace faiss;
// These implementations are currently slower than HammingComputerDefault so
// they are not in the main faiss anymore.
struct HammingComputerM8 {
const uint64_t* a;
int n;
HammingComputerM8() = default;
HammingComputerM8(const uint8_t* a8, int code_size) {
set(a8, code_size);
}
void set(const uint8_t* a8, int code_size) {
assert(code_size % 8 == 0);
a = (uint64_t*)a8;
n = code_size / 8;
}
int hamming(const uint8_t* b8) const {
const uint64_t* b = (uint64_t*)b8;
int accu = 0;
for (int i = 0; i < n; i++)
accu += popcount64(a[i] ^ b[i]);
return accu;
}
inline int get_code_size() const {
return n * 8;
}
};
struct HammingComputerM4 {
const uint32_t* a;
int n;
HammingComputerM4() = default;
HammingComputerM4(const uint8_t* a4, int code_size) {
set(a4, code_size);
}
void set(const uint8_t* a4, int code_size) {
assert(code_size % 4 == 0);
a = (uint32_t*)a4;
n = code_size / 4;
}
int hamming(const uint8_t* b8) const {
const uint32_t* b = (uint32_t*)b8;
int accu = 0;
for (int i = 0; i < n; i++)
accu += popcount64(a[i] ^ b[i]);
return accu;
}
inline int get_code_size() const {
return n * 4;
}
};
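// Usage sketch (illustrative buffers): both structs follow the faiss
// HammingComputer protocol -- bind one code, then stream others through it:
//
//   std::vector<uint8_t> a(16), b(16);            // two 128-bit codes
//   HammingComputerM8 hc(a.data(), (int)a.size());
//   int dist = hc.hamming(b.data());              // popcount of a XOR b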
template <class T>
void hamming_cpt_test(
int code_size,
uint8_t* data1,
uint8_t* data2,
int n,
int* rst) {
T computer(data1, code_size);
for (int i = 0; i < n; i++) {
rst[i] = computer.hamming(data2);
data2 += code_size;
}
}
template <int CODE_SIZE_IN_BITS>
void hamming_func_test(
const uint8_t* const x1,
const uint8_t* const x2,
const size_t n1,
const size_t n2,
uint64_t& sumv,
uint64_t& xorv) {
constexpr size_t CODE_SIZE_IN_BYTES = CODE_SIZE_IN_BITS / 8;
double t0 = faiss::getmillisecs();
uint64_t sumx = 0;
uint64_t xorx = 0;
const size_t nruns = 10;
    for (size_t irun = 0; irun < nruns; irun++) {
#pragma omp parallel reduction(+ : sumx, xorx)
{
#pragma omp for
for (size_t i = 0; i < n1; i++) {
uint64_t local_sum = 0;
uint64_t local_xor = 0;
const uint64_t* data1_ptr =
(const uint64_t*)(x1 + i * CODE_SIZE_IN_BYTES);
for (size_t j = 0; j < n2; j++) {
const uint64_t* data2_ptr =
(const uint64_t*)(x2 + j * CODE_SIZE_IN_BYTES);
uint64_t code = faiss::hamming<CODE_SIZE_IN_BITS>(
data1_ptr, data2_ptr);
local_sum += code;
local_xor ^= code;
}
sumx += local_sum;
xorx ^= local_xor;
}
}
}
sumv = sumx;
xorv = xorx;
double t1 = faiss::getmillisecs();
printf("hamming<%d>: %.3f msec, %" PRIX64 ", %" PRIX64 "\n",
CODE_SIZE_IN_BITS,
(t1 - t0) / nruns,
sumx,
xorx);
}
template <typename HammingComputerT, int CODE_SIZE_IN_BITS>
void hamming_computer_test(
const uint8_t* const x1,
const uint8_t* const x2,
const size_t n1,
const size_t n2,
uint64_t& sumv,
uint64_t& xorv) {
constexpr size_t CODE_SIZE_IN_BYTES = CODE_SIZE_IN_BITS / 8;
double t0 = faiss::getmillisecs();
uint64_t sumx = 0;
uint64_t xorx = 0;
const size_t nruns = 10;
for (size_t irun = 0; irun < nruns; irun++) {
sumx = 0;
xorx = 0;
#pragma omp parallel reduction(+ : sumx, xorx)
{
#pragma omp for
for (size_t i = 0; i < n1; i++) {
uint64_t local_sum = 0;
uint64_t local_xor = 0;
const uint8_t* data1_ptr = x1 + i * CODE_SIZE_IN_BYTES;
HammingComputerT hc(data1_ptr, CODE_SIZE_IN_BYTES);
for (size_t j = 0; j < n2; j++) {
const uint8_t* data2_ptr = x2 + j * CODE_SIZE_IN_BYTES;
uint64_t code = hc.hamming(data2_ptr);
local_sum += code;
local_xor ^= code;
}
sumx += local_sum;
xorx ^= local_xor;
}
}
}
sumv = sumx;
xorv = xorx;
double t1 = faiss::getmillisecs();
printf("HammingComputer<%zd>: %.3f msec, %" PRIX64 ", %" PRIX64 "\n",
CODE_SIZE_IN_BYTES,
(t1 - t0) / nruns,
sumx,
xorx);
}
int main() {
size_t n = 4 * 1000 * 1000;
std::vector<size_t> code_size = {128, 256, 512, 1000};
std::vector<uint8_t> x(n * code_size.back());
    // randomize the whole buffer, not just the first n bytes
    byte_rand(x.data(), x.size(), 12345);
int nrun = 100;
for (size_t cs : code_size) {
printf("benchmark with code_size=%zd n=%zd nrun=%d\n", cs, n, nrun);
double tot_t1 = 0, tot_t2 = 0, tot_t3 = 0;
#pragma omp parallel reduction(+ : tot_t1, tot_t2, tot_t3)
{
std::vector<int> rst_m4(n);
std::vector<int> rst_m8(n);
std::vector<int> rst_default(n);
#pragma omp for
for (int run = 0; run < nrun; run++) {
double t0, t1, t2, t3;
t0 = getmillisecs();
// new implem from Zilliz
hamming_cpt_test<HammingComputerDefault>(
cs, x.data(), x.data(), n, rst_default.data());
t1 = getmillisecs();
// M8
hamming_cpt_test<HammingComputerM8>(
cs, x.data(), x.data(), n, rst_m8.data());
t2 = getmillisecs();
// M4
hamming_cpt_test<HammingComputerM4>(
cs, x.data(), x.data(), n, rst_m4.data());
t3 = getmillisecs();
tot_t1 += t1 - t0;
tot_t2 += t2 - t1;
tot_t3 += t3 - t2;
}
for (int i = 0; i < n; i++) {
FAISS_THROW_IF_NOT_FMT(
(rst_m4[i] == rst_m8[i] && rst_m4[i] == rst_default[i]),
"wrong result i=%d, m4 %d m8 %d default %d",
i,
rst_m4[i],
rst_m8[i],
rst_default[i]);
}
}
printf("Hamming_Dft implem: %.3f ms\n", tot_t1 / nrun);
printf("Hamming_M8 implem: %.3f ms\n", tot_t2 / nrun);
printf("Hamming_M4 implem: %.3f ms\n", tot_t3 / nrun);
}
// evaluate various hamming<>() function calls
const size_t MAX_HAMMING_FUNC_CODE_SIZE = 512;
const size_t n1 = 65536;
const size_t n2 = 16384;
std::vector<uint8_t> x1(n1 * MAX_HAMMING_FUNC_CODE_SIZE / 8);
std::vector<uint8_t> x2(n2 * MAX_HAMMING_FUNC_CODE_SIZE / 8);
byte_rand(x1.data(), x1.size(), 12345);
byte_rand(x2.data(), x2.size(), 23456);
// These two values serve as a kind of CRC.
uint64_t sumx = 0;
uint64_t xorx = 0;
hamming_func_test<64>(x1.data(), x2.data(), n1, n2, sumx, xorx);
hamming_func_test<128>(x1.data(), x2.data(), n1, n2, sumx, xorx);
hamming_func_test<256>(x1.data(), x2.data(), n1, n2, sumx, xorx);
hamming_func_test<384>(x1.data(), x2.data(), n1, n2, sumx, xorx);
hamming_func_test<512>(x1.data(), x2.data(), n1, n2, sumx, xorx);
// evaluate various HammingComputerXX
hamming_computer_test<faiss::HammingComputer4, 32>(
x1.data(), x2.data(), n1, n2, sumx, xorx);
hamming_computer_test<faiss::HammingComputer8, 64>(
x1.data(), x2.data(), n1, n2, sumx, xorx);
hamming_computer_test<faiss::HammingComputer16, 128>(
x1.data(), x2.data(), n1, n2, sumx, xorx);
hamming_computer_test<faiss::HammingComputer20, 160>(
x1.data(), x2.data(), n1, n2, sumx, xorx);
hamming_computer_test<faiss::HammingComputer32, 256>(
x1.data(), x2.data(), n1, n2, sumx, xorx);
hamming_computer_test<faiss::HammingComputer64, 512>(
x1.data(), x2.data(), n1, n2, sumx, xorx);
// evaluate various GenHammingDistanceComputerXX
hamming_computer_test<faiss::GenHammingComputer8, 64>(
x1.data(), x2.data(), n1, n2, sumx, xorx);
hamming_computer_test<faiss::GenHammingComputer16, 128>(
x1.data(), x2.data(), n1, n2, sumx, xorx);
hamming_computer_test<faiss::GenHammingComputer32, 256>(
x1.data(), x2.data(), n1, n2, sumx, xorx);
hamming_computer_test<faiss::GenHammingComputerM8, 64>(
x1.data(), x2.data(), n1, n2, sumx, xorx);
hamming_computer_test<faiss::GenHammingComputerM8, 128>(
x1.data(), x2.data(), n1, n2, sumx, xorx);
hamming_computer_test<faiss::GenHammingComputerM8, 256>(
x1.data(), x2.data(), n1, n2, sumx, xorx);
hamming_computer_test<faiss::GenHammingComputerM8, 512>(
x1.data(), x2.data(), n1, n2, sumx, xorx);
return 0;
}

View File

@@ -0,0 +1,29 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import time
import numpy as np
import faiss
if __name__ == "__main__":
faiss.omp_set_num_threads(1)
for d in 4, 8, 16, 13:
nq = 10000
nb = 30000
print('Bits per vector = 8 *', d)
xq = faiss.randint((nq, d // 4), seed=1234, vmax=256**4).view('uint8')
xb = faiss.randint((nb, d // 4), seed=1234, vmax=256**4).view('uint8')
for variant in "hc", "mc":
print(f"{variant=:}", end="\t")
for k in 1, 4, 16, 64, 256:
times = []
for _run in range(5):
t0 = time.time()
D, I = faiss.knn_hamming(xq, xb, k, variant=variant)
t1 = time.time()
times.append(t1 - t0)
print(f'| {k=:} t={np.mean(times):.3f} s ± {np.std(times):.3f} ', flush=True, end="")
print()

View File

@@ -0,0 +1,139 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#include <cstdio>
#include <faiss/impl/FaissAssert.h>
#include <faiss/utils/Heap.h>
#include <faiss/utils/random.h>
#include <faiss/utils/utils.h>
using namespace faiss;
// Reference implementation: keep the k largest entries of x in a min-heap
// whose root is the smallest kept value; pop + push on each improvement.
void addn_default(
size_t n,
size_t k,
const float* x,
int64_t* heap_ids,
float* heap_val) {
for (size_t i = 0; i < k; i++) {
minheap_push(i + 1, heap_val, heap_ids, x[i], i);
}
for (size_t i = k; i < n; i++) {
if (x[i] > heap_val[0]) {
minheap_pop(k, heap_val, heap_ids);
minheap_push(k, heap_val, heap_ids, x[i], i);
}
}
minheap_reorder(k, heap_val, heap_ids);
}
void addn_replace(
size_t n,
size_t k,
const float* x,
int64_t* heap_ids,
float* heap_val) {
for (size_t i = 0; i < k; i++) {
minheap_push(i + 1, heap_val, heap_ids, x[i], i);
}
for (size_t i = k; i < n; i++) {
if (x[i] > heap_val[0]) {
minheap_replace_top(k, heap_val, heap_ids, x[i], i);
}
}
minheap_reorder(k, heap_val, heap_ids);
}
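// Note: minheap_replace_top above fuses the pop + push of addn_default into a
// single top-down sift, roughly halving the heap work per replaced element.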
void addn_func(
size_t n,
size_t k,
const float* x,
int64_t* heap_ids,
float* heap_val) {
minheap_heapify(k, heap_val, heap_ids);
minheap_addn(k, heap_val, heap_ids, x, nullptr, n);
minheap_reorder(k, heap_val, heap_ids);
}
int main() {
size_t n = 10 * 1000 * 1000;
std::vector<size_t> ks({20, 50, 100, 200, 500, 1000, 2000, 5000});
std::vector<float> x(n);
float_randn(x.data(), n, 12345);
int nrun = 100;
for (size_t k : ks) {
printf("benchmark with k=%zd n=%zd nrun=%d\n", k, n, nrun);
FAISS_THROW_IF_NOT(k < n);
double tot_t1 = 0, tot_t2 = 0, tot_t3 = 0;
#pragma omp parallel reduction(+ : tot_t1, tot_t2, tot_t3)
{
std::vector<float> heap_dis(k);
std::vector<float> heap_dis_2(k);
std::vector<float> heap_dis_3(k);
std::vector<int64_t> heap_ids(k);
std::vector<int64_t> heap_ids_2(k);
std::vector<int64_t> heap_ids_3(k);
#pragma omp for
for (int run = 0; run < nrun; run++) {
double t0, t1, t2, t3;
t0 = getmillisecs();
// default implem
addn_default(n, k, x.data(), heap_ids.data(), heap_dis.data());
t1 = getmillisecs();
// new implem from Zilliz
addn_replace(
n, k, x.data(), heap_ids_2.data(), heap_dis_2.data());
t2 = getmillisecs();
// with addn
addn_func(n, k, x.data(), heap_ids_3.data(), heap_dis_3.data());
t3 = getmillisecs();
tot_t1 += t1 - t0;
tot_t2 += t2 - t1;
tot_t3 += t3 - t2;
}
for (size_t i = 0; i < k; i++) {
FAISS_THROW_IF_NOT_FMT(
heap_ids[i] == heap_ids_2[i],
"i=%ld (%ld, %g) != (%ld, %g)",
i,
size_t(heap_ids[i]),
heap_dis[i],
size_t(heap_ids_2[i]),
heap_dis_2[i]);
FAISS_THROW_IF_NOT(heap_dis[i] == heap_dis_2[i]);
}
for (size_t i = 0; i < k; i++) {
FAISS_THROW_IF_NOT(heap_ids[i] == heap_ids_3[i]);
FAISS_THROW_IF_NOT(heap_dis[i] == heap_dis_3[i]);
}
}
printf("default implem: %.3f ms\n", tot_t1 / nrun);
printf("replace implem: %.3f ms\n", tot_t2 / nrun);
printf("addn implem: %.3f ms\n", tot_t3 / nrun);
}
return 0;
}

View File

@@ -0,0 +1,192 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import time
import sys
import numpy as np
import faiss
try:
from faiss.contrib.datasets_fb import DatasetSIFT1M
except ImportError:
from faiss.contrib.datasets import DatasetSIFT1M
# from datasets import load_sift1M
k = int(sys.argv[1])
todo = sys.argv[2:]
print("load data")
# xb, xq, xt, gt = load_sift1M()
ds = DatasetSIFT1M()
xq = ds.get_queries()
xb = ds.get_database()
gt = ds.get_groundtruth()
xt = ds.get_train()
nq, d = xq.shape
if todo == []:
todo = 'hnsw hnsw_sq ivf ivf_hnsw_quantizer kmeans kmeans_hnsw nsg'.split()
def evaluate(index):
# for timing with a single core
# faiss.omp_set_num_threads(1)
t0 = time.time()
D, I = index.search(xq, k)
t1 = time.time()
missing_rate = (I == -1).sum() / float(k * nq)
recall_at_1 = (I == gt[:, :1]).sum() / float(nq)
print("\t %7.3f ms per query, R@1 %.4f, missing rate %.4f" % (
(t1 - t0) * 1000.0 / nq, recall_at_1, missing_rate))
if 'hnsw' in todo:
print("Testing HNSW Flat")
index = faiss.IndexHNSWFlat(d, 32)
# training is not needed
# this is the default, higher is more accurate and slower to
# construct
index.hnsw.efConstruction = 40
print("add")
# to see progress
index.verbose = True
index.add(xb)
print("search")
for efSearch in 16, 32, 64, 128, 256:
for bounded_queue in [True, False]:
print("efSearch", efSearch, "bounded queue", bounded_queue, end=' ')
index.hnsw.search_bounded_queue = bounded_queue
index.hnsw.efSearch = efSearch
evaluate(index)
if 'hnsw_sq' in todo:
print("Testing HNSW with a scalar quantizer")
# also set M so that the vectors and links both use 128 bytes per
# entry (total 256 bytes)
index = faiss.IndexHNSWSQ(d, faiss.ScalarQuantizer.QT_8bit, 16)
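    # worked sizing (sketch): QT_8bit stores 1 byte/dim * 128 dims = 128 B of
    # codes; M=16 gives 2*M = 32 level-0 links * 4-byte ids = 128 B of links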
print("training")
# training for the scalar quantizer
index.train(xt)
# this is the default, higher is more accurate and slower to
# construct
index.hnsw.efConstruction = 40
print("add")
# to see progress
index.verbose = True
index.add(xb)
print("search")
for efSearch in 16, 32, 64, 128, 256:
print("efSearch", efSearch, end=' ')
index.hnsw.efSearch = efSearch
evaluate(index)
if 'ivf' in todo:
print("Testing IVF Flat (baseline)")
quantizer = faiss.IndexFlatL2(d)
index = faiss.IndexIVFFlat(quantizer, d, 16384)
index.cp.min_points_per_centroid = 5 # quiet warning
# to see progress
index.verbose = True
print("training")
index.train(xt)
print("add")
index.add(xb)
print("search")
for nprobe in 1, 4, 16, 64, 256:
print("nprobe", nprobe, end=' ')
index.nprobe = nprobe
evaluate(index)
if 'ivf_hnsw_quantizer' in todo:
print("Testing IVF Flat with HNSW quantizer")
quantizer = faiss.IndexHNSWFlat(d, 32)
index = faiss.IndexIVFFlat(quantizer, d, 16384)
index.cp.min_points_per_centroid = 5 # quiet warning
    # 2: run k-means on a flat index, then add the centroids to the HNSW quantizer
    index.quantizer_trains_alone = 2
# to see progress
index.verbose = True
print("training")
index.train(xt)
print("add")
index.add(xb)
print("search")
quantizer.hnsw.efSearch = 64
for nprobe in 1, 4, 16, 64, 256:
print("nprobe", nprobe, end=' ')
index.nprobe = nprobe
evaluate(index)
# Bonus: 2 kmeans tests
if 'kmeans' in todo:
print("Performing kmeans on sift1M database vectors (baseline)")
clus = faiss.Clustering(d, 16384)
clus.verbose = True
clus.niter = 10
index = faiss.IndexFlatL2(d)
clus.train(xb, index)
if 'kmeans_hnsw' in todo:
print("Performing kmeans on sift1M using HNSW assignment")
clus = faiss.Clustering(d, 16384)
clus.verbose = True
clus.niter = 10
index = faiss.IndexHNSWFlat(d, 32)
# increase the default efSearch, otherwise the number of empty
# clusters is too high.
index.hnsw.efSearch = 128
clus.train(xb, index)
if 'nsg' in todo:
print("Testing NSG Flat")
index = faiss.IndexNSGFlat(d, 32)
index.build_type = 1
# training is not needed
print("add")
# to see progress
index.verbose = True
index.add(xb)
print("search")
for search_L in -1, 16, 32, 64, 128, 256:
print("search_L", search_L, end=' ')
index.nsg.search_L = search_L
evaluate(index)

View File

@@ -0,0 +1,599 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import argparse
import os
import pickle
import time
from multiprocessing.pool import ThreadPool
import faiss
import numpy as np
try:
from faiss.contrib.datasets_fb import dataset_from_name
except ImportError:
from faiss.contrib.datasets import dataset_from_name
from faiss.contrib.evaluation import OperatingPointsWithRanges
from faiss.contrib.ivf_tools import replace_ivf_quantizer
#################################################################
# Preassigned search functions
#################################################################
def search_preassigned(xq, k, index, quantizer, batch_size=0):
"""
Explicitly call the coarse quantizer and the search_preassigned
on the index.
"""
n, d = xq.shape
nprobe = index.nprobe
if batch_size == 0:
batch_size = n + 1
D = np.empty((n, k), dtype='float32')
I = np.empty((n, k), dtype='int64')
for i0 in range(0, n, batch_size):
Dq, Iq = quantizer.search(xq[i0:i0 + batch_size], nprobe)
D[i0:i0 + batch_size], I[i0:i0 + batch_size] = \
index.search_preassigned(xq[i0:i0 + batch_size], k, Iq, Dq)
return D, I
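# Usage sketch (hypothetical index and queries): when `quantizer` holds the
# same centroids as index.quantizer, this is equivalent to index.search(xq, k):
#
#   quantizer = faiss.IndexFlatL2(index.d)
#   quantizer.add(index.quantizer.reconstruct_n())
#   D, I = search_preassigned(xq, 10, index, quantizer, batch_size=8192)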
def tiled_search_preassigned(xq, k, index, quantizer, batch_size=32768):
"""
Explicitly call the coarse quantizer and the search_preassigned
on the index. Allow overlapping between coarse quantization and
scanning the inverted lists.
"""
n, d = xq.shape
# prepare a thread that will run the quantizer
qq_pool = ThreadPool(1)
nprobe = index.nprobe
def coarse_quant(i0):
if i0 >= n:
return None
i1 = min(i0 + batch_size, n)
return quantizer.search(xq[i0:i1], nprobe)
D = np.empty((n, k), dtype='float32')
I = np.empty((n, k), dtype='int64')
qq = coarse_quant(0)
for i0 in range(0, n, batch_size):
i1 = min(i0 + batch_size, n)
qq_next = qq_pool.apply_async(coarse_quant, (i0 + batch_size, ))
Dq, Iq = qq
index.search_preassigned(
xq[i0:i1], k, Iq=Iq, Dq=Dq, I=I[i0:i1], D=D[i0:i1])
qq = qq_next.get()
qq_pool.close()
return D, I
#################################################################
# IVF index objects with a separate coarse quantizer
#################################################################
class SeparateCoarseQuantizationIndex:
"""
Separately manage the coarse quantizer and the IVF index.
"""
def __init__(self, quantizer, index, bs=-1, seq_tiling=False):
self.index = index
self.index_ivf = extract_index_ivf(index)
if isinstance(self.index_ivf, faiss.IndexIVF):
            # read first so a missing attribute raises; parallel_mode 3 splits
            # the work over queries at a finer granularity than the default
            self.index_ivf.parallel_mode
            self.index_ivf.parallel_mode = 3
self.quantizer = quantizer
assert self.quantizer.d == self.index_ivf.d
# populate quantizer if it was not done before
if quantizer.ntotal > 0:
assert quantizer.ntotal == self.index_ivf.nlist
else:
centroids = self.index_ivf.quantizer.reconstruct_n()
print(f"adding centroids size {centroids.shape} to quantizer")
quantizer.train(centroids)
quantizer.add(centroids)
self.bs = bs
self.seq_tiling = seq_tiling
def search(self, xq, k):
# perform coarse quantization
if isinstance(self.index, faiss.IndexPreTransform):
# print("applying pre-transform")
assert self.index.chain.size() == 1
xq = self.index.chain.at(0).apply(xq)
if self.bs <= 0:
# non batched
nprobe = self.index_ivf.nprobe
Dq, Iq = self.quantizer.search(xq, nprobe)
return self.index_ivf.search_preassigned(xq, k, Iq, Dq)
if self.seq_tiling:
return search_preassigned(
xq, k, self.index_ivf, self.quantizer, self.bs)
else:
return tiled_search_preassigned(
xq, k, self.index_ivf, self.quantizer, self.bs)
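# Usage sketch (hypothetical): pair a GPU flat quantizer with a CPU IVF index;
# assumes `cpu_ivf` is a trained and populated faiss.IndexIVF:
#
#   res = faiss.StandardGpuResources()
#   gpu_quantizer = faiss.GpuIndexFlatL2(res, cpu_ivf.d)
#   hybrid = SeparateCoarseQuantizationIndex(gpu_quantizer, cpu_ivf, bs=65536)
#   D, I = hybrid.search(xq, 10)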
class ShardedGPUIndex:
"""
Multiple GPU indexes, each on its GPU, with a common coarse quantizer.
The Python version of IndexShardsIVF
"""
def __init__(self, quantizer, index, bs=-1, seq_tiling=False):
self.quantizer = quantizer
self.cpu_index = index
if isinstance(index, faiss.IndexPreTransform):
index = faiss.downcast_index(index.index)
ngpu = index.count()
self.pool = ThreadPool(ngpu)
self.bs = bs
if bs > 0:
self.q_pool = ThreadPool(1)
def __del__(self):
self.pool.close()
if self.bs > 0:
self.q_pool.close()
def search(self, xq, k):
nq = len(xq)
# perform coarse quantization
index = self.cpu_index
if isinstance(self.cpu_index, faiss.IndexPreTransform):
assert index.chain.size() == 1
xq = self.cpu_index.chain.at(0).apply(xq)
index = faiss.downcast_index(index.index)
ngpu = index.count()
sub_index_0 = faiss.downcast_index(index.at(0))
nprobe = sub_index_0.nprobe
Dall = np.empty((ngpu, nq, k), dtype='float32')
Iall = np.empty((ngpu, nq, k), dtype='int64')
bs = self.bs
if bs <= 0:
Dq, Iq = self.quantizer.search(xq, nprobe)
def do_search(rank):
gpu_index = faiss.downcast_index(index.at(rank))
Dall[rank], Iall[rank] = gpu_index.search_preassigned(
xq, k, Iq, Dq)
list(self.pool.map(do_search, range(ngpu)))
else:
qq_pool = self.q_pool
bs = self.bs
def coarse_quant(i0):
if i0 >= nq:
return None
return self.quantizer.search(xq[i0:i0 + bs], nprobe)
def do_search(rank, i0, qq):
gpu_index = faiss.downcast_index(index.at(rank))
Dq, Iq = qq
Dall[rank, i0:i0 + bs], Iall[rank, i0:i0 + bs] = \
gpu_index.search_preassigned(xq[i0:i0 + bs], k, Iq, Dq)
qq = coarse_quant(0)
for i0 in range(0, nq, bs):
qq_next = qq_pool.apply_async(coarse_quant, (i0 + bs, ))
list(self.pool.map(
lambda rank: do_search(rank, i0, qq),
range(ngpu)
))
qq = qq_next.get()
return faiss.merge_knn_results(Dall, Iall)
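# Note on the merge step above (hedged reading): faiss.merge_knn_results
# collapses the per-shard (ngpu, nq, k) distance/label arrays into a single
# (nq, k) result, keeping the k best entries per query across shards.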
def extract_index_ivf(index):
""" extract the IVF sub-index from the index, supporting GpuIndexes
as well """
try:
return faiss.extract_index_ivf(index)
except RuntimeError:
if index.__class__ == faiss.IndexPreTransform:
index = faiss.downcast_index(index.index)
if isinstance(index, faiss.GpuIndexIVF):
return index
raise RuntimeError(f"could not extract IVF index from {index}")
def set_index_parameter(index, name, val):
"""
Index parameter setting that works on the index lookalikes defined above
"""
if index.__class__ == SeparateCoarseQuantizationIndex:
if name == "nprobe":
set_index_parameter(index.index_ivf, name, val)
elif name.startswith("quantizer_"):
set_index_parameter(
index.quantizer, name[name.find("_") + 1:], val)
else:
raise RuntimeError(f"could not set param {name} on {index}")
return
if index.__class__ == ShardedGPUIndex:
if name == "nprobe":
set_index_parameter(index.cpu_index, name, val)
elif name.startswith("quantizer_"):
set_index_parameter(
index.quantizer, name[name.find("_") + 1:], val)
else:
raise RuntimeError(f"could not set param {name} on {index}")
return
# then it's a Faiss index
index = faiss.downcast_index(index)
if isinstance(index, faiss.IndexPreTransform):
set_index_parameter(index.index, name, val)
elif isinstance(index, faiss.IndexShardsIVF):
if name != "nprobe" and name.startswith("quantizer_"):
set_index_parameter(
index.quantizer, name[name.find("_") + 1:], val)
else:
for i in range(index.count()):
sub_index = index.at(i)
set_index_parameter(sub_index, name, val)
elif (isinstance(index, faiss.IndexShards) or
isinstance(index, faiss.IndexReplicas)):
for i in range(index.count()):
sub_index = index.at(i)
set_index_parameter(sub_index, name, val)
elif name.startswith("quantizer_"):
index_ivf = extract_index_ivf(index)
set_index_parameter(
index_ivf.quantizer, name[name.find("_") + 1:], val)
elif name == "efSearch":
index.hnsw.efSearch
index.hnsw.efSearch = int(val)
elif name == "nprobe":
index_ivf = extract_index_ivf(index)
index_ivf.nprobe
index_ivf.nprobe = int(val)
else:
raise RuntimeError(f"could not set param {name} on {index}")
#####################################################################
# Driver routine
#####################################################################
def main():
parser = argparse.ArgumentParser()
def aa(*args, **kwargs):
group.add_argument(*args, **kwargs)
group = parser.add_argument_group('dataset options')
aa('--nq', type=int, default=10**6,
help="nb queries (queries will be duplicated if below that number)")
aa('--db', default='bigann10M', help='dataset')
group = parser.add_argument_group('index options')
aa('--indexname', default="", help="override index name")
aa('--mmap', default=False, action='store_true', help='mmap index')
aa('--shard_type', default=1, type=int, help="set type of sharding")
aa('--useFloat16', default=False, action='store_true',
help='GPU cloner options')
aa('--useFloat16CoarseQuantizer', default=False, action='store_true',
help='GPU cloner options')
aa('--usePrecomputed', default=False, action='store_true',
help='GPU cloner options')
group = parser.add_argument_group('search options')
aa('--k', type=int, default=100)
aa('--search_type', default="cpu",
choices=[
"cpu", "gpu", "gpu_flat_quantizer",
"cpu_flat_gpu_quantizer", "gpu_tiled", "gpu_ivf_quantizer",
"multi_gpu", "multi_gpu_flat_quantizer",
"multi_gpu_sharded", "multi_gpu_flat_quantizer_sharded",
"multi_gpu_sharded1", "multi_gpu_sharded1_flat",
"multi_gpu_sharded1_ivf",
"multi_gpu_Csharded1", "multi_gpu_Csharded1_flat",
"multi_gpu_Csharded1_ivf",
],
help="how to search"
)
aa('--ivf_quant_nlist', type=int, default=1024,
help="nb of invlists for IVF quantizer")
aa('--batch_size', type=int, default=-1,
help="batch size for tiled CPU / GPU computation (-1= no tiling)")
aa('--n_autotune', type=int, default=300,
help="max nb of auto-tuning steps")
aa('--nt', type=int, default=-1, help="force number of CPU threads to this")
group = parser.add_argument_group('output options')
aa('--quiet', default=False, action="store_true")
aa('--stats', default="", help="pickle to store output stats")
args = parser.parse_args()
print("args:", args)
if not args.quiet:
# log some stats about the machine
os.system("grep -m1 'model name' < /proc/cpuinfo")
os.system("grep -E 'MemTotal|MemFree' /proc/meminfo")
os.system("nvidia-smi")
print("prepare dataset", args.db)
ds = dataset_from_name(args.db)
print(ds)
print("Faiss nb GPUs:", faiss.get_num_gpus())
xq = ds.get_queries()
if args.nq > len(xq):
xqx = []
n = 0
while n < args.nq:
xqx.append(xq[:args.nq - n])
n += len(xqx[-1])
print(f"increased nb queries from {len(xq)} to {n}")
xq = np.vstack(xqx)
if args.nt != -1:
print("setting nb openmp threads to", args.nt)
faiss.omp_set_num_threads(args.nt)
print("loading index")
if args.mmap:
io_flag = faiss.IO_FLAG_READ_ONLY | faiss.IO_FLAG_MMAP
else:
io_flag = 0
print(f"load index {args.indexname} {io_flag=:x}")
index = faiss.read_index(args.indexname, io_flag)
index_ivf = faiss.extract_index_ivf(index)
print("prepare index")
op = OperatingPointsWithRanges()
op.add_range(
"nprobe", [
2 ** i for i in range(20)
if 2 ** i < index_ivf.nlist * 0.1 and 2 ** i <= 4096
]
)
# prepare options for GPU clone
co = faiss.GpuMultipleClonerOptions()
co.useFloat16 = args.useFloat16
co.useFloat16CoarseQuantizer = args.useFloat16CoarseQuantizer
co.usePrecomputed = args.usePrecomputed
co.shard_type = args.shard_type
if args.search_type == "cpu":
op.add_range(
"quantizer_efSearch",
[2 ** i for i in range(10)]
)
elif args.search_type == "gpu":
print("move index to 1 GPU")
res = faiss.StandardGpuResources()
index = faiss.index_cpu_to_gpu(res, 0, index, co)
op.add_range(
"quantizer_efSearch",
[2 ** i for i in range(10)]
)
op.restrict_range("nprobe", 2049)
elif args.search_type == "gpu_tiled":
print("move index to 1 GPU")
new_quantizer = faiss.IndexFlatL2(index_ivf.d)
quantizer_hnsw = replace_ivf_quantizer(index_ivf, new_quantizer)
res = faiss.StandardGpuResources()
index = faiss.index_cpu_to_gpu(res, 0, index, co)
op.add_range(
"quantizer_efSearch",
[2 ** i for i in range(10)]
)
op.restrict_range("nprobe", 2049)
index = SeparateCoarseQuantizationIndex(
quantizer_hnsw, index, bs=args.batch_size)
elif args.search_type == "gpu_ivf_quantizer":
index_ivf = faiss.extract_index_ivf(index)
centroids = index_ivf.quantizer.reconstruct_n()
replace_ivf_quantizer(index_ivf, faiss.IndexFlatL2(index_ivf.d))
res = faiss.StandardGpuResources()
new_quantizer = faiss.index_factory(
index_ivf.d, f"IVF{args.ivf_quant_nlist},Flat")
new_quantizer.train(centroids)
new_quantizer.add(centroids)
index = SeparateCoarseQuantizationIndex(
faiss.index_cpu_to_gpu(res, 0, new_quantizer, co),
faiss.index_cpu_to_gpu(res, 0, index, co),
bs=args.batch_size, seq_tiling=True
)
op.add_range(
"quantizer_nprobe",
[2 ** i for i in range(9)]
)
op.restrict_range("nprobe", 1025)
elif args.search_type == "gpu_flat_quantizer":
index_ivf = faiss.extract_index_ivf(index)
new_quantizer = faiss.IndexFlatL2(index_ivf.d)
replace_ivf_quantizer(index_ivf, new_quantizer)
res = faiss.StandardGpuResources()
index = faiss.index_cpu_to_gpu(res, 0, index, co)
op.restrict_range("nprobe", 2049)
elif args.search_type == "cpu_flat_gpu_quantizer":
index_ivf = faiss.extract_index_ivf(index)
quantizer = faiss.IndexFlatL2(index_ivf.d)
res = faiss.StandardGpuResources()
quantizer = faiss.index_cpu_to_gpu(res, 0, quantizer, co)
index = SeparateCoarseQuantizationIndex(
quantizer, index, bs=args.batch_size)
op.restrict_range("nprobe", 2049)
elif args.search_type in ("multi_gpu", "multi_gpu_sharded"):
print(f"move index to {faiss.get_num_gpus()} GPU")
co.shard = "sharded" in args.search_type
index = faiss.index_cpu_to_all_gpus(index, co=co)
op.add_range(
"quantizer_efSearch",
[2 ** i for i in range(10)]
)
op.restrict_range("nprobe", 2049)
elif args.search_type in (
"multi_gpu_flat_quantizer", "multi_gpu_flat_quantizer_sharded"):
index_ivf = faiss.extract_index_ivf(index)
new_quantizer = faiss.IndexFlatL2(ds.d)
replace_ivf_quantizer(index_ivf, new_quantizer)
index = faiss.index_cpu_to_all_gpus(index, co=co)
op.restrict_range("nprobe", 2049)
elif args.search_type in (
"multi_gpu_sharded1", "multi_gpu_sharded1_flat",
"multi_gpu_sharded1_ivf"):
print(f"move index to {faiss.get_num_gpus()} GPU")
new_quantizer = faiss.IndexFlatL2(index_ivf.d)
hnsw_quantizer = replace_ivf_quantizer(index_ivf, new_quantizer)
co.shard  # touch first: a typo would raise here
co.shard = True
gpus = list(range(faiss.get_num_gpus()))
res = [faiss.StandardGpuResources() for _ in gpus]
index = faiss.index_cpu_to_gpu_multiple_py(res, index, co, gpus)
op.restrict_range("nprobe", 2049)
if args.search_type == "multi_gpu_sharded1":
op.add_range(
"quantizer_efSearch",
[2 ** i for i in range(10)]
)
index = ShardedGPUIndex(hnsw_quantizer, index, bs=args.batch_size)
elif args.search_type == "multi_gpu_sharded1_ivf":
centroids = hnsw_quantizer.storage.reconstruct_n()
quantizer = faiss.index_factory(
centroids.shape[1], f"IVF{args.ivf_quant_nlist},Flat")
quantizer.train(centroids)
quantizer.add(centroids)
co.shard = False
quantizer = faiss.index_cpu_to_gpu_multiple_py(
res, quantizer, co, gpus)
index = ShardedGPUIndex(quantizer, index, bs=args.batch_size)
op.add_range(
"quantizer_nprobe",
[2 ** i for i in range(9)]
)
op.restrict_range("nprobe", 1025)
elif args.search_type == "multi_gpu_sharded1_flat":
quantizer = hnsw_quantizer.storage
quantizer = faiss.index_cpu_to_gpu_multiple_py(
res, quantizer, co, gpus)
index = ShardedGPUIndex(quantizer, index, bs=args.batch_size)
else:
raise RuntimeError()
elif args.search_type in (
"multi_gpu_Csharded1", "multi_gpu_Csharded1_flat",
"multi_gpu_Csharded1_ivf"):
print(f"move index to {faiss.get_num_gpus()} GPU")
co.shard = True
co.common_ivf_quantizer  # touch first: a typo would raise here
co.common_ivf_quantizer = True
op.restrict_range("nprobe", 2049)
if args.search_type == "multi_gpu_Csharded1":
op.add_range(
"quantizer_efSearch",
[2 ** i for i in range(10)]
)
index = faiss.index_cpu_to_all_gpus(index, co)
elif args.search_type == "multi_gpu_Csharded1_flat":
new_quantizer = faiss.IndexFlatL2(index_ivf.d)
quantizer_hnsw = replace_ivf_quantizer(index_ivf, new_quantizer)
index = faiss.index_cpu_to_all_gpus(index, co)
elif args.search_type == "multi_gpu_Csharded1_ivf":
quantizer = faiss.index_factory(
index_ivf.d, f"IVF{args.ivf_quant_nlist},Flat")
quantizer_hnsw = replace_ivf_quantizer(index_ivf, quantizer)
op.add_range(
"quantizer_nprobe",
[2 ** i for i in range(9)]
)
index = faiss.index_cpu_to_all_gpus(index, co)
else:
raise RuntimeError()
else:
raise RuntimeError()
totex = op.num_experiments()
experiments = op.sample_experiments()
print(f"total nb experiments {totex}, running {len(experiments)}")
print("perform search")
gt = ds.get_groundtruth(100)
# piggyback on operating points so that this gets stored in the stats file
op.all_experiments = []
op.platform = {
"loadavg": open("/proc/loadavg", "r").readlines(),
"procesor": [l for l in open("/proc/cpuinfo") if "model name" in l][0],
"GPU": list(os.popen("nvidia-smi", "r")),
"mem": open("/proc/meminfo", "r").readlines(),
"pid": os.getpid()
}
op.args = args
if args.stats:
print(f"storing stats in {args.stats} after each experiment")
for cno in experiments:
key = op.cno_to_key(cno)
parameters = op.get_parameters(key)
print(f"{cno=:4d} {str(parameters):50}", end=": ", flush=True)
(max_perf, min_time) = op.predict_bounds(key)
if not op.is_pareto_optimal(max_perf, min_time):
print(f"SKIP, {max_perf=:.3f} {min_time=:.3f}", )
continue
for name, val in parameters.items():
set_index_parameter(index, name, val)
if cno == 0:
# warmup
for _ in range(5):
D, I = index.search(xq, 100)
t0 = time.time()
try:
D, I = index.search(xq, 100)
except RuntimeError as e:
print(f"ERROR {e}")
continue
t1 = time.time()
recalls = {}
for rank in 1, 10, 100:
recall = (gt[:, :1] == I[:ds.nq, :rank]).sum() / ds.nq
recalls[rank] = recall
print(f"time={t1 - t0:.3f} s recalls={recalls}")
perf = recalls[1]
op.add_operating_point(key, perf, t1 - t0)
op.all_experiments.append({
"cno": cno,
"key": key,
"parameters": parameters,
"time": t1 - t0,
"recalls": recalls
})
if args.stats:
pickle.dump(op, open(args.stats, "wb"))
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,86 @@
#!/usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import time
import os
import numpy as np
import faiss
from faiss.contrib.datasets import SyntheticDataset
os.system("grep -m1 'model name' < /proc/cpuinfo")
def format_tab(x):
return "\n".join("\t".join("%g" % xi for xi in row) for row in x)
faiss.cvar.distance_compute_min_k_reservoir = 5
# for have_threads in True, False:
for have_threads in False, :
if have_threads:
# good config for Intel(R) Xeon(R) CPU E5-2698 v4 @ 2.20GHz
nthread = 32
else:
nthread = 1
faiss.omp_set_num_threads(nthread)
print("************ nthread=", nthread)
for nq in 100, 10000:
print("*********** nq=", nq)
if nq == 100:
nrun = 500
unit = "ms"
else:
nrun = 20
unit = "s"
restab = []
for d in 16, 32, 64, 128:
print("========== d=", d)
nb = 10000
# d = 32
ds = SyntheticDataset(d, 0, nb, nq)
print(ds)
index = faiss.IndexFlatL2(d)
index.add(ds.get_database())
restab1 = []
restab.append(restab1)
for k in 1, 10, 100:
times = []
for run in range(nrun):
t0 = time.time()
index.search(ds.get_queries(), k)
t1 = time.time()
if run >= nrun // 5: # the rest is considered warmup
times.append((t1 - t0))
times = np.array(times)
if unit == "ms":
times *= 1000
print("search k=%3d t=%.3f %s (± %.4f)" % (
k, np.mean(times), unit, np.std(times)))
restab1.append(np.mean(times))
print("restab=\n", format_tab(restab))

View File

@@ -0,0 +1,22 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from __future__ import print_function
import faiss
from datasets import load_sift1M, evaluate
xb, xq, xt, gt = load_sift1M()
nq, d = xq.shape
k = 32
for nbits in 4, 6, 8, 10, 12:
index = faiss.IndexPQ(d, 8, nbits)
index.train(xt)
index.add(xb)
t, r = evaluate(index, xq, gt, k)
print("\t %7.3f ms per query, R@1 %.4f" % (t, r[1]))
del index

View File

@@ -0,0 +1,112 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import faiss
import time
import os
import multiprocessing as mp
import numpy as np
import matplotlib.pyplot as plt
try:
from faiss.contrib.datasets_fb import \
DatasetSIFT1M, DatasetDeep1B, DatasetBigANN
except ImportError:
from faiss.contrib.datasets import \
DatasetSIFT1M, DatasetDeep1B, DatasetBigANN
# ds = DatasetDeep1B(10**6)
# ds = DatasetBigANN(nb_M=1)
ds = DatasetSIFT1M()
xq = ds.get_queries()
xb = ds.get_database()
gt = ds.get_groundtruth()
xt = ds.get_train()
nb, d = xb.shape
nq, d = xq.shape
nt, d = xt.shape
k = 1
AQ = faiss.AdditiveQuantizer
def eval_recall(index, name):
t0 = time.time()
D, I = index.search(xq, k=k)
t = time.time() - t0
speed = t * 1000 / nq
qps = 1000 / speed
corrects = (gt[:, :1] == I).sum()  # compare against the true nearest neighbor only
recall = corrects / nq
print(
f'\tnprobe {index.nprobe:3d}, Recall@{k}: '
f'{recall:.6f}, speed: {speed:.6f} ms/query'
)
return recall, qps
def eval_and_plot(name, rescale_norm=True, plot=True):
index = faiss.index_factory(d, name)
index_path = f"indices/{name}.faissindex"
if os.path.exists(index_path):
index = faiss.read_index(index_path)
else:
faiss.omp_set_num_threads(mp.cpu_count())
index.train(xt)
index.add(xb)
faiss.write_index(index, index_path)
# search params
if hasattr(index, 'rescale_norm'):
index.rescale_norm = rescale_norm
name += f"(rescale_norm={rescale_norm})"
faiss.omp_set_num_threads(1)
data = []
print(f"======{name}")
for nprobe in 1, 2, 4, 6, 8, 12, 16, 24, 32, 48, 64, 128:
index.nprobe = nprobe
recall, qps = eval_recall(index, name)
data.append((recall, qps))
if plot:
data = np.array(data)
plt.plot(data[:, 0], data[:, 1], label=name) # x - recall, y - qps
M, nlist = 32, 1024
# just for warmup...
# eval_and_plot(f"IVF{nlist},PQ{M}x4fs", plot=False)
# benchmark
plt.figure(figsize=(8, 6), dpi=80)
# PQ
eval_and_plot(f"IVF{nlist},PQ{M}x4fs")
eval_and_plot(f"IVF{nlist},PQ{M}x4fsr")
# AQ, by_residual
eval_and_plot(f"IVF{nlist},LSQ{M-2}x4fsr_Nlsq2x4")
eval_and_plot(f"IVF{nlist},RQ{M-2}x4fsr_Nrq2x4")
eval_and_plot(f"IVF{nlist},LSQ{M-2}x4fsr_Nlsq2x4", rescale_norm=False)
eval_and_plot(f"IVF{nlist},RQ{M-2}x4fsr_Nrq2x4", rescale_norm=False)
# AQ, no by_residual
eval_and_plot(f"IVF{nlist},LSQ{M-2}x4fs_Nlsq2x4")
eval_and_plot(f"IVF{nlist},RQ{M-2}x4fs_Nrq2x4")
plt.title("Indices on SIFT1M")
plt.xlabel("Recall@1")
plt.ylabel("QPS")
plt.legend(bbox_to_anchor=(1.02, 0.1), loc='upper left', borderaxespad=0)
plt.savefig("bench_ivf_fastscan.png", bbox_inches='tight')

View File

@@ -0,0 +1,122 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import faiss
import time
import os
import multiprocessing as mp
import numpy as np
import matplotlib.pyplot as plt
try:
from faiss.contrib.datasets_fb import \
DatasetSIFT1M, DatasetDeep1B, DatasetBigANN
except ImportError:
from faiss.contrib.datasets import \
DatasetSIFT1M, DatasetDeep1B, DatasetBigANN
# ds = DatasetDeep1B(10**6)
ds = DatasetBigANN(nb_M=50)
# ds = DatasetSIFT1M()
xq = ds.get_queries()
xb = ds.get_database()
gt = ds.get_groundtruth()
xt = ds.get_train()
nb, d = xb.shape
nq, d = xq.shape
nt, d = xt.shape
print('database size {}, dimension {}'.format(nb, d))
k = 64
def eval_recall(index, name, single_query=False):
t0 = time.time()
D, I = index.search(xq, k=k)
t = time.time() - t0
if single_query:
t0 = time.time()
for row in range(nq):
Ds, Is = index.search(xq[row:row + 1], k=k)
D[row, :] = Ds
I[row, :] = Is
t = time.time() - t0
speed = t * 1000 / nq
qps = 1000 / speed
corrects = (gt[:, :1] == I[:, :k]).sum()
recall = corrects / nq
print(
f'\tnprobe {index.nprobe:3d}, 1Recall@{k}: '
f'{recall:.6f}, speed: {speed:.6f} ms/query'
)
return recall, qps
def eval_and_plot(
name, rescale_norm=True, plot=True, single_query=False,
implem=None, num_threads=1):
index = faiss.index_factory(d, name)
index_path = f"indices/{name}.faissindex"
if os.path.exists(index_path):
index = faiss.read_index(index_path)
else:
faiss.omp_set_num_threads(mp.cpu_count())
index.train(xt)
index.add(xb)
faiss.write_index(index, index_path)
# search params
if hasattr(index, 'rescale_norm'):
index.rescale_norm = rescale_norm
name += f"(rescale_norm={rescale_norm})"
if implem is not None and hasattr(index, 'implem'):
index.implem = implem
name += f"(implem={implem})"
if single_query:
name += f"(single_query={single_query})"
if num_threads > 1:
name += f"(num_threads={num_threads})"
faiss.omp_set_num_threads(num_threads)
data = []
print(f"======{name}")
for nprobe in 1, 4, 8, 16, 32, 64, 128, 256:
index.nprobe = nprobe
recall, qps = eval_recall(index, name, single_query=single_query)
data.append((recall, qps))
if plot:
data = np.array(data)
plt.plot(data[:, 0], data[:, 1], label=name) # x - recall, y - qps
M, nlist = 64, 4096
# just for warmup...
# eval_and_plot(f"IVF{nlist},PQ{M}x4fs", plot=False)
# benchmark
plt.figure(figsize=(8, 6), dpi=80)
eval_and_plot(f"IVF{nlist},PQ{M}x4fs", num_threads=8)
eval_and_plot(f"IVF{nlist},PQ{M}x4fs", single_query=True, implem=0, num_threads=8)
eval_and_plot(f"IVF{nlist},PQ{M}x4fs", single_query=True, implem=14, num_threads=8)
eval_and_plot(f"IVF{nlist},PQ{M}x4fs", single_query=True, implem=15, num_threads=8)
plt.title("Indices on Bigann50M")
plt.xlabel("1Recall@{}".format(k))
plt.ylabel("QPS")
plt.legend(bbox_to_anchor=(1.02, 0.1), loc='upper left', borderaxespad=0)
plt.savefig("bench_ivf_fastscan.png", bbox_inches='tight')

View File

@@ -0,0 +1,145 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#include <omp.h>
#include <unistd.h>
#include <memory>
#include <faiss/IVFlib.h>
#include <faiss/IndexIVF.h>
#include <faiss/impl/IDSelector.h>
#include <faiss/index_factory.h>
#include <faiss/index_io.h>
#include <faiss/utils/random.h>
#include <faiss/utils/utils.h>
/************************
* This benchmark attempts to measure the runtime overhead to use an IDSelector
* over doing an unconditional sequential scan. Unfortunately the results of the
* benchmark also depend a lot on the parallel_mode and the way
* search_with_parameters works.
*/
int main() {
using idx_t = faiss::idx_t;
int d = 64;
size_t nb = 1024 * 1024;
size_t nq = 512 * 16;
size_t k = 10;
std::vector<float> data((nb + nq) * d);
float* xb = data.data();
float* xq = data.data() + nb * d;
faiss::rand_smooth_vectors(nb + nq, d, data.data(), 1234);
std::unique_ptr<faiss::Index> index;
// const char *index_key = "IVF1024,Flat";
const char* index_key = "IVF1024,SQ8";
printf("index_key=%s\n", index_key);
std::string stored_name =
std::string("/tmp/bench_ivf_selector_") + index_key + ".faissindex";
if (access(stored_name.c_str(), F_OK) != 0) {
printf("creating index\n");
index.reset(faiss::index_factory(d, index_key));
double t0 = faiss::getmillisecs();
index->train(nb, xb);
double t1 = faiss::getmillisecs();
index->add(nb, xb);
double t2 = faiss::getmillisecs();
printf("train: %.3f ms, add: %.3f ms\n", t1 - t0, t2 - t1);
printf("Write %s\n", stored_name.c_str());
faiss::write_index(index.get(), stored_name.c_str());
} else {
printf("Read %s\n", stored_name.c_str());
index.reset(faiss::read_index(stored_name.c_str()));
}
faiss::IndexIVF* index_ivf = static_cast<faiss::IndexIVF*>(index.get());
index->verbose = true;
for (int tt = 0; tt < 3; tt++) {
if (tt == 1) {
index_ivf->parallel_mode = 3;
} else {
index_ivf->parallel_mode = 0;
}
if (tt == 2) {
printf("set single thread\n");
omp_set_num_threads(1);
}
printf("parallel_mode=%d\n", index_ivf->parallel_mode);
std::vector<float> D1(nq * k);
std::vector<idx_t> I1(nq * k);
{
double t2 = faiss::getmillisecs();
index->search(nq, xq, k, D1.data(), I1.data());
double t3 = faiss::getmillisecs();
printf("search time, no selector: %.3f ms\n", t3 - t2);
}
std::vector<float> D2(nq * k);
std::vector<idx_t> I2(nq * k);
{
double t2 = faiss::getmillisecs();
faiss::IVFSearchParameters params;
faiss::ivflib::search_with_parameters(
index.get(), nq, xq, k, D2.data(), I2.data(), &params);
double t3 = faiss::getmillisecs();
printf("search time with nullptr selector: %.3f ms\n", t3 - t2);
}
FAISS_THROW_IF_NOT(I1 == I2);
FAISS_THROW_IF_NOT(D1 == D2);
{
double t2 = faiss::getmillisecs();
faiss::IVFSearchParameters params;
faiss::IDSelectorAll sel;
params.sel = &sel;
faiss::ivflib::search_with_parameters(
index.get(), nq, xq, k, D2.data(), I2.data(), &params);
double t3 = faiss::getmillisecs();
printf("search time with selector: %.3f ms\n", t3 - t2);
}
FAISS_THROW_IF_NOT(I1 == I2);
FAISS_THROW_IF_NOT(D1 == D2);
std::vector<float> D3(nq * k);
std::vector<idx_t> I3(nq * k);
{
int nt = omp_get_max_threads();
double t2 = faiss::getmillisecs();
faiss::IVFSearchParameters params;
#pragma omp parallel for if (nt > 1)
for (idx_t slice = 0; slice < nt; slice++) {
idx_t i0 = nq * slice / nt;
idx_t i1 = nq * (slice + 1) / nt;
if (i1 > i0) {
faiss::ivflib::search_with_parameters(
index.get(),
i1 - i0,
xq + i0 * d,
k,
D3.data() + i0 * k,
I3.data() + i0 * k,
&params);
}
}
double t3 = faiss::getmillisecs();
printf("search time with null selector + manual parallel: %.3f ms\n",
t3 - t2);
}
FAISS_THROW_IF_NOT(I1 == I3);
FAISS_THROW_IF_NOT(D1 == D3);
}
return 0;
}

View File

@@ -0,0 +1,167 @@
# @lint-ignore-every LICENSELINT
# Copyright (c) Meta Platforms, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
# Copyright (c) 2024-2025, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import numpy as np
import faiss
import time
import argparse
import rmm
try:
from faiss.contrib.datasets_fb import \
DatasetSIFT1M, DatasetDeep1B, DatasetBigANN
except ImportError:
from faiss.contrib.datasets import \
DatasetSIFT1M, DatasetDeep1B, DatasetBigANN
# ds = DatasetDeep1B(10**6)
# ds = DatasetBigANN(nb_M=1)
ds = DatasetSIFT1M()
xq = ds.get_queries()
xb = ds.get_database()
gt = ds.get_groundtruth()
xt = ds.get_train()
nb, d = xb.shape
nq, d = xq.shape
nt, d = xt.shape
######################################################
# Command-line parsing
######################################################
parser = argparse.ArgumentParser()
def aa(*args, **kwargs):
group.add_argument(*args, **kwargs)
group = parser.add_argument_group('benchmarking options')
aa('--bm_train', default=True,
help='whether to benchmark train operation on GPU index')
aa('--bm_add', default=True,
help='whether to benchmark add operation on GPU index')
aa('--bm_search', default=True,
help='whether to benchmark search operation on GPU index')
group = parser.add_argument_group('IVF options')
aa('--nlist', default=1024, type=int,
help="number of IVF centroids")
group = parser.add_argument_group('searching')
aa('--k', default=10, type=int, help='nb of nearest neighbors')
aa('--nprobe', default=10, type=int, help='nb of IVF lists to probe')
args = parser.parse_args()
print("args:", args)
rs = np.random.RandomState(123)
res = faiss.StandardGpuResources()
# Use an RMM pool memory resource for device allocations
mr = rmm.mr.PoolMemoryResource(rmm.mr.CudaMemoryResource())
rmm.mr.set_current_device_resource(mr)
def bench_train_milliseconds(trainVecs, ncols, nlist, use_cuvs):
config = faiss.GpuIndexIVFFlatConfig()
config.use_cuvs = use_cuvs
index = faiss.GpuIndexIVFFlat(res, ncols, nlist, faiss.METRIC_L2, config)
t0 = time.time()
index.train(trainVecs)
return 1000*(time.time() - t0)
# warmup (faiss expects float32 input)
xw = rs.rand(nt, d).astype('float32')
bench_train_milliseconds(xw, d, args.nlist, True)
if args.bm_train:
print("=" * 40)
print("GPU Train Benchmarks")
print("=" * 40)
cuvs_gpu_train_time = bench_train_milliseconds(xt, d, args.nlist, True)
classical_gpu_train_time = bench_train_milliseconds(xt, d, args.nlist, False)
print("Method: IVFFlat, Operation: TRAIN, dim: %d, nlist %d, numTrain: %d, classical GPU train time: %.3f milliseconds, cuVS enabled GPU train time: %.3f milliseconds" % (
d, args.nlist, nt, classical_gpu_train_time, cuvs_gpu_train_time))
def bench_add_milliseconds(addVecs, q, use_cuvs):
# construct a GPU index using the same trained coarse quantizer
config = faiss.GpuIndexIVFFlatConfig()
config.use_cuvs = use_cuvs
index_gpu = faiss.GpuIndexIVFFlat(res, q, d, args.nlist, faiss.METRIC_L2, config)
assert(index_gpu.is_trained)
t0 = time.time()
index_gpu.add(addVecs)
return 1000*(time.time() - t0)
if args.bm_add:
print("=" * 40)
print("GPU Add Benchmarks")
print("=" * 40)
quantizer = faiss.IndexFlatL2(d)
idx_cpu = faiss.IndexIVFFlat(quantizer, d, args.nlist)
idx_cpu.train(xt)
cuvs_gpu_add_time = bench_add_milliseconds(xb, quantizer, True)
classical_gpu_add_time = bench_add_milliseconds(xb, quantizer, False)
print("Method: IVFFlat, Operation: ADD, dim: %d, nlist %d, numAdd: %d, classical GPU add time: %.3f milliseconds, cuVS enabled GPU add time: %.3f milliseconds" % (
d, args.nlist, nb, classical_gpu_add_time, cuvs_gpu_add_time))
def bench_search_milliseconds(index, queryVecs, nprobe, k, use_cuvs):
co = faiss.GpuClonerOptions()
co.use_cuvs = use_cuvs
index_gpu = faiss.index_cpu_to_gpu(res, 0, index, co)
index_gpu.nprobe = nprobe
t0 = time.time()
index_gpu.search(queryVecs, k)
return 1000*(time.time() - t0)
if args.bm_search:
print("=" * 40)
print("GPU Search Benchmarks")
print("=" * 40)
idx_cpu = faiss.IndexIVFFlat(
faiss.IndexFlatL2(d), d, args.nlist)
idx_cpu.train(xt)
idx_cpu.add(xb)
cuvs_gpu_search_time = bench_search_milliseconds(
idx_cpu, xq, args.nprobe, args.k, True)
classical_gpu_search_time = bench_search_milliseconds(
idx_cpu, xq, args.nprobe, args.k, False)
print("Method: IVFFlat, Operation: SEARCH, dim: %d, nlist: %d, numVecs: %d, numQuery: %d, nprobe: %d, k: %d, classical GPU search time: %.3f milliseconds, cuVS enabled GPU search time: %.3f milliseconds" % (
d, args.nlist, nb, nq, args.nprobe, args.k, classical_gpu_search_time, cuvs_gpu_search_time))

View File

@@ -0,0 +1,187 @@
# @lint-ignore-every LICENSELINT
# Copyright (c) Meta Platforms, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
# Copyright (c) 2024-2025, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import numpy as np
import faiss
import time
import argparse
import rmm
import ctypes
try:
from faiss.contrib.datasets_fb import \
DatasetSIFT1M, DatasetDeep1B, DatasetBigANN
except ImportError:
from faiss.contrib.datasets import \
DatasetSIFT1M, DatasetDeep1B, DatasetBigANN
# ds = DatasetDeep1B(10**6)
# ds = DatasetBigANN(nb_M=1)
ds = DatasetSIFT1M()
xq = ds.get_queries()
xb = ds.get_database()
gt = ds.get_groundtruth()
xt = ds.get_train()
nb, d = xb.shape
nq, d = xq.shape
nt, d = xt.shape
M = d // 2  # integer division; note the index constructions below hardcode 32 sub-quantizers
######################################################
# Command-line parsing
######################################################
parser = argparse.ArgumentParser()
def aa(*args, **kwargs):
group.add_argument(*args, **kwargs)
group = parser.add_argument_group('benchmarking options')
aa('--bm_train', default=True,
help='whether to benchmark train operation on GPU index')
aa('--bm_add', default=True,
help='whether to benchmark add operation on GPU index')
aa('--bm_search', default=True,
help='whether to benchmark search operation on GPU index')
group = parser.add_argument_group('IVF options')
aa('--nlist', default=1024, type=int,
help="number of IVF centroids")
aa('--bits_per_code', default=8, type=int, help='bits per code. Note that < 8 is only supported when cuVS is enabled')
group = parser.add_argument_group('searching')
aa('--k', default=10, type=int, help='nb of nearest neighbors')
aa('--nprobe', default=10, type=int, help='nb of IVF lists to probe')
args = parser.parse_args()
print("args:", args)
gt = gt[:, :args.k]
nlist = args.nlist
bits_per_code = args.bits_per_code
rs = np.random.RandomState(123)
res = faiss.StandardGpuResources()
# Use an RMM pool memory resource for device allocations
mr = rmm.mr.PoolMemoryResource(rmm.mr.CudaMemoryResource())
rmm.mr.set_current_device_resource(mr)
def eval_recall(neighbors, t):
speed = t * 1000 / nq
qps = 1000 / speed
corrects = (gt == neighbors).sum()
recall = corrects / (nq * args.k)
return recall, qps
def bench_train_milliseconds(trainVecs, use_cuvs):
config = faiss.GpuIndexIVFPQConfig()
config.use_cuvs = use_cuvs
index = faiss.GpuIndexIVFPQ(res, d, nlist, 32, bits_per_code, faiss.METRIC_L2, config)
t0 = time.time()
index.train(trainVecs)
return 1000*(time.time() - t0)
# warmup (faiss expects float32 input)
xw = rs.rand(nt, d).astype('float32')
bench_train_milliseconds(xw, True)
if args.bm_train:
print("=" * 40)
print("GPU Train Benchmarks")
print("=" * 40)
cuvs_gpu_train_time = bench_train_milliseconds(xt, True)
classical_gpu_train_time = bench_train_milliseconds(xt, False)
print("TRAIN, dim: %d, nlist %d, numTrain: %d, classical GPU train time: %.3f milliseconds, cuVS enabled GPU train time: %.3f milliseconds" % (
d, nlist, nt, classical_gpu_train_time, cuvs_gpu_train_time))
def bench_add_milliseconds(addVecs, index_cpu, use_cuvs):
# construct a GPU index using the same trained coarse quantizer
config = faiss.GpuClonerOptions()
config.use_cuvs = use_cuvs
index_gpu = faiss.index_cpu_to_gpu(res, 0, index_cpu, config)
assert(index_gpu.is_trained)
t0 = time.time()
index_gpu.add(addVecs)
return 1000*(time.time() - t0)
if args.bm_add:
print("=" * 40)
print("GPU Add Benchmarks")
print("=" * 40)
quantizer = faiss.IndexFlatL2(d)
index_cpu = faiss.IndexIVFPQ(quantizer, d, nlist, 32, bits_per_code, faiss.METRIC_L2)
index_cpu.train(xt)
cuvs_gpu_add_time = bench_add_milliseconds(xb, index_cpu, True)
classical_gpu_add_time = bench_add_milliseconds(xb, index_cpu, False)
print("ADD, dim: %d, nlist %d, numAdd: %d, classical GPU add time: %.3f milliseconds, cuVS enabled GPU add time: %.3f milliseconds" % (
d, nlist, nb, classical_gpu_add_time, cuvs_gpu_add_time))
def bench_search_milliseconds(index, queryVecs, nprobe, k, use_cuvs):
co = faiss.GpuClonerOptions()
co.use_cuvs = use_cuvs
index_gpu = faiss.index_cpu_to_gpu(res, 0, index, co)
index_gpu.nprobe = nprobe
t0 = time.time()
_, I = index_gpu.search(queryVecs, k)
return I, 1000*(time.time() - t0)
# Search benchmarks: both indexes have identical IVF centroids and lists.
if args.bm_search:
print("=" * 40)
print("GPU Search Benchmarks")
print("=" * 40)
# use a fresh quantizer so this branch does not depend on the bm_add branch
index_cpu = faiss.IndexIVFPQ(faiss.IndexFlatL2(d), d, nlist, 32, bits_per_code, faiss.METRIC_L2)
index_cpu.train(xt)
index_cpu.add(xb)
cuvs_indices, cuvs_gpu_search_time = bench_search_milliseconds(
index_cpu, xq, args.nprobe, args.k, True)
classical_gpu_indices, classical_gpu_search_time = bench_search_milliseconds(
index_cpu, xq, args.nprobe, args.k, False)
cuvs_recall, cuvs_qps = eval_recall(cuvs_indices, cuvs_gpu_search_time)
classical_recall, classical_qps = eval_recall(classical_gpu_indices, classical_gpu_search_time)
print("SEARCH, dim: %d, nlist: %d, numVecs: %d, numQuery: %d, nprobe: %d, k: %d, classical GPU qps: %.3f, cuVS enabled GPU qps: %.3f" % (
d, nlist, nb, nq, args.nprobe, args.k, classical_qps, cuvs_qps))

View File

@@ -0,0 +1,35 @@
#! /usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
"""small test script to benchmark the SIMD implementation of the
distance computations for the additional metrics. Call eg. with L1 to
get L1 distance computations.
"""
import faiss
import sys
import time
d = 64
nq = 4096
nb = 16384
print("sample")
xq = faiss.randn((nq, d), 123)
xb = faiss.randn((nb, d), 123)
mt_name = "L2" if len(sys.argv) < 2 else sys.argv[1]
mt = getattr(faiss, "METRIC_" + mt_name)
print("distances")
t0 = time.time()
dis = faiss.pairwise_distances(xq, xb, mt)
t1 = time.time()
print("nq=%d nb=%d d=%d %s: %.3f s" % (nq, nb, d, mt_name, t1 - t0))

View File

@@ -0,0 +1,78 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import time
import faiss
import numpy as np
def do_partition(n, qin, maxval=65536, seed=123, id_type='int64'):
print(
f"n={n} qin={qin} maxval={maxval} id_type={id_type} ",
end="\t", flush=True
)
# print("seed=", seed)
rs = np.random.RandomState(seed)
vals = rs.randint(maxval, size=n).astype('uint16')
ids = (rs.permutation(n) + 12345).astype(id_type)
sp = faiss.swig_ptr
tab_a = faiss.AlignedTableUint16()
faiss.copy_array_to_AlignedTable(vals, tab_a)
nrun = 2000
times = []
nerr = 0
stats = faiss.cvar.partition_stats
stats.reset()
for _run in range(nrun):
faiss.copy_array_to_AlignedTable(vals, tab_a)
t0 = time.time()
# print("tab a type", tab_a.get())
if isinstance(qin, int):
q = qin
faiss.CMax_uint16_partition_fuzzy(
tab_a.get(), sp(ids), n, q, q, None)
else:
q_min, q_max = qin
q = np.array([-1], dtype='uint64')
faiss.CMax_uint16_partition_fuzzy(
tab_a.get(), sp(ids), n,
q_min, q_max, sp(q)
)
q = q[0]
if not (q_min <= q <= q_max):
nerr += 1
t1 = time.time()
times.append(t1 - t0)
times = np.array(times[100:]) * 1000000
print(
f"times {times.mean():.3f} µs (± {times.std():.4f} µs) nerr={nerr} "
f"bissect {stats.bissect_cycles / 1e6:.3f} Mcy "
f"compress {stats.compress_cycles / 1e6:.3f} Mcy"
)
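# Hedged reading of the calls below: qin is a (q_min, q_max) range, so the
# fuzzy partition is free to pick any pivot q in that range and reports the
# chosen value back through the last (uint64) output argument; passing a plain
# int pins q_min == q_max == q.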
do_partition(200, (100, 100))
do_partition(200, (100, 150))
do_partition(2000, (1000, 1000))
do_partition(2000, (1000, 1500))
do_partition(20000, (10000, 10000))
do_partition(20000, (10000, 15000))
do_partition(200, (100, 100), id_type='int32')
do_partition(200, (100, 150), id_type='int32')
do_partition(2000, (1000, 1000), id_type='int32')
do_partition(2000, (1000, 1500), id_type='int32')
do_partition(20000, (10000, 10000), id_type='int32')
do_partition(20000, (10000, 15000), id_type='int32')

View File

@@ -0,0 +1,251 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import os
import sys
import time
import numpy as np
import re
import faiss
from multiprocessing.pool import ThreadPool
from datasets import ivecs_read
# we mem-map the biggest files to avoid having them in memory all at
# once
def mmap_fvecs(fname):
x = np.memmap(fname, dtype='int32', mode='r')
d = x[0]
return x.view('float32').reshape(-1, d + 1)[:, 1:]
def mmap_bvecs(fname):
x = np.memmap(fname, dtype='uint8', mode='r')
d = x[:4].view('int32')[0]
return x.reshape(-1, d + 4)[:, 4:]
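# Hedged helper sketch (not used by the benchmark): writes an array in the
# .fvecs layout read by mmap_fvecs above, i.e. each row is stored as
# [int32 d][d * float32].
def _write_fvecs_demo(fname, x):
    x = np.ascontiguousarray(x, dtype='float32')
    n, d = x.shape
    out = np.empty((n, d + 1), dtype='int32')
    out[:, 0] = d
    out[:, 1:] = x.view('int32')  # reinterpret the float bits, no conversion
    out.tofile(fname)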
#################################################################
# Bookkeeping
#################################################################
dbname = sys.argv[1]
index_key = sys.argv[2]
parametersets = sys.argv[3:]
tmpdir = '/tmp/bench_polysemous'
if not os.path.isdir(tmpdir):
print("%s does not exist, creating it" % tmpdir)
os.mkdir(tmpdir)
#################################################################
# Prepare dataset
#################################################################
print("Preparing dataset", dbname)
if dbname.startswith('SIFT'):
# SIFT1M to SIFT1000M
dbsize = int(dbname[4:-1])
xb = mmap_bvecs('bigann/bigann_base.bvecs')
xq = mmap_bvecs('bigann/bigann_query.bvecs')
xt = mmap_bvecs('bigann/bigann_learn.bvecs')
# trim xb to correct size
xb = xb[:dbsize * 1000 * 1000]
gt = ivecs_read('bigann/gnd/idx_%dM.ivecs' % dbsize)
elif dbname == 'Deep1B':
xb = mmap_fvecs('deep1b/base.fvecs')
xq = mmap_fvecs('deep1b/deep1B_queries.fvecs')
xt = mmap_fvecs('deep1b/learn.fvecs')
# deep1B's train set is outrageously big
xt = xt[:10 * 1000 * 1000]
gt = ivecs_read('deep1b/deep1B_groundtruth.ivecs')
else:
print('unknown dataset', dbname, file=sys.stderr)
sys.exit(1)
print("sizes: B %s Q %s T %s gt %s" % (
xb.shape, xq.shape, xt.shape, gt.shape))
nq, d = xq.shape
nb, d = xb.shape
assert gt.shape[0] == nq
#################################################################
# Training
#################################################################
def choose_train_size(index_key):
# some training vectors for PQ and the PCA
n_train = 256 * 1000
if "IVF" in index_key:
matches = re.findall('IVF([0-9]+)', index_key)
ncentroids = int(matches[0])
n_train = max(n_train, 100 * ncentroids)
elif "IMI" in index_key:
matches = re.findall('IMI2x([0-9]+)', index_key)
nbit = int(matches[0])
n_train = max(n_train, 256 * (1 << nbit))
return n_train
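# Worked example (hedged): for index_key "IVF65536,PQ32" the rule above gives
# n_train = max(256_000, 100 * 65536) = 6_553_600 training vectors.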
def get_trained_index():
filename = "%s/%s_%s_trained.index" % (
tmpdir, dbname, index_key)
if not os.path.exists(filename):
index = faiss.index_factory(d, index_key)
n_train = choose_train_size(index_key)
xtsub = xt[:n_train]
print("Keeping %d train vectors" % xtsub.shape[0])
# make sure the data is actually in RAM and in float
xtsub = xtsub.astype('float32').copy()
index.verbose = True
t0 = time.time()
index.train(xtsub)
index.verbose = False
print("train done in %.3f s" % (time.time() - t0))
print("storing", filename)
faiss.write_index(index, filename)
else:
print("loading", filename)
index = faiss.read_index(filename)
return index
#################################################################
# Adding vectors to dataset
#################################################################
def rate_limited_imap(f, l):
'a thread pre-processes the next element'
pool = ThreadPool(1)
res = None
for i in l:
res_next = pool.apply_async(f, (i, ))
if res:
yield res.get()
res = res_next
yield res.get()
def matrix_slice_iterator(x, bs):
" iterate over the lines of x in blocks of size bs"
nb = x.shape[0]
block_ranges = [(i0, min(nb, i0 + bs))
for i0 in range(0, nb, bs)]
return rate_limited_imap(
lambda i01: x[i01[0]:i01[1]].astype('float32').copy(),
block_ranges)
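# Hedged usage sketch: iterate over a dummy array in blocks of 100 rows; the
# next block is converted to float32 on a background thread by
# rate_limited_imap while the current one is being consumed.
def _demo_slice_iterator():
    x = np.zeros((250, 8), dtype='uint8')
    nrows = 0
    for block in matrix_slice_iterator(x, 100):
        assert block.dtype == np.dtype('float32')
        nrows += block.shape[0]
    assert nrows == 250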
def get_populated_index():
filename = "%s/%s_%s_populated.index" % (
tmpdir, dbname, index_key)
if not os.path.exists(filename):
index = get_trained_index()
i0 = 0
t0 = time.time()
for xs in matrix_slice_iterator(xb, 100000):
i1 = i0 + xs.shape[0]
print('\radd %d:%d, %.3f s' % (i0, i1, time.time() - t0), end=' ')
sys.stdout.flush()
index.add(xs)
i0 = i1
print()
print("Add done in %.3f s" % (time.time() - t0))
print("storing", filename)
faiss.write_index(index, filename)
else:
print("loading", filename)
index = faiss.read_index(filename)
return index
#################################################################
# Perform searches
#################################################################
index = get_populated_index()
ps = faiss.ParameterSpace()
ps.initialize(index)
# make sure queries are in RAM
xq = xq.astype('float32').copy()
# a static C++ object that collects statistics about searches
ivfpq_stats = faiss.cvar.indexIVFPQ_stats
ivf_stats = faiss.cvar.indexIVF_stats
if parametersets == ['autotune'] or parametersets == ['autotuneMT']:
if parametersets == ['autotune']:
faiss.omp_set_num_threads(1)
# setup the Criterion object: optimize for 1-R@1
crit = faiss.OneRecallAtRCriterion(nq, 1)
# by default, the criterion will request only 1 NN
crit.nnn = 100
crit.set_groundtruth(None, gt.astype('int64'))
# then we let Faiss find the optimal parameters by itself
print("exploring operating points")
t0 = time.time()
op = ps.explore(index, xq, crit)
print("Done in %.3f s, available OPs:" % (time.time() - t0))
# opv is a C++ vector, so it cannot be accessed like a Python array
opv = op.optimal_pts
print("%-40s 1-R@1 time" % "Parameters")
for i in range(opv.size()):
opt = opv.at(i)
print("%-40s %.4f %7.3f" % (opt.key, opt.perf, opt.t))
else:
# we do queries in a single thread
faiss.omp_set_num_threads(1)
print(' ' * len(parametersets[0]), '\t', 'R@1 R@10 R@100 time %pass')
for param in parametersets:
print(param, '\t', end=' ')
sys.stdout.flush()
ps.set_index_parameters(index, param)
t0 = time.time()
ivfpq_stats.reset()
ivf_stats.reset()
D, I = index.search(xq, 100)
t1 = time.time()
for rank in 1, 10, 100:
n_ok = (I[:, :rank] == gt[:, :1]).sum()
print("%.4f" % (n_ok / float(nq)), end=' ')
print("%8.3f " % ((t1 - t0) * 1000.0 / nq), end=' ')
print("%5.2f" % (ivfpq_stats.n_hamming_pass * 100.0 / ivf_stats.ndis))

View File

@@ -0,0 +1,46 @@
#!/usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from __future__ import print_function
import time
import numpy as np
import faiss
from datasets import load_sift1M, evaluate
print("load data")
xb, xq, xt, gt = load_sift1M()
nq, d = xq.shape
# index with 16 subquantizers, 8 bit each
index = faiss.IndexPQ(d, 16, 8)
index.do_polysemous_training = True
index.verbose = True
print("train")
index.train(xt)
print("add vectors to index")
index.add(xb)
nt = 1
faiss.omp_set_num_threads(1)
print("PQ baseline", end=' ')
index.search_type = faiss.IndexPQ.ST_PQ
t, r = evaluate(index, xq, gt, 1)
print("\t %7.3f ms per query, R@1 %.4f" % (t, r[1]))
for ht in 64, 62, 58, 54, 50, 46, 42, 38, 34, 30:
print("Polysemous", ht, end=' ')
index.search_type = faiss.IndexPQ.ST_polysemous
index.polysemous_ht = ht
t, r = evaluate(index, xq, gt, 1)
print("\t %7.3f ms per query, R@1 %.4f" % (t, r[1]))

View File

@@ -0,0 +1,77 @@
#!/usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import time
import os
import numpy as np
import faiss
os.system("grep -m1 'model name' < /proc/cpuinfo")
def format_tab(x):
return "\n".join("\t".join("%g" % xi for xi in row) for row in x)
def run_bench(d, dsub, nbit=8, metric=None):
M = d // dsub
pq = faiss.ProductQuantizer(d, M, nbit)
pq.train(faiss.randn((max(1000, pq.ksub * 50), d), 123))
sp = faiss.swig_ptr
times = []
nrun = 100
print(f"d={d} dsub={dsub} ksub={pq.ksub}", end="\t")
res = []
for nx in 1, 10, 100:
x = faiss.randn((nx, d), 555)
times = []
for run in range(nrun):
t0 = time.time()
new_tab = np.zeros((nx, M, pq.ksub), "float32")
if metric == faiss.METRIC_INNER_PRODUCT:
pq.compute_inner_prod_tables(nx, sp(x), sp(new_tab))
elif metric == faiss.METRIC_L2:
pq.compute_distance_tables(nx, sp(x), sp(new_tab))
else:
assert False
t1 = time.time()
if run >= nrun // 5: # the rest is considered warmup
times.append((t1 - t0))
times = np.array(times) * 1000
print(f"nx={nx}: {np.mean(times):.3f} ms (± {np.std(times):.4f})",
end="\t")
res.append(times.mean())
print()
return res
# for have_threads in True, False:
for have_threads in False, True:
if have_threads:
# good config for Intel(R) Xeon(R) CPU E5-2698 v4 @ 2.20GHz
nthread = 32
else:
nthread = 1
faiss.omp_set_num_threads(nthread)
for metric in faiss.METRIC_INNER_PRODUCT, faiss.METRIC_L2:
print("============= nthread=", nthread, "metric=", metric)
allres = []
for dsub in 2, 4, 8:
for nbit in 4, 8:
for M in 8, 20:
res = run_bench(M * dsub, dsub, nbit, metric)
allres.append(res)
allres = np.array(allres)
print("formated result:")
print(format_tab(allres))

View File

@@ -0,0 +1,135 @@
#!/usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import faiss
import time
import random
import faiss.contrib.datasets
# copied from benchs/bench_all_ivf/bench_all_ivf.py
def unwind_index_ivf(index):
if isinstance(index, faiss.IndexPreTransform):
assert index.chain.size() == 1
vt = index.chain.at(0)
index_ivf, vt2 = unwind_index_ivf(faiss.downcast_index(index.index))
assert vt2 is None
return index_ivf, vt
if hasattr(faiss, "IndexRefine") and isinstance(index, faiss.IndexRefine):
return unwind_index_ivf(faiss.downcast_index(index.base_index))
if isinstance(index, faiss.IndexIVF):
return index, None
else:
return None, None
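# Hedged usage sketch: peel the OPQ pre-transform off a freshly built index so
# the inner IndexIVF can be manipulated directly, as is done below with the
# transposed PQ centroids.
def _demo_unwind_index_ivf():
    index = faiss.index_factory(64, "OPQ8_64,IVF128,PQ8")
    index_ivf, vt = unwind_index_ivf(index)
    assert isinstance(index_ivf, faiss.IndexIVF)
    assert vt is not None  # the OPQ matrix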
def test_bigann10m(index_file, index_parameters):
ds = faiss.contrib.datasets.DatasetBigANN(nb_M=10)
xq = ds.get_queries()
xb = ds.get_database()
gt = ds.get_groundtruth()
nb, d = xb.shape
nq, d = xq.shape
print("Reading index {}".format(index_file))
index = faiss.read_index(index_file)
ps = faiss.ParameterSpace()
ps.initialize(index)
index_ivf, vec_transform = unwind_index_ivf(index)
print('params regular transp_centroids regular R@1 R@10 R@100')
for index_parameter in index_parameters:
ps.set_index_parameters(index, index_parameter)
print(index_parameter.ljust(70), end=' ')
k = 100
# warmup
D, I = index.search(xq, k)
# warmup
D, I = index.search(xq, k)
# eval
t2_0 = time.time()
D, I = index.search(xq, k)
t2_1 = time.time()
# eval
index_ivf.pq.sync_transposed_centroids()
t3_0 = time.time()
D, I = index.search(xq, k)
t3_1 = time.time()
# eval
index_ivf.pq.clear_transposed_centroids()
t4_0 = time.time()
D, I = index.search(xq, k)
t4_1 = time.time()
print(" %9.5f " % (t2_1 - t2_0), end=' ')
print(" %9.5f " % (t3_1 - t3_0), end=' ')
print(" %9.5f " % (t4_1 - t4_0), end=' ')
for rank in 1, 10, 100:
n_ok = (I[:, :rank] == gt[:, :1]).sum()
print("%.4f" % (n_ok / float(nq)), end=' ')
print()
if __name__ == "__main__":
faiss.contrib.datasets.dataset_basedir = '/home/aguzhva/ANN_SIFT1B/'
# represents OPQ32_128,IVF65536_HNSW32,PQ32 index
index_file_1 = "/home/aguzhva/ANN_SIFT1B/run_tests/bench_ivf/indexes/hnsw32/.faissindex"
nprobe_values = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]
quantizer_efsearch_values = [4, 8, 16, 32, 64, 128, 256, 512]
ht_values = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126, 128, 256]
# represents OPQ32_128,IVF65536(IVF256,PQHDx4fs,RFlat),PQ32 index
index_file_2 = "/home/aguzhva/ANN_SIFT1B/run_tests/bench_ivf/indexes/pq4/.faissindex"
quantizer_k_factor_rf_values = [1, 2, 4, 8, 16, 32, 64]
quantizer_nprobe_values = [1, 2, 4, 8, 16, 32, 64, 128]
# test the first index
index_parameters_1 = []
for _ in range(0, 20):
nprobe = random.choice(nprobe_values)
quantizer_efsearch = random.choice(quantizer_efsearch_values)
ht = random.choice(ht_values)
index_parameters_1.append(
"nprobe={},quantizer_efSearch={},ht={}".format(
nprobe,
quantizer_efsearch,
ht)
)
test_bigann10m(index_file_1, index_parameters_1)
# test the second index
index_parameters_2 = []
for _ in range(0, 20):
nprobe = random.choice(nprobe_values)
quantizer_k_factor_rf = random.choice(quantizer_k_factor_rf_values)
quantizer_nprobe = random.choice(quantizer_nprobe_values)
ht = random.choice(ht_values)
index_parameters_2.append(
"nprobe={},quantizer_k_factor_rf={},quantizer_nprobe={},ht={}".format(
nprobe,
quantizer_k_factor_rf,
quantizer_nprobe,
ht)
)
test_bigann10m(index_file_2, index_parameters_2)

View File

@@ -0,0 +1,157 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import sys
import faiss
import time
import numpy as np
try:
from faiss.contrib.datasets_fb import \
DatasetSIFT1M, DatasetDeep1B, DatasetBigANN
except ImportError:
from faiss.contrib.datasets import \
DatasetSIFT1M, DatasetDeep1B, DatasetBigANN
def eval_codec(q, xq, xb, gt):
t0 = time.time()
codes = q.compute_codes(xb)
t1 = time.time()
xb_decoded = q.decode(codes)
recons_err = ((xb - xb_decoded) ** 2).sum() / xb.shape[0]
# for compatibility with the codec benchmarks
err_compat = np.linalg.norm(xb - xb_decoded, axis=1).mean()
xq_decoded = q.decode(q.compute_codes(xq))
D, I = faiss.knn(xq_decoded, xb_decoded, 1)
recall = (I[:, 0] == gt[:, 0]).sum() / nq
print(
f"\tencode time: {t1 - t0:.3f} reconstruction error: {recons_err:.3f} "
f"1-recall@1: {recall:.4f} recons_err_compat {err_compat:.3f}")
def eval_quantizer(q, xq, xb, gt, xt, variants=None):
if variants is None:
variants = [(None, None)]
t0 = time.time()
q.train(xt)
t1 = time.time()
train_t = t1 - t0
print(f'\ttraining time: {train_t:.3f} s')
for name, val in variants:
if name is not None:
print(f"{name}={val}")
if isinstance(q, faiss.ProductAdditiveQuantizer):
for i in range(q.nsplits):
subq = faiss.downcast_Quantizer(q.subquantizer(i))
getattr(subq, name)  # make sure field exists
setattr(subq, name, val)
else:
getattr(q, name) # make sure field exists
setattr(q, name, val)
eval_codec(q, xq, xb, gt)
todo = sys.argv[1:]
if len(todo) > 0 and "deep1M" in todo[0]:
ds = DatasetDeep1B(10**6)
del todo[0]
elif len(todo) > 0 and "bigann1M" in todo[0]:
ds = DatasetBigANN(nb_M=1)
del todo[0]
else:
ds = DatasetSIFT1M()
if len(todo) > 0:
if todo[0].count("x") == 1:
M, nbits = [int(x) for x in todo[0].split("x")]
del todo[0]
elif todo[0].count("x") == 2:
nsplits, Msub, nbits = [int(x) for x in todo[0].split("x")]
M = nsplits * Msub
del todo[0]
maxtrain = max(100 << nbits, 10**5)
print(f"eval on {M}x{nbits} maxtrain={maxtrain}")
xq = ds.get_queries()
xb = ds.get_database()
gt = ds.get_groundtruth()
xt = ds.get_train(maxtrain=maxtrain)
nb, d = xb.shape
nq, d = xq.shape
nt, d = xt.shape
# fastest to slowest
if 'lsq-gpu' in todo:
lsq = faiss.LocalSearchQuantizer(d, M, nbits)
ngpus = faiss.get_num_gpus()
lsq.icm_encoder_factory = faiss.GpuIcmEncoderFactory(ngpus)
lsq.verbose = True
eval_quantizer(lsq, xq, xb, gt, xt)
if 'pq' in todo:
pq = faiss.ProductQuantizer(d, M, nbits)
print("===== PQ")
eval_quantizer(pq, xq, xb, gt, xt)
if 'opq' in todo:
d2 = ((d + M - 1) // M) * M
print("OPQ d2=", d2)
opq = faiss.OPQMatrix(d, M, d2)
opq.train(xt)
xq2 = opq.apply(xq)
xb2 = opq.apply(xb)
xt2 = opq.apply(xt)
pq = faiss.ProductQuantizer(d2, M, nbits)
print("===== PQ")
eval_quantizer(pq, xq2, xb2, gt, xt2)
if 'prq' in todo:
print(f"===== PRQ{nsplits}x{Msub}x{nbits}")
prq = faiss.ProductResidualQuantizer(d, nsplits, Msub, nbits)
variants = [("max_beam_size", i) for i in (1, 2, 4, 8, 16, 32)]
eval_quantizer(prq, xq, xb, gt, xt, variants=variants)
if 'plsq' in todo:
print(f"===== PLSQ{nsplits}x{Msub}x{nbits}")
plsq = faiss.ProductLocalSearchQuantizer(d, nsplits, Msub, nbits)
variants = [("encode_ils_iters", i) for i in (2, 3, 4, 8, 16)]
eval_quantizer(plsq, xq, xb, gt, xt, variants=variants)
if 'rq' in todo:
print("===== RQ")
rq = faiss.ResidualQuantizer(d, M, nbits, )
rq.max_beam_size  # make sure field exists
rq.max_beam_size = 30  # for compatibility with older runs
# rq.train_type = faiss.ResidualQuantizer.Train_default
# rq.verbose = True
variants = [("max_beam_size", i) for i in (1, 2, 4, 8, 16, 32)]
eval_quantizer(rq, xq, xb, gt, xt, variants=variants)
if 'rq_lut' in todo:
print("===== RQ")
rq = faiss.ResidualQuantizer(d, M, nbits, )
rq.max_beam_size  # make sure field exists
rq.max_beam_size = 30  # for compatibility with older runs
rq.use_beam_LUT  # make sure field exists
rq.use_beam_LUT = 1
# rq.train_type = faiss.ResidualQuantizer.Train_default
# rq.verbose = True
variants = [("max_beam_size", i) for i in (1, 2, 4, 8, 16, 32, 64)]
eval_quantizer(rq, xq, xb, gt, xt, variants=variants)
if 'lsq' in todo:
print("===== LSQ")
lsq = faiss.LocalSearchQuantizer(d, M, nbits)
variants = [("encode_ils_iters", i) for i in (2, 3, 4, 8, 16)]
eval_quantizer(lsq, xq, xb, gt, xt, variants=variants)

View File

@@ -0,0 +1,82 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import time
import numpy as np
import faiss
from datasets import load_sift1M
print("load data")
xb, xq, xt, gt = load_sift1M()
nq, d = xq.shape
ncent = 256
variants = [(name, getattr(faiss.ScalarQuantizer, name))
for name in dir(faiss.ScalarQuantizer)
if name.startswith('QT_')]
quantizer = faiss.IndexFlatL2(d)
# quantizer.add(np.zeros((1, d), dtype='float32'))
if False:
for name, qtype in [('flat', 0)] + variants:
print("============== test", name)
t0 = time.time()
if name == 'flat':
index = faiss.IndexIVFFlat(quantizer, d, ncent,
faiss.METRIC_L2)
else:
index = faiss.IndexIVFScalarQuantizer(quantizer, d, ncent,
qtype, faiss.METRIC_L2)
index.nprobe = 16
print("[%.3f s] train" % (time.time() - t0))
index.train(xt)
print("[%.3f s] add" % (time.time() - t0))
index.add(xb)
print("[%.3f s] search" % (time.time() - t0))
D, I = index.search(xq, 100)
print("[%.3f s] eval" % (time.time() - t0))
for rank in 1, 10, 100:
n_ok = (I[:, :rank] == gt[:, :1]).sum()
print("%.4f" % (n_ok / float(nq)), end=' ')
print()
if True:
for name, qtype in variants:
print("============== test", name)
for rsname, vals in [('RS_minmax',
[-0.4, -0.2, -0.1, -0.05, 0.0, 0.1, 0.5]),
('RS_meanstd', [0.8, 1.0, 1.5, 2.0, 3.0, 5.0, 10.0]),
('RS_quantiles', [0.02, 0.05, 0.1, 0.15]),
('RS_optim', [0.0])]:
for val in vals:
print("%-15s %5g " % (rsname, val), end=' ')
index = faiss.IndexIVFScalarQuantizer(quantizer, d, ncent,
qtype, faiss.METRIC_L2)
index.nprobe = 16
index.sq.rangestat = getattr(faiss.ScalarQuantizer,
rsname)
index.sq.rangestat_arg = val
index.train(xt)
index.add(xb)
t0 = time.time()
D, I = index.search(xq, 100)
t1 = time.time()
for rank in 1, 10, 100:
n_ok = (I[:, :rank] == gt[:, :1]).sum()
print("%.4f" % (n_ok / float(nq)), end=' ')
print(" %.3f s" % (t1 - t0))

View File

@@ -0,0 +1,84 @@
#! /usr/bin/env python2
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from __future__ import print_function
import numpy as np
import faiss
import time
swig_ptr = faiss.swig_ptr
if False:
a = np.arange(10, 14).astype('float32')
b = np.arange(20, 24).astype('float32')
faiss.fvec_inner_product(swig_ptr(a), swig_ptr(b), 4)
1/0
xd = 100
yd = 1000000
np.random.seed(1234)
faiss.omp_set_num_threads(1)
print('xd=%d yd=%d' % (xd, yd))
print('Running inner products test..')
for d in 3, 4, 12, 36, 64:
x = faiss.rand(xd * d).reshape(xd, d)
y = faiss.rand(yd * d).reshape(yd, d)
distances = np.empty((xd, yd), dtype='float32')
t0 = time.time()
for i in range(xd):
faiss.fvec_inner_products_ny(swig_ptr(distances[i]),
swig_ptr(x[i]),
swig_ptr(y),
d, yd)
t1 = time.time()
# sparse verification
ntry = 100
num, denom = 0, 0
for t in range(ntry):
xi = np.random.randint(xd)
yi = np.random.randint(yd)
num += abs(distances[xi, yi] - np.dot(x[xi], y[yi]))
denom += abs(distances[xi, yi])
print('d=%d t=%.3f s diff=%g' % (d, t1 - t0, num / denom))
print('Running L2sqr test..')
for d in 3, 4, 12, 36, 64:
x = faiss.rand(xd * d).reshape(xd, d)
y = faiss.rand(yd * d).reshape(yd, d)
distances = np.empty((xd, yd), dtype='float32')
t0 = time.time()
for i in range(xd):
faiss.fvec_L2sqr_ny(swig_ptr(distances[i]),
swig_ptr(x[i]),
swig_ptr(y),
d, yd)
t1 = time.time()
# sparse verification
ntry = 100
num, denom = 0, 0
for t in range(ntry):
xi = np.random.randint(xd)
yi = np.random.randint(yd)
num += abs(distances[xi, yi] - np.sum((x[xi] - y[yi]) ** 2))
denom += abs(distances[xi, yi])
print('d=%d t=%.3f s diff=%g' % (d, t1 - t0, num / denom))

View File

@@ -0,0 +1,45 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from __future__ import print_function
import sys
import time
import numpy as np
def ivecs_read(fname):
a = np.fromfile(fname, dtype='int32')
d = a[0]
return a.reshape(-1, d + 1)[:, 1:].copy()
def fvecs_read(fname):
return ivecs_read(fname).view('float32')
def load_sift1M():
print("Loading sift1M...", end='', file=sys.stderr)
xt = fvecs_read("sift1M/sift_learn.fvecs")
xb = fvecs_read("sift1M/sift_base.fvecs")
xq = fvecs_read("sift1M/sift_query.fvecs")
gt = ivecs_read("sift1M/sift_groundtruth.ivecs")
print("done", file=sys.stderr)
return xb, xq, xt, gt
def evaluate(index, xq, gt, k):
nq = xq.shape[0]
t0 = time.time()
D, I = index.search(xq, k) # noqa: E741
t1 = time.time()
recalls = {}
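# recall@i: fraction of queries whose ground-truth neighbor appears in the top i results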
i = 1
while i <= k:
recalls[i] = (I[:, :i] == gt[:, :1]).sum() / float(nq)
i *= 10
return (t1 - t0) * 1000.0 / nq, recalls

View File

@@ -0,0 +1,194 @@
# Distributed on-disk index for 1T-scale datasets
This is code corresponding to the description in [Indexing 1T vectors](https://github.com/facebookresearch/faiss/wiki/Indexing-1T-vectors).
All the code is in Python 3 (and not compatible with Python 2).
The current code uses the Deep1B dataset for demonstration purposes, but can scale to 1000x larger.
To run it, download the Deep1B dataset as explained [here](../#getting-deep1b), and edit paths to the dataset in the scripts.
The cluster commands are written for the Slurm batch scheduling system.
Adapting them to another scheduler should be straightforward.
## Distributed k-means
To cluster 500M vectors to 10M centroids, it is useful to have a distributed k-means implementation.
The distribution simply consists of splitting the training vectors across machines (servers) and having them do the assignment.
The master/client then synthesizes the results and updates the centroids.
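To make the division of labor concrete, here is a minimal Python sketch of the client-side loop (not the actual `faiss.contrib.clustering.kmeans`; it assumes the `DatasetAssign` interface used below and omits the empty-cluster splitting that the real implementation performs):

```python
import numpy as np

def kmeans_sketch(k, data, niter=20):
    # data implements the DatasetAssign interface:
    # count(), dim(), get_subset(indices), assign_to(centroids)
    rng = np.random.default_rng(1234)
    init = np.sort(rng.choice(data.count(), k, replace=False))
    centroids = data.get_subset(init)  # random initialization
    for it in range(niter):
        # the assignment is the expensive part; it runs on the servers
        I, D, sum_per_centroid = data.assign_to(centroids)
        sizes = np.bincount(I, minlength=k)
        ok = sizes > 0
        # the client aggregates the per-centroid sums and updates the centroids
        centroids[ok] = sum_per_centroid[ok] / sizes[ok][:, None]
        print(f"iter {it}: objective={D.sum():.4g}")
    return centroids
```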
The distributed k-means implementation here is based on 3 files:
- [`distributed_kmeans.py`](distributed_kmeans.py) contains the k-means implementation.
The main loop of k-means is re-implemented in python but follows closely the Faiss C++ implementation, and should not be significantly less efficient.
It relies on a `DatasetAssign` object that does the assignment to centroids, which is the bulk of the computation.
The object can be a Faiss CPU index, a GPU index or a set of remote GPU or CPU indexes.
- [`run_on_cluster.bash`](run_on_cluster.bash) contains the shell code to run the distributed k-means on a cluster.
The distributed k-means works with a Python install that contains faiss and scipy (for sparse matrices).
It clusters the training data of Deep1B; this can easily be changed to any file in fvecs, bvecs or npy format that contains the training set.
The training vectors may be too large to fit in RAM, but they are memory-mapped so that should not be a problem.
The file is also assumed to be accessible from all server machines, e.g. via a distributed file system.
### Local tests
Edit `distributed_kmeans.py` to point `testdata` to your local copy of the dataset.
Then, 4 levels of sanity check can be run:
```bash
# reference Faiss C++ run
python distributed_kmeans.py --test 0
# using the Python implementation
python distributed_kmeans.py --test 1
# use the dispatch object (on local datasets)
python distributed_kmeans.py --test 2
# same, with GPUs
python distributed_kmeans.py --test 3
```
The output should look like [this gist](https://gist.github.com/mdouze/ffa01fe666a9325761266fe55ead72ad).
### Distributed sanity check
To run the distributed k-means, `distributed_kmeans.py` has to be run both on the servers (`--server` option) and client sides (`--client` option).
Edit the top of `run_on_cluster.bash` to set the path of the data to cluster.
Sanity checks can be run with
```bash
# non distributed baseline
bash run_on_cluster.bash test_kmeans_0
# using all the machine's GPUs
bash run_on_cluster.bash test_kmeans_1
# distributed run, with one local server per GPU
bash run_on_cluster.bash test_kmeans_2
```
The test `test_kmeans_2` simulates a distributed run on a single machine by starting one server process per GPU and connecting to the servers via the rpc protocol.
The output should look like [this gist](https://gist.github.com/mdouze/5b2dc69b74579ecff04e1686a277d32e).
### Distributed run
The way the script can be distributed depends on the cluster's scheduling system.
Here we use Slurm, but it should be relatively easy to adapt to any scheduler that can allocate a set of machines and start the same executable on all of them.
The command
```bash
bash run_on_cluster.bash slurm_distributed_kmeans
```
uses the `srun` command to ask Slurm for 5 machines with 4 GPUs each.
All 5 machines run the script with the `slurm_within_kmeans_server` option.
They determine the number of servers and their own server id via the `SLURM_NPROCS` and `SLURM_PROCID` environment variables.
All machines start `distributed_kmeans.py` in server mode for the slice of the dataset they are responsible for.
In addition, machine #0 also starts the client.
The client learns the addresses of the other servers via the `SLURM_JOB_NODELIST` variable.
It connects to all the servers and performs the clustering.
The output should look like [this gist](https://gist.github.com/mdouze/8d25e89fb4af5093057cae0f917da6cd).
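For reference, each server derives its data slice and port from the Slurm environment with arithmetic like this sketch (mirroring the logic of `slurm_within_kmeans_server` in `run_on_cluster.bash`):

```python
import os

nserv = int(os.environ["SLURM_NPROCS"])  # number of server tasks
rank = int(os.environ["SLURM_PROCID"])   # this task's 0-based id
nvec = 50_000_000                        # number of vectors to cluster
baseport = 12012
i0 = nvec * rank // nserv                # first vector of this slice
i1 = nvec * (rank + 1) // nserv          # one past the last vector
port = baseport + rank                   # one port per server
print(f"server {rank}/{nserv}: vectors {i0}:{i1}, port {port}")
```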
### Run used for deep1B
For the real run, we run the clustering on 50M vectors to 1M centroids.
This is just a matter of using as many machines / GPUs as possible and setting the output centroids file with the `--out filename` option.
Then run
```bash
bash run_on_cluster.bash deep1b_clustering
```
The last lines of output read like:
```bash
Iteration 19 (898.92 s, search 875.71 s): objective=1.33601e+07 imbalance=1.303 nsplit=0
0: writing centroids to /checkpoint/matthijs/ondisk_distributed/1M_centroids.npy
```
This means that the total training time was 899s, of which 876s were used for computation.
However, the computation includes the I/O overhead to the assignment servers.
In this implementation, the overhead of transmitting the data is non-negligible and so is the centroid computation stage.
This is due to the inefficient Python implementation and to the RPC protocol, which is not optimized for broadcast / gather the way MPI is.
However, it is a simple implementation that should run on most clusters.
## Making the trained index
After the centroids are obtained, an empty trained index must be constructed.
This is done by:
- applying a pre-processing stage (a random rotation) to balance the dimensions of the vectors. This can be done after clustering; the centroids are simply rotated as well.
- wrapping the centroids into an HNSW index to speed up the CPU-based assignment of vectors
- training the 6-bit scalar quantizer used to encode the vectors
This is performed by the script [`make_trained_index.py`](make_trained_index.py).
## Building the index by slices
We call the slices "vslices" as they are vertical slices of the big matrix, see explanation in the wiki section [Split across database partitions](https://github.com/facebookresearch/faiss/wiki/Indexing-1T-vectors#split-across-database-partitions).
The script [make_index_vslice.py](make_index_vslice.py) makes an index for a subset of the vectors of the input data and stores it as an independent index.
There are 200 slices of 5M vectors each for Deep1B.
It can be run in a brute-force parallel fashion; there is no constraint on ordering.
To run the script in parallel on a slurm cluster, use:
```bash
bash run_on_cluster.bash make_index_vslices
```
For a real dataset, the data would be read from a DBMS.
In that case, reading the data and indexing it in parallel is worthwhile because reading is very slow.
## Splitting across inverted lists
The 200 slices need to be merged together.
This is done with the script [merge_to_ondisk.py](merge_to_ondisk.py), which memory-maps the 200 vertical slice indexes, extracts a subset of the inverted lists and writes them to a contiguous horizontal slice.
We slice the inverted lists into 50 horizontal slices.
This is run with
```bash
bash run_on_cluster.bash make_index_hslices
```
## Querying the index
At this point the index is ready.
The horizontal slices need to be loaded in the right order and combined into an index to be usable.
This is done in the [combined_index.py](combined_index.py) script.
It provides a `CombinedIndexDeep1B` object that contains an index object that can be searched.
To test, run:
```bash
python combined_index.py
```
The output should look like:
```bash
(faiss_1.5.2) matthijs@devfair0144:~/faiss_versions/faiss_1Tcode/faiss/benchs/distributed_ondisk$ python combined_index.py
reading /checkpoint/matthijs/ondisk_distributed//hslices/slice49.faissindex
loading empty index /checkpoint/matthijs/ondisk_distributed/trained.faissindex
replace invlists
loaded index of size 1000000000
nprobe=1 1-recall@1=0.2904 t=12.35s
nprobe=10 1-recall@1=0.6499 t=17.67s
nprobe=100 1-recall@1=0.8673 t=29.23s
nprobe=1000 1-recall@1=0.9132 t=129.58s
```
i.e. searching is a lot slower than from RAM.
## Distributed query
To reduce the bandwidth required from the machine that does the queries, it is possible to split the search across several search servers.
This way, only the effective results are returned to the main machine.
The search client and server are implemented in [`search_server.py`](search_server.py).
It can be used as a script to start a search server for `CombinedIndexDeep1B` or as a module to load the clients.
The search servers can be started with
```bash
bash run_on_cluster.bash run_search_servers
```
(adjust to the number of servers that can be used).
Then an example of search client is [`distributed_query_demo.py`](distributed_query_demo.py).
It connects to the servers and assigns each of them a subset of the inverted lists to visit.
A typical output is [this gist](https://gist.github.com/mdouze/1585b9854a9a2437d71f2b2c3c05c7c5).
The number in MiB indicates the amount of data that is read from disk to perform the search.
In this case, the scale of the dataset is too small for the distributed search to have much impact, but on datasets > 10x larger, the difference becomes more significant.
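Condensed, the client side looks like the sketch below (hostnames are placeholders; `distributed_query_demo.py` is the full version):

```python
import numpy as np
from faiss.contrib import rpc
import combined_index
import search_server

# local index: performs the coarse quantization
ci = combined_index.CombinedIndexDeep1B()
# one RPC client per remote search server
clients = [rpc.Client(host, 12012, v6=False)
           for host in ["server0", "server1", "server2"]]
# wrapper that splits the inverted lists to visit across the servers
sindex = search_server.SplitPerListIndex(ci, clients)
sindex.set_nprobe(100)
xq = np.zeros((10, 96), dtype='float32')  # placeholder queries (Deep1B is 96-dim)
D, I = sindex.search(xq, 100)
```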
## Conclusion
This code contains the core components to make an index that scales up to 1T vectors.
There are a few simplifications with respect to the index that was actually used in [Indexing 1T vectors](https://github.com/facebookresearch/faiss/wiki/Indexing-1T-vectors).

View File

@@ -0,0 +1,193 @@
#!/usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import os
import faiss
import numpy as np
class CombinedIndex:
"""
combines a set of inverted lists into a hstack
masks part of those lists
adds these inverted lists to an empty index that contains
the info on how to perform searches
"""
def __init__(self, invlist_fnames, empty_index_fname,
masked_index_fname=None):
self.indexes = indexes = []
ilv = faiss.InvertedListsPtrVector()
for fname in invlist_fnames:
if os.path.exists(fname):
print('reading', fname, end='\r', flush=True)
index = faiss.read_index(fname)
indexes.append(index)
il = faiss.extract_index_ivf(index).invlists
else:
raise AssertionError
ilv.push_back(il)
print()
self.big_il = faiss.VStackInvertedLists(ilv.size(), ilv.data())
if masked_index_fname:
self.big_il_base = self.big_il
print('loading', masked_index_fname)
self.masked_index = faiss.read_index(
masked_index_fname,
faiss.IO_FLAG_MMAP | faiss.IO_FLAG_READ_ONLY)
self.big_il = faiss.MaskedInvertedLists(
faiss.extract_index_ivf(self.masked_index).invlists,
self.big_il_base)
print('loading empty index', empty_index_fname)
self.index = faiss.read_index(empty_index_fname)
ntotal = self.big_il.compute_ntotal()
print('replace invlists')
index_ivf = faiss.extract_index_ivf(self.index)
index_ivf.replace_invlists(self.big_il, False)
index_ivf.ntotal = self.index.ntotal = ntotal
index_ivf.parallel_mode = 1 # seems reasonable to do this all the time
quantizer = faiss.downcast_index(index_ivf.quantizer)
quantizer.hnsw.efSearch = 1024
############################################################
# Expose fields and functions of the index as methods so that they
# can be called by RPC
def search(self, x, k):
return self.index.search(x, k)
def range_search(self, x, radius):
return self.index.range_search(x, radius)
def transform_and_assign(self, xq):
index = self.index
if isinstance(index, faiss.IndexPreTransform):
assert index.chain.size() == 1
vt = index.chain.at(0)
xq = vt.apply_py(xq)
# perform quantization
index_ivf = faiss.extract_index_ivf(index)
quantizer = index_ivf.quantizer
coarse_dis, list_nos = quantizer.search(xq, index_ivf.nprobe)
return xq, list_nos, coarse_dis
def ivf_search_preassigned(self, xq, list_nos, coarse_dis, k):
index_ivf = faiss.extract_index_ivf(self.index)
n, d = xq.shape
assert d == index_ivf.d
n2, d2 = list_nos.shape
assert list_nos.shape == coarse_dis.shape
assert n2 == n
assert d2 == index_ivf.nprobe
D = np.empty((n, k), dtype='float32')
I = np.empty((n, k), dtype='int64')
index_ivf.search_preassigned(
n, faiss.swig_ptr(xq), k,
faiss.swig_ptr(list_nos), faiss.swig_ptr(coarse_dis),
faiss.swig_ptr(D), faiss.swig_ptr(I), False)
return D, I
def ivf_range_search_preassigned(self, xq, list_nos, coarse_dis, radius):
index_ivf = faiss.extract_index_ivf(self.index)
n, d = xq.shape
assert d == index_ivf.d
n2, d2 = list_nos.shape
assert list_nos.shape == coarse_dis.shape
assert n2 == n
assert d2 == index_ivf.nprobe
res = faiss.RangeSearchResult(n)
index_ivf.range_search_preassigned(
n, faiss.swig_ptr(xq), radius,
faiss.swig_ptr(list_nos), faiss.swig_ptr(coarse_dis),
res)
lims = faiss.rev_swig_ptr(res.lims, n + 1).copy()
nd = int(lims[-1])
D = faiss.rev_swig_ptr(res.distances, nd).copy()
I = faiss.rev_swig_ptr(res.labels, nd).copy()
return lims, D, I
def set_nprobe(self, nprobe):
index_ivf = faiss.extract_index_ivf(self.index)
index_ivf.nprobe = nprobe
def set_parallel_mode(self, pm):
index_ivf = faiss.extract_index_ivf(self.index)
index_ivf.parallel_mode = pm
def get_ntotal(self):
return self.index.ntotal
def set_prefetch_nthread(self, nt):
for idx in self.indexes:
il = faiss.downcast_InvertedLists(
faiss.extract_index_ivf(idx).invlists)
il.prefetch_nthread
il.prefetch_nthread = nt
def set_omp_num_threads(self, nt):
faiss.omp_set_num_threads(nt)
class CombinedIndexDeep1B(CombinedIndex):
""" loads a CombinedIndex with the data from the big photodna index """
def __init__(self):
# set some paths
workdir = "/checkpoint/matthijs/ondisk_distributed/"
# empty index with the proper quantizer
indexfname = workdir + 'trained.faissindex'
# index that has some invlists that override the big one
masked_index_fname = None
invlist_fnames = [
'%s/hslices/slice%d.faissindex' % (workdir, i)
for i in range(50)
]
CombinedIndex.__init__(self, invlist_fnames, indexfname, masked_index_fname)
def ivecs_read(fname):
a = np.fromfile(fname, dtype='int32')
d = a[0]
return a.reshape(-1, d + 1)[:, 1:].copy()
def fvecs_read(fname):
return ivecs_read(fname).view('float32')
if __name__ == '__main__':
import time
ci = CombinedIndexDeep1B()
print('loaded index of size ', ci.index.ntotal)
deep1bdir = "/datasets01_101/simsearch/041218/deep1b/"
xq = fvecs_read(deep1bdir + "deep1B_queries.fvecs")
gt_fname = deep1bdir + "deep1B_groundtruth.ivecs"
gt = ivecs_read(gt_fname)
for nprobe in 1, 10, 100, 1000:
ci.set_nprobe(nprobe)
t0 = time.time()
D, I = ci.search(xq, 100)
t1 = time.time()
print('nprobe=%d 1-recall@1=%.4f t=%.2fs' % (
nprobe, (I[:, 0] == gt[:, 0]).sum() / len(xq),
t1 - t0
))

View File

@@ -0,0 +1,239 @@
#! /usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
"""
Simple distributed kmeans implementation. Relies on an abstraction
for the training matrix, which can be sharded over several machines.
"""
import os
import sys
import argparse
import numpy as np
import faiss
from multiprocessing.pool import ThreadPool
from faiss.contrib import rpc
from faiss.contrib.datasets import SyntheticDataset
from faiss.contrib.vecs_io import bvecs_mmap, fvecs_mmap
from faiss.contrib.clustering import DatasetAssign, DatasetAssignGPU, kmeans
class DatasetAssignDispatch:
"""dispatches to several other DatasetAssigns and combines the
results"""
def __init__(self, xes, in_parallel):
self.xes = xes
self.d = xes[0].dim()
if not in_parallel:
self.imap = map
else:
self.pool = ThreadPool(len(self.xes))
self.imap = self.pool.imap
self.sizes = list(map(lambda x: x.count(), self.xes))
self.cs = np.cumsum([0] + self.sizes)
def count(self):
return self.cs[-1]
def dim(self):
return self.d
def get_subset(self, indices):
res = np.zeros((len(indices), self.d), dtype='float32')
nos = np.searchsorted(self.cs[1:], indices, side='right')
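# nos[j] = id of the sub-dataset that holds global row indices[j]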
def handle(i):
mask = nos == i
sub_indices = indices[mask] - self.cs[i]
subset = self.xes[i].get_subset(sub_indices)
res[mask] = subset
list(self.imap(handle, range(len(self.xes))))
return res
def assign_to(self, centroids, weights=None):
src = self.imap(
lambda x: x.assign_to(centroids, weights),
self.xes
)
I = []
D = []
sum_per_centroid = None
for Ii, Di, sum_per_centroid_i in src:
I.append(Ii)
D.append(Di)
if sum_per_centroid is None:
sum_per_centroid = sum_per_centroid_i
else:
sum_per_centroid += sum_per_centroid_i
return np.hstack(I), np.hstack(D), sum_per_centroid
class AssignServer(rpc.Server):
""" Assign version that can be exposed via RPC """
def __init__(self, s, assign, log_prefix=''):
rpc.Server.__init__(self, s, log_prefix=log_prefix)
self.assign = assign
def __getattr__(self, f):
return getattr(self.assign, f)
def do_test(todo):
testdata = '/datasets01_101/simsearch/041218/bigann/bigann_learn.bvecs'
if os.path.exists(testdata):
x = bvecs_mmap(testdata)
else:
print("using synthetic dataset")
ds = SyntheticDataset(128, 100000, 0, 0)
x = ds.get_train()
# bad distribution to stress-test split code
xx = x[:100000].copy()
xx[:50000] = x[0]
if "0" in todo:
# reference C++ run
km = faiss.Kmeans(x.shape[1], 1000, niter=20, verbose=True)
km.train(xx.astype('float32'))
if "1" in todo:
# using the Faiss c++ implementation
data = DatasetAssign(xx)
kmeans(1000, data, 20)
if "2" in todo:
# use the dispatch object (on local datasets)
data = DatasetAssignDispatch([
DatasetAssign(xx[20000 * i : 20000 * (i + 1)])
for i in range(5)
], False
)
kmeans(1000, data, 20)
if "3" in todo:
# same, with GPU
ngpu = faiss.get_num_gpus()
print('using %d GPUs' % ngpu)
data = DatasetAssignDispatch([
DatasetAssignGPU(xx[100000 * i // ngpu: 100000 * (i + 1) // ngpu], i)
for i in range(ngpu)
], True
)
kmeans(1000, data, 20)
def main():
parser = argparse.ArgumentParser()
def aa(*args, **kwargs):
group.add_argument(*args, **kwargs)
group = parser.add_argument_group('general options')
aa('--test', default='', help='perform tests (comma-separated numbers)')
aa('--k', default=0, type=int, help='nb centroids')
aa('--seed', default=1234, type=int, help='random seed')
aa('--niter', default=20, type=int, help='nb iterations')
aa('--gpu', default=-2, type=int, help='GPU to use (-2:none, -1: all)')
group = parser.add_argument_group('I/O options')
aa('--indata', default='',
help='data file to load (supported formats: fvecs, bvecs, npy)')
aa('--i0', default=0, type=int, help='first vector to keep')
aa('--i1', default=-1, type=int, help='last vec to keep + 1')
aa('--out', default='', help='file to store centroids')
aa('--store_each_iteration', default=False, action='store_true',
help='store centroid checkpoints')
group = parser.add_argument_group('server options')
aa('--server', action='store_true', default=False, help='run server')
aa('--port', default=12345, type=int, help='server port')
aa('--when_ready', default=None, help='store host:port to this file when ready')
aa('--ipv4', default=False, action='store_true', help='force ipv4')
group = parser.add_argument_group('client options')
aa('--client', action='store_true', default=False, help='run client')
aa('--servers', default='', help='list of server:port separated by spaces')
args = parser.parse_args()
if args.test:
do_test(args.test.split(','))
return
# prepare data matrix (either local or remote)
if args.indata:
print('loading ', args.indata)
if args.indata.endswith('.bvecs'):
x = bvecs_mmap(args.indata)
elif args.indata.endswith('.fvecs'):
x = fvecs_mmap(args.indata)
elif args.indata.endswith('.npy'):
x = np.load(args.indata, mmap_mode='r')
else:
raise AssertionError
if args.i1 == -1:
args.i1 = len(x)
x = x[args.i0:args.i1]
if args.gpu == -2:
data = DatasetAssign(x)
else:
print('moving to GPU')
data = DatasetAssignGPU(x, args.gpu)
elif args.client:
print('connecting to servers')
def connect_client(hostport):
host, port = hostport.split(':')
port = int(port)
print('connecting %s:%d' % (host, port))
client = rpc.Client(host, port, v6=not args.ipv4)
print('client %s:%d ready' % (host, port))
return client
hostports = args.servers.strip().split(' ')
# pool = ThreadPool(len(hostports))
data = DatasetAssignDispatch(
list(map(connect_client, hostports)),
True
)
else:
raise AssertionError
if args.server:
print('starting server')
log_prefix = f"{rpc.socket.gethostname()}:{args.port}"
rpc.run_server(
lambda s: AssignServer(s, data, log_prefix=log_prefix),
args.port, report_to_file=args.when_ready,
v6=not args.ipv4)
else:
print('running kmeans')
centroids = kmeans(args.k, data, niter=args.niter, seed=args.seed,
checkpoint=args.out if args.store_each_iteration else None)
if args.out != '':
print('writing centroids to', args.out)
np.save(args.out, centroids)
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,70 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import os
import faiss
import numpy as np
import time
from faiss.contrib import rpc
import sys
import combined_index
import search_server
hostnames = sys.argv[1:]
print("Load local index")
ci = combined_index.CombinedIndexDeep1B()
print("connect to clients")
clients = []
for host in hostnames:
client = rpc.Client(host, 12012, v6=False)
clients.append(client)
# check if all servers respond
print("sizes seen by servers:", [cl.get_ntotal() for cl in clients])
# aggregate all clients into a one that uses them all for speed
# note that it also requires a local index ci
sindex = search_server.SplitPerListIndex(ci, clients)
sindex.verbose = True
# set reasonable parameters
ci.set_parallel_mode(1)
ci.set_prefetch_nthread(0)
ci.set_omp_num_threads(64)
# initialize params
sindex.set_parallel_mode(1)
sindex.set_prefetch_nthread(0)
sindex.set_omp_num_threads(64)
def ivecs_read(fname):
a = np.fromfile(fname, dtype='int32')
d = a[0]
return a.reshape(-1, d + 1)[:, 1:].copy()
def fvecs_read(fname):
return ivecs_read(fname).view('float32')
deep1bdir = "/datasets01_101/simsearch/041218/deep1b/"
xq = fvecs_read(deep1bdir + "deep1B_queries.fvecs")
gt_fname = deep1bdir + "deep1B_groundtruth.ivecs"
gt = ivecs_read(gt_fname)
for nprobe in 1, 10, 100, 1000:
sindex.set_nprobe(nprobe)
t0 = time.time()
D, I = sindex.search(xq, 100)
t1 = time.time()
print('nprobe=%d 1-recall@1=%.4f t=%.2fs' % (
nprobe, (I[:, 0] == gt[:, 0]).sum() / len(xq),
t1 - t0
))

View File

@@ -0,0 +1,117 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import os
import time
import numpy as np
import faiss
import argparse
from multiprocessing.pool import ThreadPool
def ivecs_mmap(fname):
a = np.memmap(fname, dtype='int32', mode='r')
d = a[0]
return a.reshape(-1, d + 1)[:, 1:]
def fvecs_mmap(fname):
return ivecs_mmap(fname).view('float32')
def produce_batches(args):
x = fvecs_mmap(args.input)
if args.i1 == -1:
args.i1 = len(x)
print("Iterating on vectors %d:%d from %s by batches of size %d" % (
args.i0, args.i1, args.input, args.bs))
for j0 in range(args.i0, args.i1, args.bs):
j1 = min(j0 + args.bs, args.i1)
yield np.arange(j0, j1), x[j0:j1]
def rate_limited_iter(l):
'a thread pre-fetches the next element while the current one is consumed'
pool = ThreadPool(1)
res = None
def next_or_None():
try:
return next(l)
except StopIteration:
return None
while True:
res_next = pool.apply_async(next_or_None)
if res is not None:
res = res.get()
if res is None:
return
yield res
res = res_next
deep1bdir = "/datasets01_101/simsearch/041218/deep1b/"
workdir = "/checkpoint/matthijs/ondisk_distributed/"
def main():
parser = argparse.ArgumentParser(
description='make index for a subset of the data')
def aa(*args, **kwargs):
group.add_argument(*args, **kwargs)
group = parser.add_argument_group('index type')
aa('--inputindex',
default=workdir + 'trained.faissindex',
help='empty input index to fill in')
aa('--nt', default=-1, type=int, help='nb of openmp threads to use')
group = parser.add_argument_group('db options')
aa('--input', default=deep1bdir + "base.fvecs")
aa('--bs', default=2**18, type=int,
help='batch size for db access')
aa('--i0', default=0, type=int, help='lower bound to index')
aa('--i1', default=-1, type=int, help='upper bound of vectors to index')
group = parser.add_argument_group('output')
aa('-o', default='/tmp/x', help='output index')
aa('--keepquantizer', default=False, action='store_true',
help='by default we remove the data from the quantizer to save space')
args = parser.parse_args()
print('args=', args)
print('start accessing data')
src = produce_batches(args)
print('loading index', args.inputindex)
index = faiss.read_index(args.inputindex)
if args.nt != -1:
faiss.omp_set_num_threads(args.nt)
t0 = time.time()
ntot = 0
for ids, x in rate_limited_iter(src):
print('add %d:%d (%.3f s)' % (ntot, ntot + ids.size, time.time() - t0))
index.add_with_ids(np.ascontiguousarray(x, dtype='float32'), ids)
ntot += ids.size
index_ivf = faiss.extract_index_ivf(index)
print('invlists stats: imbalance %.3f' % index_ivf.invlists.imbalance_factor())
index_ivf.invlists.print_stats()
if not args.keepquantizer:
print('resetting quantizer content')
index_ivf = faiss.extract_index_ivf(index)
index_ivf.quantizer.reset()
print('store output', args.o)
faiss.write_index(index, args.o)
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,52 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import numpy as np
import faiss
deep1bdir = "/datasets01_101/simsearch/041218/deep1b/"
workdir = "/checkpoint/matthijs/ondisk_distributed/"
print('Load centroids')
centroids = np.load(workdir + '1M_centroids.npy')
ncent, d = centroids.shape
print('apply random rotation')
rrot = faiss.RandomRotationMatrix(d, d)
rrot.init(1234)
centroids = rrot.apply_py(centroids)
print('make HNSW index as quantizer')
quantizer = faiss.IndexHNSWFlat(d, 32)
quantizer.hnsw.efSearch = 1024
quantizer.hnsw.efConstruction = 200
quantizer.add(centroids)
print('build index')
index = faiss.IndexPreTransform(
rrot,
faiss.IndexIVFScalarQuantizer(
quantizer, d, ncent, faiss.ScalarQuantizer.QT_6bit
)
)
def ivecs_mmap(fname):
a = np.memmap(fname, dtype='int32', mode='r')
d = a[0]
return a.reshape(-1, d + 1)[:, 1:]
def fvecs_mmap(fname):
return ivecs_mmap(fname).view('float32')
print('finish training index')
xt = fvecs_mmap(deep1bdir + 'learn.fvecs')
xt = np.ascontiguousarray(xt[:256 * 1000], dtype='float32')
index.train(xt)
print('write output')
faiss.write_index(index, workdir + 'trained.faissindex')

View File

@@ -0,0 +1,96 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import os
import faiss
import argparse
from multiprocessing.pool import ThreadPool
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--inputs', nargs='*', required=True,
help='input indexes to merge')
parser.add_argument('--l0', type=int, default=0)
parser.add_argument('--l1', type=int, default=-1)
parser.add_argument('--nt', default=-1, type=int,
help='nb threads')
parser.add_argument('--output', required=True,
help='output index filename')
parser.add_argument('--outputIL',
help='output invfile filename')
args = parser.parse_args()
if args.nt != -1:
print('set nb of threads to', args.nt)
faiss.omp_set_num_threads(args.nt)
ils = faiss.InvertedListsPtrVector()
ils_dont_dealloc = []
pool = ThreadPool(20)
def load_index(fname):
print("loading", fname)
try:
index = faiss.read_index(fname, faiss.IO_FLAG_MMAP | faiss.IO_FLAG_READ_ONLY)
except RuntimeError as e:
print('could not load %s: %s' % (fname, e))
return fname, None
print(" %d entries" % index.ntotal)
return fname, index
index0 = None
for _, index in pool.imap(load_index, args.inputs):
if index is None:
continue
index_ivf = faiss.extract_index_ivf(index)
il = faiss.downcast_InvertedLists(index_ivf.invlists)
index_ivf.invlists = None
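# detach the invlists from the index and give Python ownership so they outlive it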
il.this.own()
ils_dont_dealloc.append(il)
if (args.l0, args.l1) != (0, -1):
print('restricting to lists %d:%d' % (args.l0, args.l1))
# il = faiss.SliceInvertedLists(il, args.l0, args.l1)
il.crop_invlists(args.l0, args.l1)
ils_dont_dealloc.append(il)
ils.push_back(il)
if index0 is None:
index0 = index
print("loaded %d invlists" % ils.size())
if not args.outputIL:
args.outputIL = args.output + '_invlists'
il0 = ils.at(0)
il = faiss.OnDiskInvertedLists(
il0.nlist, il0.code_size,
args.outputIL)
print("perform merge")
ntotal = il.merge_from(ils.data(), ils.size(), True)
print("swap into index0")
index0_ivf = faiss.extract_index_ivf(index0)
index0_ivf.nlist = il0.nlist
index0_ivf.ntotal = index0.ntotal = ntotal
index0_ivf.invlists = il
index0_ivf.own_invlists = False
print("write", args.output)
faiss.write_index(index0, args.output)

View File

@@ -0,0 +1,263 @@
#! /bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
set -e
todo=$1
# other options can be transmitted
shift
# the training data of the Deep1B dataset
deep1bdir=/datasets01_101/simsearch/041218/deep1b
traindata=$deep1bdir/learn.fvecs
# this is for small tests
nvec=1000000
k=4000
# for the real run
# nvec=50000000
# k=1000000
# working directory for the real run
workdir=/checkpoint/matthijs/ondisk_distributed
mkdir -p $workdir/{vslices,hslices}
if [ -z "$todo" ]; then
echo "nothing to do"
exit 1
elif [ $todo == test_kmeans_0 ]; then
# non distributed baseline
python distributed_kmeans.py \
--indata $traindata --i1 $nvec \
--k $k
elif [ $todo == test_kmeans_1 ]; then
# using all the machine's GPUs
python distributed_kmeans.py \
--indata $traindata --i1 $nvec \
--k $k --gpu -1
elif [ $todo == test_kmeans_2 ]; then
# distributed run, with one local server per GPU
ngpu=$( echo /dev/nvidia? | wc -w )
baseport=12012
# kill background processes on exit of this script
trap 'kill -HUP 0' 0
hostports=''
for((gpu=0;gpu<ngpu;gpu++)); do
# range of vectors to assign to each server
i0=$((nvec * gpu / ngpu))
i1=$((nvec * (gpu + 1) / ngpu))
port=$(( baseport + gpu ))
echo "start server $gpu for range $i0:$i1"
python distributed_kmeans.py \
--indata $traindata \
--i0 $i0 --i1 $i1 \
--server --gpu $gpu \
--port $port --ipv4 &
hostports="$hostports localhost:$port"
done
# lame way of making sure all servers are running
sleep 5s
python distributed_kmeans.py \
--client --servers "$hostports" \
--k $k --ipv4
elif [ $todo == slurm_distributed_kmeans ]; then
nserv=5
srun -n$nserv \
--time=48:00:00 \
--cpus-per-task=40 --gres=gpu:4 --mem=100G \
--partition=priority --comment='priority is the only one that works' \
-l bash $( realpath $0 ) slurm_within_kmeans_server
elif [ $todo == slurm_within_kmeans_server ]; then
nserv=$SLURM_NPROCS
[ ! -z "$nserv" ] || (echo "should be run by slurm"; exit 1)
rank=$SLURM_PROCID
baseport=12012
i0=$((nvec * rank / nserv))
i1=$((nvec * (rank + 1) / nserv))
port=$(( baseport + rank ))
echo "host $(hostname) start server $rank for range $i0:$i1 port $port"
if [ $rank != 0 ]; then
python -u distributed_kmeans.py \
--indata $traindata \
--i0 $i0 --i1 $i1 \
--server --gpu -1 \
--port $port --ipv4
else
# master process
# kill background processes on exit of this script
trap 'kill -HUP 0' 0
python -u distributed_kmeans.py \
--indata $traindata \
--i0 $i0 --i1 $i1 \
--server --gpu -1 \
--port $port --ipv4 &
# Slurm has a somewhat convoluted way of specifying the nodes
# assigned to each task. This is to parse the SLURM_TASKS_PER_NODE variable
function parse_tasks_per_node () {
local blocks=$1
for block in ${blocks//,/ }; do
if [ ${block/x/} != $block ]; then
tpn="${block%(*}"
repeat=${block#*x}
repeat=${repeat%?}
for((i=0;i<repeat;i++)); do
echo $tpn
done
else
echo $block
fi
done
}
hostports=""
port=$baseport
echo VARS $SLURM_TASKS_PER_NODE $SLURM_JOB_NODELIST
tasks_per_node=( $( parse_tasks_per_node $SLURM_TASKS_PER_NODE ) )
nodes=( $( scontrol show hostnames $SLURM_JOB_NODELIST ) )
n=${#nodes[*]}
for((i=0;i<n;i++)); do
hostname=${nodes[i]}
for((j=0;j<tasks_per_node[i];j++)); do
hostports="$hostports $hostname:$port"
((port++))
done
done
echo HOSTPORTS $hostports
sleep 20s
# run client
python distributed_kmeans.py \
--client --servers "$hostports" \
--k $k --ipv4 "$@"
echo "Done, kill the job"
scancel $SLURM_JOBID
fi
elif [ $todo == deep1b_clustering ]; then
# also set nvec=500M and k=10M in the top of the file
nserv=20
srun -n$nserv \
--time=48:00:00 \
--cpus-per-task=40 --gres=gpu:4 --mem=100G \
--partition=priority --comment='priority is the only one that works' \
-l bash $( realpath $0 ) slurm_within_kmeans_server \
--out $workdir/1M_centroids.npy
elif [ $todo == make_index_vslices ]; then
# vslice: slice per database shards
nvec=1000000000
nslice=200
for((i=0;i<nslice;i++)); do
i0=$((nvec * i / nslice))
i1=$((nvec * (i + 1) / nslice))
# make the script to be run by sbatch
cat > $workdir/vslices/slice$i.bash <<EOF
#!/bin/bash
srun python -u make_index_vslice.py \
--inputindex $workdir/trained.faissindex \
--input $deep1bdir/base.fvecs \
--nt 40 \
--i0 $i0 --i1 $i1 \
-o $workdir/vslices/slice$i.faissindex
EOF
# specify resources for script and run it
sbatch -n1 \
--time=48:00:00 \
--cpus-per-task=40 --gres=gpu:0 --mem=200G \
--output=$workdir/vslices/slice$i.log \
--job-name=vslice$i.c \
$workdir/vslices/slice$i.bash
echo "logs in $workdir/vslices/slice$i.log"
done
elif [ $todo == make_index_hslices ]; then
# hslice: slice per inverted lists
nlist=1000000
nslice=50
for((i=0;i<nslice;i++)); do
i0=$((nlist * i / nslice))
i1=$((nlist * (i + 1) / nslice))
# make the script to be run by sbatch
cat > $workdir/hslices/slice$i.bash <<EOF
#!/bin/bash
srun python -u merge_to_ondisk.py \
--input $workdir/vslices/slice{0..199}.faissindex \
--nt 20 \
--l0 $i0 --l1 $i1 \
--output $workdir/hslices/slice$i.faissindex \
--outputIL $workdir/hslices/slice$i.invlists
EOF
# specify resources for script and run it
sbatch -n1 \
--time=48:00:00 \
--cpus-per-task=20 --gres=gpu:0 --mem=200G \
--output=$workdir/hslices/slice$i.log \
--job-name=hslice$i.a \
--constraint=pascal \
$workdir/hslices/slice$i.bash
echo "logs in $workdir/hslices/slice$i.log"
done
elif [ $todo == run_search_servers ]; then
nserv=3
srun -n$nserv \
--time=48:00:00 \
--cpus-per-task=64 --gres=gpu:0 --mem=100G \
--constraint=pascal \
--partition=priority --comment='priority is the only one that works' \
-l python -u search_server.py --port 12012
else
echo "unknown todo $todo"
exit 1
fi

View File

@@ -0,0 +1,222 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import time
from faiss.contrib import rpc
import combined_index
import argparse
############################################################
# Server implementation
############################################################
class MyServer(rpc.Server):
""" Assign version that can be exposed via RPC """
def __init__(self, s, index):
rpc.Server.__init__(self, s)
self.index = index
def __getattr__(self, f):
return getattr(self.index, f)
def main():
parser = argparse.ArgumentParser()
def aa(*args, **kwargs):
group.add_argument(*args, **kwargs)
group = parser.add_argument_group('server options')
aa('--port', default=12012, type=int, help='server port')
aa('--when_ready_dir', default=None,
help='store host:port to this file when ready')
aa('--ipv4', default=False, action='store_true', help='force ipv4')
aa('--rank', default=0, type=int,
help='rank used as index in the client table')
args = parser.parse_args()
when_ready = None
if args.when_ready_dir:
when_ready = '%s/%d' % (args.when_ready_dir, args.rank)
print('loading index')
index = combined_index.CombinedIndexDeep1B()
print('starting server')
rpc.run_server(
lambda s: MyServer(s, index),
args.port, report_to_file=when_ready,
v6=not args.ipv4)
if __name__ == '__main__':
main()
############################################################
# Client implementation
############################################################
from multiprocessing.pool import ThreadPool
import faiss
import numpy as np
class ResultHeap:
""" Combine query results from a sliced dataset (for k-nn search) """
def __init__(self, nq, k):
" nq: number of query vectors, k: number of results per query "
self.I = np.zeros((nq, k), dtype='int64')
self.D = np.zeros((nq, k), dtype='float32')
self.nq, self.k = nq, k
heaps = faiss.float_maxheap_array_t()
heaps.k = k
heaps.nh = nq
heaps.val = faiss.swig_ptr(self.D)
heaps.ids = faiss.swig_ptr(self.I)
heaps.heapify()
self.heaps = heaps
def add_batch_result(self, D, I, i0):
assert D.shape == (self.nq, self.k)
assert I.shape == (self.nq, self.k)
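# shift the ids by the slice offset before merging into the global heap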
I += i0
self.heaps.addn_with_ids(
self.k, faiss.swig_ptr(D),
faiss.swig_ptr(I), self.k)
def finalize(self):
self.heaps.reorder()
def distribute_weights(weights, nbin):
""" assign a set of weights to a smaller set of bins to balance them """
nw = weights.size
o = weights.argsort()
bins = np.zeros(nbin)
assign = np.ones(nw, dtype=int)
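# greedy balancing: walk the weights from heaviest to lightest, always filling the lightest bin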
for i in o[::-1]:
b = bins.argmin()
assign[i] = b
bins[b] += weights[i]
return bins, assign
class SplitPerListIndex:
"""manages a local index, that does the coarse quantization and a set
of sub_indexes. The sub_indexes search a subset of the inverted
lists. The SplitPerListIndex merges results from the sub-indexes"""
def __init__(self, index, sub_indexes):
self.index = index
self.code_size = faiss.extract_index_ivf(index.index).code_size
self.sub_indexes = sub_indexes
self.ni = len(self.sub_indexes)
# pool of threads. Each thread manages one sub-index.
self.pool = ThreadPool(self.ni)
self.verbose = False
def set_nprobe(self, nprobe):
self.index.set_nprobe(nprobe)
self.pool.map(
lambda i: self.sub_indexes[i].set_nprobe(nprobe),
range(self.ni)
)
def set_omp_num_threads(self, nt):
faiss.omp_set_num_threads(nt)
self.pool.map(
lambda idx: idx.set_omp_num_threads(nt),
self.sub_indexes
)
def set_parallel_mode(self, pm):
self.index.set_parallel_mode(pm)
self.pool.map(
lambda idx: idx.set_parallel_mode(pm),
self.sub_indexes
)
def set_prefetch_nthread(self, nt):
self.index.set_prefetch_nthread(nt)
self.pool.map(
lambda idx: idx.set_prefetch_nthread(nt),
self.sub_indexes
)
def balance_lists(self, list_nos):
big_il = self.index.big_il
weights = np.array([big_il.list_size(int(i))
for i in list_nos.ravel()])
bins, assign = distribute_weights(weights, self.ni)
if self.verbose:
print('bins weight range %d:%d total %d (%.2f MiB)' % (
bins.min(), bins.max(), bins.sum(),
bins.sum() * (self.code_size + 8) / 2 ** 20))
self.nscan = bins.sum()
return assign.reshape(list_nos.shape)
def search(self, x, k):
xqo, list_nos, coarse_dis = self.index.transform_and_assign(x)
assign = self.balance_lists(list_nos)
def do_query(i):
sub_index = self.sub_indexes[i]
list_nos_i = list_nos.copy()
list_nos_i[assign != i] = -1
t0 = time.time()
Di, Ii = sub_index.ivf_search_preassigned(
xqo, list_nos_i, coarse_dis, k)
#print(list_nos_i, Ii)
if self.verbose:
print('client %d: %.3f s' % (i, time.time() - t0))
return Di, Ii
rh = ResultHeap(x.shape[0], k)
for Di, Ii in self.pool.imap(do_query, range(self.ni)):
#print("ADD", Ii, rh.I)
rh.add_batch_result(Di, Ii, 0)
rh.finalize()
return rh.D, rh.I
def range_search(self, x, radius):
xqo, list_nos, coarse_dis = self.index.transform_and_assign(x)
assign = self.balance_lists(list_nos)
nq = len(x)
def do_query(i):
sub_index = self.sub_indexes[i]
list_nos_i = list_nos.copy()
list_nos_i[assign != i] = -1
t0 = time.time()
limi, Di, Ii = sub_index.ivf_range_search_preassigned(
xqo, list_nos_i, coarse_dis, radius)
if self.verbose:
print('slice %d: %.3f s' % (i, time.time() - t0))
return limi, Di, Ii
D = [[] for i in range(nq)]
I = [[] for i in range(nq)]
sizes = np.zeros(nq, dtype=int)
for lims, Di, Ii in self.pool.imap(do_query, range(self.ni)):
for i in range(nq):
l0, l1 = lims[i:i + 2]
D[i].append(Di[l0:l1])
I[i].append(Ii[l0:l1])
sizes[i] += l1 - l0
lims = np.zeros(nq + 1, dtype=int)
lims[1:] = np.cumsum(sizes)
D = np.hstack([j for i in D for j in i])
I = np.hstack([j for i in I for j in i])
return lims, D, I

View File

@@ -0,0 +1,88 @@
#! /usr/bin/env python2
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from __future__ import print_function
import numpy as np
import time
import faiss
import sys
# Get command-line arguments
k = int(sys.argv[1])
ngpu = int(sys.argv[2])
# Load Leon's file format
def load_mnist(fname):
print("load", fname)
f = open(fname, 'rb')
header = np.fromfile(f, dtype='int8', count=4*4)
header = header.reshape(4, 4)[:, ::-1].copy().view('int32')
print(header)
nim, xd, yd = [int(x) for x in header[1:]]
data = np.fromfile(f, count=nim * xd * yd,
dtype='uint8')
print(data.shape, nim, xd, yd)
data = data.reshape(nim, xd, yd)
return data
basedir = "/path/to/mnist/data"
x = load_mnist(basedir + 'mnist8m/mnist8m-patterns-idx3-ubyte')
print("reshape")
x = x.reshape(x.shape[0], -1).astype('float32')
def train_kmeans(x, k, ngpu):
"Runs kmeans on one or several GPUs"
d = x.shape[1]
clus = faiss.Clustering(d, k)
clus.verbose = True
clus.niter = 20
# otherwise the kmeans implementation sub-samples the training set
clus.max_points_per_centroid = 10000000
res = [faiss.StandardGpuResources() for i in range(ngpu)]
flat_config = []
for i in range(ngpu):
cfg = faiss.GpuIndexFlatConfig()
cfg.useFloat16 = False
cfg.device = i
flat_config.append(cfg)
if ngpu == 1:
index = faiss.GpuIndexFlatL2(res[0], d, flat_config[0])
else:
indexes = [faiss.GpuIndexFlatL2(res[i], d, flat_config[i])
for i in range(ngpu)]
index = faiss.IndexReplicas()
for sub_index in indexes:
index.addIndex(sub_index)
# perform the training
clus.train(x, index)
centroids = faiss.vector_float_to_array(clus.centroids)
obj = faiss.vector_float_to_array(clus.obj)
print("final objective: %.4g" % obj[-1])
return centroids.reshape(k, d)
print("run")
t0 = time.time()
train_kmeans(x, k, ngpu)
t1 = time.time()
print("total runtime: %.3f s" % (t1 - t0))

View File

@@ -0,0 +1,25 @@
README for the link & code implementation
=========================================
What is this?
-------------
Link & code is an indexing method that combines HNSW indexing with
compression and exploits the neighborhood structure of the similarity
graph to improve the reconstruction. It is described in
```
@inproceedings{link_and_code,
author = {Matthijs Douze and Alexandre Sablayrolles and Herv\'e J\'egou},
title = {Link and code: Fast indexing with graphs and compact regression codes},
booktitle = {CVPR},
year = {2018}
}
```
ArXiV [here](https://arxiv.org/abs/1804.09996)
The necessary code for this paper was removed from Faiss in version 1.8.0.
For a functioning version, use Faiss 1.7.4.

View File

@@ -0,0 +1,113 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#include "AutoTune_c.h"
#include <faiss/AutoTune.h>
#include <cstring>
#include "macros_impl.h"
using faiss::Index;
using faiss::ParameterRange;
using faiss::ParameterSpace;
const char* faiss_ParameterRange_name(const FaissParameterRange* range) {
return reinterpret_cast<const ParameterRange*>(range)->name.c_str();
}
void faiss_ParameterRange_values(
FaissParameterRange* range,
double** p_values,
size_t* p_size) {
auto& values = reinterpret_cast<ParameterRange*>(range)->values;
*p_values = values.data();
*p_size = values.size();
}
int faiss_ParameterSpace_new(FaissParameterSpace** space) {
try {
auto new_space = new ParameterSpace();
*space = reinterpret_cast<FaissParameterSpace*>(new_space);
}
CATCH_AND_HANDLE
}
DEFINE_DESTRUCTOR(ParameterSpace)
size_t faiss_ParameterSpace_n_combinations(const FaissParameterSpace* space) {
return reinterpret_cast<const ParameterSpace*>(space)->n_combinations();
}
int faiss_ParameterSpace_combination_name(
const FaissParameterSpace* space,
size_t cno,
char* char_buffer,
size_t size) {
try {
auto rep = reinterpret_cast<const ParameterSpace*>(space)
->combination_name(cno);
strncpy(char_buffer, rep.c_str(), size);
}
CATCH_AND_HANDLE
}
int faiss_ParameterSpace_set_index_parameters(
const FaissParameterSpace* space,
FaissIndex* cindex,
const char* param_string) {
try {
auto index = reinterpret_cast<Index*>(cindex);
reinterpret_cast<const ParameterSpace*>(space)->set_index_parameters(
index, param_string);
}
CATCH_AND_HANDLE
}
/// set a combination of parameters on an index
int faiss_ParameterSpace_set_index_parameters_cno(
const FaissParameterSpace* space,
FaissIndex* cindex,
size_t cno) {
try {
auto index = reinterpret_cast<Index*>(cindex);
reinterpret_cast<const ParameterSpace*>(space)->set_index_parameters(
index, cno);
}
CATCH_AND_HANDLE
}
int faiss_ParameterSpace_set_index_parameter(
const FaissParameterSpace* space,
FaissIndex* cindex,
const char* name,
double value) {
try {
auto index = reinterpret_cast<Index*>(cindex);
reinterpret_cast<const ParameterSpace*>(space)->set_index_parameter(
index, name, value);
}
CATCH_AND_HANDLE
}
void faiss_ParameterSpace_display(const FaissParameterSpace* space) {
reinterpret_cast<const ParameterSpace*>(space)->display();
}
int faiss_ParameterSpace_add_range(
FaissParameterSpace* space,
const char* name,
FaissParameterRange** p_range) {
try {
ParameterRange& range =
reinterpret_cast<ParameterSpace*>(space)->add_range(name);
if (p_range) {
*p_range = reinterpret_cast<FaissParameterRange*>(&range);
}
}
CATCH_AND_HANDLE
}

View File

@@ -0,0 +1,82 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c -*-
#ifndef FAISS_AUTO_TUNE_C_H
#define FAISS_AUTO_TUNE_C_H
#include "Index_c.h"
#include "faiss_c.h"
#ifdef __cplusplus
extern "C" {
#endif
/// possible values of a parameter, sorted from least to most expensive/accurate
FAISS_DECLARE_CLASS(ParameterRange)
FAISS_DECLARE_GETTER(ParameterRange, const char*, name)
/// Getter for the values in the range. The output values are invalidated
/// upon any other modification of the range.
void faiss_ParameterRange_values(FaissParameterRange*, double**, size_t*);
/** Uses a-priori knowledge on the Faiss indexes to extract tunable parameters.
*/
FAISS_DECLARE_CLASS(ParameterSpace)
FAISS_DECLARE_DESTRUCTOR(ParameterSpace)
/// Parameter space default constructor
int faiss_ParameterSpace_new(FaissParameterSpace** space);
/// nb of combinations, = product of values sizes
size_t faiss_ParameterSpace_n_combinations(const FaissParameterSpace*);
/// get string representation of the combination
/// by writing it to the given character buffer.
/// A buffer size of 1000 ensures that the full name is collected.
int faiss_ParameterSpace_combination_name(
const FaissParameterSpace*,
size_t,
char*,
size_t);
/// set a combination of parameters described by a string
int faiss_ParameterSpace_set_index_parameters(
const FaissParameterSpace*,
FaissIndex*,
const char*);
/// set a combination of parameters on an index
int faiss_ParameterSpace_set_index_parameters_cno(
const FaissParameterSpace*,
FaissIndex*,
size_t);
/// set one of the parameters
int faiss_ParameterSpace_set_index_parameter(
const FaissParameterSpace*,
FaissIndex*,
const char*,
double);
/// print a description on stdout
void faiss_ParameterSpace_display(const FaissParameterSpace*);
/// add a new parameter (or return it if it exists)
int faiss_ParameterSpace_add_range(
FaissParameterSpace*,
const char*,
FaissParameterRange**);
#ifdef __cplusplus
}
#endif
#endif

View File

@@ -0,0 +1,166 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
cmake_minimum_required(VERSION 3.17 FATAL_ERROR)
project(faiss_c_library LANGUAGES C CXX)
set(CMAKE_C_STANDARD 11)
set(FAISS_C_SRC
AutoTune_c.cpp
Clustering_c.cpp
IndexFlat_c.cpp
IndexIVFFlat_c.cpp
IndexIVF_c.cpp
IndexLSH_c.cpp
IndexPreTransform_c.cpp
VectorTransform_c.cpp
IndexShards_c.cpp
IndexReplicas_c.cpp
Index_c.cpp
IndexBinary_c.cpp
IndexScalarQuantizer_c.cpp
MetaIndexes_c.cpp
clone_index_c.cpp
error_impl.cpp
index_factory_c.cpp
index_io_c.cpp
impl/AuxIndexStructures_c.cpp
utils/distances_c.cpp
utils/utils_c.cpp
)
add_library(faiss_c ${FAISS_C_SRC})
target_link_libraries(faiss_c PRIVATE faiss)
add_library(faiss_c_avx2 ${FAISS_C_SRC})
target_link_libraries(faiss_c_avx2 PRIVATE faiss_avx2)
if(NOT FAISS_OPT_LEVEL STREQUAL "avx2" AND NOT FAISS_OPT_LEVEL STREQUAL "avx512" AND NOT FAISS_OPT_LEVEL STREQUAL "avx512_spr")
set_target_properties(faiss_c_avx2 PROPERTIES EXCLUDE_FROM_ALL TRUE)
endif()
if(NOT WIN32)
target_compile_options(faiss_c_avx2 PRIVATE $<$<COMPILE_LANGUAGE:CXX>:-mavx2 -mfma -mf16c -mpopcnt>)
else()
# MSVC enables FMA with /arch:AVX2; no separate flags for F16C, POPCNT
# Ref. FMA (under /arch:AVX2): https://docs.microsoft.com/en-us/cpp/build/reference/arch-x64
# Ref. F16C (2nd paragraph): https://walbourn.github.io/directxmath-avx2/
# Ref. POPCNT: https://docs.microsoft.com/en-us/cpp/intrinsics/popcnt16-popcnt-popcnt64
target_compile_options(faiss_c_avx2 PRIVATE $<$<COMPILE_LANGUAGE:CXX>:/arch:AVX2>)
endif()
add_library(faiss_c_avx512 ${FAISS_C_SRC})
target_link_libraries(faiss_c_avx512 PRIVATE faiss_avx512)
if(NOT FAISS_OPT_LEVEL STREQUAL "avx512")
set_target_properties(faiss_c_avx512 PROPERTIES EXCLUDE_FROM_ALL TRUE)
endif()
if(NOT WIN32)
# All modern CPUs support F, CD, VL, DQ, BW extensions.
# Ref: https://en.wikipedia.org/wiki/AVX512
target_compile_options(faiss_c_avx512 PRIVATE $<$<COMPILE_LANGUAGE:CXX>:-mavx2 -mfma -mf16c -mavx512f -mavx512cd -mavx512vl -mavx512dq -mavx512bw -mpopcnt>)
else()
target_compile_options(faiss_c_avx512 PRIVATE $<$<COMPILE_LANGUAGE:CXX>:/arch:AVX512>)
endif()
add_library(faiss_c_avx512_spr ${FAISS_C_SRC})
target_link_libraries(faiss_c_avx512_spr PRIVATE faiss_avx512_spr)
if(NOT FAISS_OPT_LEVEL STREQUAL "avx512_spr")
set_target_properties(faiss_c_avx512_spr PROPERTIES EXCLUDE_FROM_ALL TRUE)
endif()
if(NOT WIN32)
# Architecture mode to support AVX512 extensions available since Intel(R) Sapphire Rapids.
# Ref: https://networkbuilders.intel.com/solutionslibrary/intel-avx-512-fp16-instruction-set-for-intel-xeon-processor-based-products-technology-guide
target_compile_options(faiss_c_avx512_spr PRIVATE $<$<COMPILE_LANGUAGE:CXX>:-march=sapphirerapids -mtune=sapphirerapids>)
else()
target_compile_options(faiss_c_avx512_spr PRIVATE $<$<COMPILE_LANGUAGE:CXX>:/arch:AVX512>)
endif()
add_library(faiss_c_sve ${FAISS_C_SRC})
target_link_libraries(faiss_c_sve PRIVATE faiss_sve)
if(NOT FAISS_OPT_LEVEL STREQUAL "sve")
set_target_properties(faiss_c_sve PROPERTIES EXCLUDE_FROM_ALL TRUE)
endif()
if(NOT WIN32)
if("${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_DEBUG} " MATCHES "(^| )-march=native")
# Do nothing, expect SVE to be enabled by -march=native
elseif("${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_DEBUG} " MATCHES "(^| )(-march=armv[0-9]+(\\.[1-9]+)?-[^+ ](\\+[^+$ ]+)*)")
# Add +sve
target_compile_options(faiss_c_sve PRIVATE $<$<AND:$<COMPILE_LANGUAGE:CXX>,$<CONFIG:DEBUG>>:${CMAKE_MATCH_2}+sve>)
elseif(NOT "${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_DEBUG} " MATCHES "(^| )-march=armv")
# No valid -march, so specify -march=armv8-a+sve as the default
target_compile_options(faiss_c_sve PRIVATE $<$<AND:$<COMPILE_LANGUAGE:CXX>,$<CONFIG:DEBUG>>:-march=armv8-a+sve>)
endif()
if("${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_RELEASE} " MATCHES "(^| )-march=native")
# Do nothing, expect SVE to be enabled by -march=native
elseif("${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_RELEASE} " MATCHES "(^| )(-march=armv[0-9]+(\\.[1-9]+)?-[^+ ](\\+[^+$ ]+)*)")
# Add +sve
target_compile_options(faiss_c_sve PRIVATE $<$<AND:$<COMPILE_LANGUAGE:CXX>,$<CONFIG:RELEASE>>:${CMAKE_MATCH_2}+sve>)
elseif(NOT "${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_RELEASE} " MATCHES "(^| )-march=armv")
# No valid -march, so specify -march=armv8-a+sve as the default
target_compile_options(faiss_c_sve PRIVATE $<$<AND:$<COMPILE_LANGUAGE:CXX>,$<CONFIG:RELEASE>>:-march=armv8-a+sve>)
endif()
endif()
function(faiss_install_headers headers p)
foreach(h ${headers})
get_filename_component(f ${h} DIRECTORY)
install(FILES ${h}
DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/faiss/${p}/${f}
)
endforeach()
endfunction()
file(GLOB FAISS_C_API_HEADERS
RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
"*.h"
"impl/*.h"
"utils/*.h")
faiss_install_headers("${FAISS_C_API_HEADERS}" c_api)
install(TARGETS faiss_c
EXPORT faiss-targets
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
)
if(FAISS_OPT_LEVEL STREQUAL "avx2")
install(TARGETS faiss_c_avx2
EXPORT faiss-targets
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
)
endif()
if(FAISS_OPT_LEVEL STREQUAL "avx512")
install(TARGETS faiss_c_avx2 faiss_c_avx512
EXPORT faiss-targets
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
)
endif()
if(FAISS_OPT_LEVEL STREQUAL "avx512_spr")
install(TARGETS faiss_c_avx2 faiss_c_avx512_spr
EXPORT faiss-targets
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
)
endif()
if(FAISS_OPT_LEVEL STREQUAL "sve")
install(TARGETS faiss_c_sve
EXPORT faiss-targets
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
)
endif()
add_executable(example_c EXCLUDE_FROM_ALL example_c.c)
target_link_libraries(example_c PRIVATE faiss_c)
if(FAISS_ENABLE_GPU)
if(FAISS_ENABLE_ROCM)
add_subdirectory(gpu-rocm)
else ()
add_subdirectory(gpu)
endif()
endif()

View File

@@ -0,0 +1,170 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#include "Clustering_c.h"
#include <faiss/Clustering.h>
#include <faiss/Index.h>
#include <vector>
#include "macros_impl.h"
extern "C" {
using faiss::Clustering;
using faiss::ClusteringIterationStats;
using faiss::ClusteringParameters;
using faiss::Index;
DEFINE_GETTER(Clustering, int, niter)
DEFINE_GETTER(Clustering, int, nredo)
DEFINE_GETTER(Clustering, int, verbose)
DEFINE_GETTER(Clustering, int, spherical)
DEFINE_GETTER(Clustering, int, int_centroids)
DEFINE_GETTER(Clustering, int, update_index)
DEFINE_GETTER(Clustering, int, frozen_centroids)
DEFINE_GETTER(Clustering, int, min_points_per_centroid)
DEFINE_GETTER(Clustering, int, max_points_per_centroid)
DEFINE_GETTER(Clustering, int, seed)
DEFINE_GETTER(Clustering, size_t, decode_block_size)
/// getter for d
DEFINE_GETTER(Clustering, size_t, d)
/// getter for k
DEFINE_GETTER(Clustering, size_t, k)
DEFINE_GETTER(ClusteringIterationStats, float, obj)
DEFINE_GETTER(ClusteringIterationStats, double, time)
DEFINE_GETTER(ClusteringIterationStats, double, time_search)
DEFINE_GETTER(ClusteringIterationStats, double, imbalance_factor)
DEFINE_GETTER(ClusteringIterationStats, int, nsplit)
void faiss_ClusteringParameters_init(FaissClusteringParameters* params) {
ClusteringParameters d;
params->frozen_centroids = d.frozen_centroids;
params->max_points_per_centroid = d.max_points_per_centroid;
params->min_points_per_centroid = d.min_points_per_centroid;
params->niter = d.niter;
params->nredo = d.nredo;
params->seed = d.seed;
params->spherical = d.spherical;
params->int_centroids = d.int_centroids;
params->update_index = d.update_index;
params->verbose = d.verbose;
params->decode_block_size = d.decode_block_size;
}
// This conversion is required because the two types are not memory-compatible
inline ClusteringParameters from_faiss_c(
const FaissClusteringParameters* params) {
ClusteringParameters o;
o.frozen_centroids = params->frozen_centroids;
o.max_points_per_centroid = params->max_points_per_centroid;
o.min_points_per_centroid = params->min_points_per_centroid;
o.niter = params->niter;
o.nredo = params->nredo;
o.seed = params->seed;
o.spherical = params->spherical;
o.update_index = params->update_index;
o.int_centroids = params->int_centroids;
o.verbose = params->verbose;
o.decode_block_size = params->decode_block_size;
return o;
}
/// getter for centroids (size = k * d)
void faiss_Clustering_centroids(
FaissClustering* clustering,
float** centroids,
size_t* size) {
std::vector<float>& v =
reinterpret_cast<Clustering*>(clustering)->centroids;
if (centroids) {
*centroids = v.data();
}
if (size) {
*size = v.size();
}
}
/// getter for iteration stats
void faiss_Clustering_iteration_stats(
FaissClustering* clustering,
FaissClusteringIterationStats** iteration_stats,
size_t* size) {
std::vector<ClusteringIterationStats>& v =
reinterpret_cast<Clustering*>(clustering)->iteration_stats;
if (iteration_stats) {
*iteration_stats =
reinterpret_cast<FaissClusteringIterationStats*>(v.data());
}
if (size) {
*size = v.size();
}
}
/// the only mandatory parameters are k and d
int faiss_Clustering_new(FaissClustering** p_clustering, int d, int k) {
try {
Clustering* c = new Clustering(d, k);
*p_clustering = reinterpret_cast<FaissClustering*>(c);
return 0;
}
CATCH_AND_HANDLE
}
int faiss_Clustering_new_with_params(
FaissClustering** p_clustering,
int d,
int k,
const FaissClusteringParameters* cp) {
try {
Clustering* c = new Clustering(d, k, from_faiss_c(cp));
*p_clustering = reinterpret_cast<FaissClustering*>(c);
return 0;
}
CATCH_AND_HANDLE
}
/// Index is used during the assignment stage
int faiss_Clustering_train(
FaissClustering* clustering,
idx_t n,
const float* x,
FaissIndex* index) {
try {
reinterpret_cast<Clustering*>(clustering)
->train(n, x, *reinterpret_cast<Index*>(index));
return 0;
}
CATCH_AND_HANDLE
}
void faiss_Clustering_free(FaissClustering* clustering) {
delete reinterpret_cast<Clustering*>(clustering);
}
int faiss_kmeans_clustering(
size_t d,
size_t n,
size_t k,
const float* x,
float* centroids,
float* q_error) {
try {
float out = faiss::kmeans_clustering(d, n, k, x, centroids);
if (q_error) {
*q_error = out;
}
return 0;
}
CATCH_AND_HANDLE
}
}

View File

@@ -0,0 +1,138 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c -*-
#ifndef FAISS_CLUSTERING_C_H
#define FAISS_CLUSTERING_C_H
#include "Index_c.h"
#include "faiss_c.h"
#ifdef __cplusplus
extern "C" {
#endif
/** Class for the clustering parameters. Can be passed to the
* constructor of the Clustering object.
*/
typedef struct FaissClusteringParameters {
int niter; ///< clustering iterations
int nredo; ///< redo clustering this many times and keep best
int verbose; ///< (bool)
int spherical; ///< (bool) do we want normalized centroids?
int int_centroids; ///< (bool) round centroids coordinates to integer
int update_index; ///< (bool) update index after each iteration?
int frozen_centroids; ///< (bool) use the centroids provided as input and do
///< not change them during iterations
int min_points_per_centroid; ///< otherwise you get a warning
int max_points_per_centroid; ///< to limit size of dataset
int seed; ///< seed for the random number generator
size_t decode_block_size; ///< how many vectors at a time to decode
} FaissClusteringParameters;
/// Sets the ClusteringParameters object with reasonable defaults
void faiss_ClusteringParameters_init(FaissClusteringParameters* params);
/** clustering based on assignment - centroid update iterations
*
* The clustering is based on an Index object that assigns training
* points to the centroids. Therefore, at each iteration the centroids
* are added to the index.
*
* On output, the centroids table is set to the latest version
* of the centroids and they are also added to the index. If the
 * centroids table is not empty on input, it is also used for
* initialization.
*
* To do several clusterings, just call train() several times on
* different training sets, clearing the centroid table in between.
*/
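/* Illustrative usage sketch (editor's addition, not part of the original
 * header). It only uses functions declared in this C API; `d`, `k`, `n` and a
 * training buffer `x` of size n * d are assumed, and error codes are elided:
 *
 *   FaissClusteringParameters params;
 *   faiss_ClusteringParameters_init(&params);  // start from the defaults
 *   params.niter = 25;
 *
 *   FaissClustering* clus = NULL;
 *   faiss_Clustering_new_with_params(&clus, d, k, &params);
 *
 *   // a flat L2 index acts as the assignment index during training
 *   FaissIndexFlatL2* assign_index = NULL;
 *   faiss_IndexFlatL2_new_with(&assign_index, d);
 *   faiss_Clustering_train(clus, n, x, (FaissIndex*)assign_index);
 *
 *   float* centroids = NULL;
 *   size_t centroids_size = 0;  // will be k * d on return
 *   faiss_Clustering_centroids(clus, &centroids, &centroids_size);
 *
 *   faiss_Clustering_free(clus);
 *   faiss_Index_free((FaissIndex*)assign_index);
 */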
FAISS_DECLARE_CLASS(Clustering)
FAISS_DECLARE_GETTER(Clustering, int, niter)
FAISS_DECLARE_GETTER(Clustering, int, nredo)
FAISS_DECLARE_GETTER(Clustering, int, verbose)
FAISS_DECLARE_GETTER(Clustering, int, spherical)
FAISS_DECLARE_GETTER(Clustering, int, int_centroids)
FAISS_DECLARE_GETTER(Clustering, int, update_index)
FAISS_DECLARE_GETTER(Clustering, int, frozen_centroids)
FAISS_DECLARE_GETTER(Clustering, int, min_points_per_centroid)
FAISS_DECLARE_GETTER(Clustering, int, max_points_per_centroid)
FAISS_DECLARE_GETTER(Clustering, int, seed)
FAISS_DECLARE_GETTER(Clustering, size_t, decode_block_size)
/// getter for d
FAISS_DECLARE_GETTER(Clustering, size_t, d)
/// getter for k
FAISS_DECLARE_GETTER(Clustering, size_t, k)
FAISS_DECLARE_CLASS(ClusteringIterationStats)
FAISS_DECLARE_GETTER(ClusteringIterationStats, float, obj)
FAISS_DECLARE_GETTER(ClusteringIterationStats, double, time)
FAISS_DECLARE_GETTER(ClusteringIterationStats, double, time_search)
FAISS_DECLARE_GETTER(ClusteringIterationStats, double, imbalance_factor)
FAISS_DECLARE_GETTER(ClusteringIterationStats, int, nsplit)
/// getter for centroids (size = k * d)
void faiss_Clustering_centroids(
FaissClustering* clustering,
float** centroids,
size_t* size);
/// getter for iteration stats
void faiss_Clustering_iteration_stats(
FaissClustering* clustering,
FaissClusteringIterationStats** iteration_stats,
size_t* size);
/// the only mandatory parameters are k and d
int faiss_Clustering_new(FaissClustering** p_clustering, int d, int k);
int faiss_Clustering_new_with_params(
FaissClustering** p_clustering,
int d,
int k,
const FaissClusteringParameters* cp);
int faiss_Clustering_train(
FaissClustering* clustering,
idx_t n,
const float* x,
FaissIndex* index);
void faiss_Clustering_free(FaissClustering* clustering);
/** simplified interface
*
* @param d dimension of the data
* @param n nb of training vectors
* @param k nb of output centroids
* @param x training set (size n * d)
* @param centroids output centroids (size k * d)
* @param q_error final quantization error
* @return error code
*/
int faiss_kmeans_clustering(
size_t d,
size_t n,
size_t k,
const float* x,
float* centroids,
float* q_error);
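/* Minimal sketch of the simplified interface (editor's illustration; assumes
 * `x` holds n * d training floats and `centroids` has room for k * d floats):
 *
 *   float q_error = 0.0f;
 *   if (faiss_kmeans_clustering(d, n, k, x, centroids, &q_error) != 0) {
 *       // inspect faiss_get_last_error() from error_c.h
 *   }
 */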
#ifdef __cplusplus
}
#endif
#endif

View File

@@ -0,0 +1,104 @@
Faiss C API
===========
Faiss provides a pure C interface, which can subsequently be used either in pure C programs or to produce bindings for programming languages with Foreign Function Interface (FFI) support. Although this is not required for the Python interface, some other programming languages (e.g. Rust and Julia) do not have SWIG support.
Compilation instructions
------------------------
The full contents of the pure C API are in the ["c_api"](c_api/) folder.
Please be sure to follow the instructions on [building the main C++ library](../INSTALL.md#step-1-compiling-the-c-faiss) first.
Add `-DFAISS_ENABLE_C_API=ON` to the cmake command.
`make -C build`
This builds the dynamic library "faiss_c", containing the full implementation of Faiss and the necessary wrappers for the C interface. It does not depend on libfaiss.a or the C++ standard library.
To build the example program, you should run `make -C build example_c` at the top level of
the faiss repo. The example program will be in `build/c_api/example_c`.
Using the API
-------------
The C API is composed of:
- A set of C header files comprising the main Faiss interfaces, converted for use in C. Each file follows the format `«name»_c.h`, where `«name»` is the respective name from the C++ API. For example, the file [Index_c.h](./Index_c.h) file corresponds to the base `Index` API. Functions are declared with the `faiss_` prefix (e.g. `faiss_IndexFlat_new`), whereas new types have the `Faiss` prefix (e.g. `FaissIndex`, `FaissMetricType`, ...).
- A dynamic library, compiled from the sources in the same folder, containing the implementation of the library and the wrapper functions.
The index factory is available via the `faiss_index_factory` function in `AutoTune_c.h`:
```c
FaissIndex* index = NULL;
int c = faiss_index_factory(&index, 64, "Flat", METRIC_L2);
if (c) {
// operation failed
}
```
Most operations that you would find as member functions are available with the format `faiss_«classname»_«member»`.
```c
idx_t ntotal = faiss_Index_ntotal(index);
```
Since this is C, the index needs to be freed manually in the end:
```c
faiss_Index_free(index);
```
Error handling is done by examining the error code returned by operations with recoverable errors.
The code identifies the type of exception that arose in the implementation. Fetching the
corresponding error message can be done by calling the function `faiss_get_last_error()` from
`error_c.h`. Getter functions and `free` functions do not return an error code.
```c
int c = faiss_Index_add(index, nb, xb);
if (c) {
printf("%s", faiss_get_last_error());
exit(-1);
}
```
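Putting these pieces together, the sketch below shows a minimal end-to-end program. It is an illustration rather than the bundled example: `faiss_index_factory` and `faiss_Index_add` appear above, while `faiss_Index_search` is assumed from `Index_c.h`, and the installed header layout `faiss/c_api/...` is assumed as well.
```c
#include <stdio.h>
#include <stdlib.h>

#include <faiss/c_api/AutoTune_c.h>
#include <faiss/c_api/Index_c.h>
#include <faiss/c_api/error_c.h>

/* abort on any recoverable error, printing its message */
#define CHECK(rc)                                   \
    if (rc) {                                       \
        printf("%s\n", faiss_get_last_error());     \
        exit(-1);                                   \
    }

int main(void) {
    const idx_t d = 64, nb = 1000, k = 5;
    float* xb = malloc(sizeof(float) * d * nb);
    for (idx_t i = 0; i < d * nb; i++) {
        xb[i] = (float)rand() / RAND_MAX;
    }

    FaissIndex* index = NULL;
    CHECK(faiss_index_factory(&index, d, "Flat", METRIC_L2));
    CHECK(faiss_Index_add(index, nb, xb));

    /* search the index with its own first vector as the query */
    float distances[5];
    idx_t labels[5];
    CHECK(faiss_Index_search(index, 1, xb, k, distances, labels));
    printf("nearest id: %lld (expected 0)\n", (long long)labels[0]);

    faiss_Index_free(index);
    free(xb);
    return 0;
}
```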
An example is included; it is not part of the default build target and can be built separately:
`make -C build example_c`
Building with GPU support
-------------------------
For GPU support, a separate dynamic library in the "c_api/gpu" directory needs to be built.
`make`
The "gpufaiss_c" dynamic library contains the GPU and CPU implementations of Faiss, which means that
it can be used in place of "faiss_c". The same library will dynamically link with the CUDA runtime
and cuBLAS.
Using the GPU with the C API
----------------------------
A standard GPU resources object can be obtained by the name `FaissStandardGpuResources`:
```c
FaissStandardGpuResources* gpu_res = NULL;
int c = faiss_StandardGpuResources_new(&gpu_res);
if (c) {
printf("%s", faiss_get_last_error());
exit(-1);
}
```
Similarly to the C++ API, a CPU index can be converted to a GPU index:
```c
FaissIndex* cpu_index = NULL;
int c = faiss_index_factory(&cpu_index, d, "Flat", METRIC_L2);
if (c) { /* ... */ }
FaissGpuIndex* gpu_index = NULL;
c = faiss_index_cpu_to_gpu(gpu_res, 0, cpu_index, &gpu_index);
if (c) { /* ... */ }
```
A more complete example is available by the name `bin/example_gpu_c`.

View File

@@ -0,0 +1,142 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#include "IndexBinary_c.h"
#include <faiss/IndexBinary.h>
#include "macros_impl.h"
extern "C" {
DEFINE_DESTRUCTOR(IndexBinary)
DEFINE_GETTER(IndexBinary, int, d)
DEFINE_GETTER(IndexBinary, int, is_trained)
DEFINE_GETTER(IndexBinary, idx_t, ntotal)
DEFINE_GETTER(IndexBinary, FaissMetricType, metric_type)
DEFINE_GETTER(IndexBinary, int, verbose);
DEFINE_SETTER(IndexBinary, int, verbose);
int faiss_IndexBinary_train(
FaissIndexBinary* index,
idx_t n,
const uint8_t* x) {
try {
reinterpret_cast<faiss::IndexBinary*>(index)->train(n, x);
}
CATCH_AND_HANDLE
}
int faiss_IndexBinary_add(FaissIndexBinary* index, idx_t n, const uint8_t* x) {
try {
reinterpret_cast<faiss::IndexBinary*>(index)->add(n, x);
}
CATCH_AND_HANDLE
}
int faiss_IndexBinary_add_with_ids(
FaissIndexBinary* index,
idx_t n,
const uint8_t* x,
const idx_t* xids) {
try {
reinterpret_cast<faiss::IndexBinary*>(index)->add_with_ids(n, x, xids);
}
CATCH_AND_HANDLE
}
int faiss_IndexBinary_search(
const FaissIndexBinary* index,
idx_t n,
const uint8_t* x,
idx_t k,
int32_t* distances,
idx_t* labels) {
try {
reinterpret_cast<const faiss::IndexBinary*>(index)->search(
n, x, k, distances, labels);
}
CATCH_AND_HANDLE
}
int faiss_IndexBinary_range_search(
const FaissIndexBinary* index,
idx_t n,
const uint8_t* x,
int radius,
FaissRangeSearchResult* result) {
try {
reinterpret_cast<const faiss::IndexBinary*>(index)->range_search(
n,
x,
radius,
reinterpret_cast<faiss::RangeSearchResult*>(result));
}
CATCH_AND_HANDLE
}
int faiss_IndexBinary_assign(
FaissIndexBinary* index,
idx_t n,
const uint8_t* x,
idx_t* labels,
idx_t k) {
try {
reinterpret_cast<faiss::IndexBinary*>(index)->assign(n, x, labels, k);
}
CATCH_AND_HANDLE
}
int faiss_IndexBinary_reset(FaissIndexBinary* index) {
try {
reinterpret_cast<faiss::IndexBinary*>(index)->reset();
}
CATCH_AND_HANDLE
}
int faiss_IndexBinary_remove_ids(
FaissIndexBinary* index,
const FaissIDSelector* sel,
size_t* n_removed) {
try {
size_t n{reinterpret_cast<faiss::IndexBinary*>(index)->remove_ids(
*reinterpret_cast<const faiss::IDSelector*>(sel))};
if (n_removed) {
*n_removed = n;
}
}
CATCH_AND_HANDLE
}
int faiss_IndexBinary_reconstruct(
const FaissIndexBinary* index,
idx_t key,
uint8_t* recons) {
try {
reinterpret_cast<const faiss::IndexBinary*>(index)->reconstruct(
key, recons);
}
CATCH_AND_HANDLE
}
int faiss_IndexBinary_reconstruct_n(
const FaissIndexBinary* index,
idx_t i0,
idx_t ni,
uint8_t* recons) {
try {
reinterpret_cast<const faiss::IndexBinary*>(index)->reconstruct_n(
i0, ni, recons);
}
CATCH_AND_HANDLE
}
}

View File

@@ -0,0 +1,169 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c -*-
#ifndef FAISS_INDEX_BINARY_C_H
#define FAISS_INDEX_BINARY_C_H
#include <stddef.h>
#include "Index_c.h"
#include "faiss_c.h"
#ifdef __cplusplus
extern "C" {
#endif
// forward declaration required here
FAISS_DECLARE_CLASS(RangeSearchResult)
// typedef struct FaissRangeSearchResult_H FaissRangeSearchResult;
typedef struct FaissIDSelector_H FaissIDSelector;
/// Opaque type for referencing to a binary index object
FAISS_DECLARE_CLASS(IndexBinary)
FAISS_DECLARE_DESTRUCTOR(IndexBinary)
/// Getter for d
FAISS_DECLARE_GETTER(IndexBinary, int, d)
/// Getter for is_trained
FAISS_DECLARE_GETTER(IndexBinary, int, is_trained)
/// Getter for ntotal
FAISS_DECLARE_GETTER(IndexBinary, idx_t, ntotal)
/// Getter for metric_type
FAISS_DECLARE_GETTER(IndexBinary, FaissMetricType, metric_type)
FAISS_DECLARE_GETTER_SETTER(IndexBinary, int, verbose)
/** Perform training on a representative set of vectors
*
* @param index opaque pointer to index object
* @param n nb of training vectors
* @param x training vectors, size n * d
*/
int faiss_IndexBinary_train(FaissIndexBinary* index, idx_t n, const uint8_t* x);
/** Add n vectors of dimension d to the index.
*
* Vectors are implicitly assigned labels ntotal .. ntotal + n - 1
* This function slices the input vectors in chunks smaller than
* blocksize_add and calls add_core.
* @param index opaque pointer to index object
* @param x input matrix, size n * d
*/
int faiss_IndexBinary_add(FaissIndexBinary* index, idx_t n, const uint8_t* x);
/** Same as add, but stores xids instead of sequential ids.
*
* The default implementation fails with an assertion, as it is
* not supported by all indexes.
*
* @param index opaque pointer to index object
* @param xids if non-null, ids to store for the vectors (size n)
*/
int faiss_IndexBinary_add_with_ids(
FaissIndexBinary* index,
idx_t n,
const uint8_t* x,
const idx_t* xids);
/** query n vectors of dimension d to the index.
*
* return at most k vectors. If there are not enough results for a
* query, the result array is padded with -1s.
*
* @param index opaque pointer to index object
* @param x input vectors to search, size n * d
* @param labels output labels of the NNs, size n*k
* @param distances output pairwise distances, size n*k
*/
int faiss_IndexBinary_search(
const FaissIndexBinary* index,
idx_t n,
const uint8_t* x,
idx_t k,
int32_t* distances,
idx_t* labels);
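/* Illustrative call (editor's sketch; assumes an already-built
 * FaissIndexBinary* index whose vectors are d bits, i.e. d/8 bytes, and a
 * query code `xq` of the same width):
 *
 *   enum { K = 4 };
 *   int32_t distances[K];  // Hamming distances
 *   idx_t labels[K];       // padded with -1 when fewer than K hits exist
 *   int c = faiss_IndexBinary_search(index, 1, xq, K, distances, labels);
 */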
/** query n vectors of dimension d to the index.
*
* return all vectors with distance < radius. Note that many
* indexes do not implement the range_search (only the k-NN search
* is mandatory).
*
* @param index opaque pointer to index object
* @param x input vectors to search, size n * d
* @param radius search radius
* @param result result table
*/
int faiss_IndexBinary_range_search(
const FaissIndexBinary* index,
idx_t n,
const uint8_t* x,
int radius,
FaissRangeSearchResult* result);
/** return the indexes of the k vectors closest to the query x.
*
 * This function is identical to search but only returns the labels of the neighbors.
* @param index opaque pointer to index object
* @param x input vectors to search, size n * d
* @param labels output labels of the NNs, size n*k
*/
int faiss_IndexBinary_assign(
FaissIndexBinary* index,
idx_t n,
const uint8_t* x,
idx_t* labels,
idx_t k);
/** removes all elements from the database.
* @param index opaque pointer to index object
*/
int faiss_IndexBinary_reset(FaissIndexBinary* index);
/** removes IDs from the index. Not supported by all indexes
* @param index opaque pointer to index object
 * @param n_removed output for the number of IDs removed
*/
int faiss_IndexBinary_remove_ids(
FaissIndexBinary* index,
const FaissIDSelector* sel,
size_t* n_removed);
/** Reconstruct a stored vector (or an approximation if lossy coding)
*
* this function may not be defined for some indexes
* @param index opaque pointer to index object
* @param key id of the vector to reconstruct
* @param recons reconstructed vector (size d)
*/
int faiss_IndexBinary_reconstruct(
const FaissIndexBinary* index,
idx_t key,
uint8_t* recons);
/** Reconstruct vectors i0 to i0 + ni - 1
*
* this function may not be defined for some indexes
* @param index opaque pointer to index object
* @param recons reconstructed vector (size ni * d)
*/
int faiss_IndexBinary_reconstruct_n(
const FaissIndexBinary* index,
idx_t i0,
idx_t ni,
uint8_t* recons);
#ifdef __cplusplus
}
#endif
#endif

View File

@@ -0,0 +1,165 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#include "IndexFlat_c.h"
#include <faiss/IndexFlat.h>
#include <faiss/IndexRefine.h>
#include "macros_impl.h"
extern "C" {
using faiss::Index;
using faiss::IndexFlat;
using faiss::IndexFlat1D;
using faiss::IndexFlatIP;
using faiss::IndexFlatL2;
using faiss::IndexRefineFlat;
DEFINE_DESTRUCTOR(IndexFlat)
DEFINE_INDEX_DOWNCAST(IndexFlat)
int faiss_IndexFlat_new(FaissIndexFlat** p_index) {
try {
*p_index = reinterpret_cast<FaissIndexFlat*>(new IndexFlat());
return 0;
}
CATCH_AND_HANDLE
}
int faiss_IndexFlat_new_with(
FaissIndexFlat** p_index,
idx_t d,
FaissMetricType metric) {
try {
IndexFlat* index =
new IndexFlat(d, static_cast<faiss::MetricType>(metric));
*p_index = reinterpret_cast<FaissIndexFlat*>(index);
return 0;
}
CATCH_AND_HANDLE
}
void faiss_IndexFlat_xb(FaissIndexFlat* index, float** p_xb, size_t* p_size) {
IndexFlat* indexf = reinterpret_cast<IndexFlat*>(index);
*p_xb = indexf->get_xb();
if (p_size) {
*p_size = indexf->codes.size() / sizeof(float);
}
}
int faiss_IndexFlat_compute_distance_subset(
FaissIndex* index,
idx_t n,
const float* x,
idx_t k,
float* distances,
const idx_t* labels) {
try {
reinterpret_cast<IndexFlat*>(index)->compute_distance_subset(
n, x, k, distances, labels);
return 0;
}
CATCH_AND_HANDLE
}
DEFINE_DESTRUCTOR(IndexFlatIP)
DEFINE_INDEX_DOWNCAST(IndexFlatIP)
int faiss_IndexFlatIP_new(FaissIndexFlatIP** p_index) {
try {
IndexFlatIP* index = new IndexFlatIP();
*p_index = reinterpret_cast<FaissIndexFlatIP*>(index);
return 0;
}
CATCH_AND_HANDLE
}
int faiss_IndexFlatIP_new_with(FaissIndexFlatIP** p_index, idx_t d) {
try {
IndexFlatIP* index = new IndexFlatIP(d);
*p_index = reinterpret_cast<FaissIndexFlatIP*>(index);
return 0;
}
CATCH_AND_HANDLE
}
DEFINE_DESTRUCTOR(IndexFlatL2)
DEFINE_INDEX_DOWNCAST(IndexFlatL2)
int faiss_IndexFlatL2_new(FaissIndexFlatL2** p_index) {
try {
IndexFlatL2* index = new IndexFlatL2();
*p_index = reinterpret_cast<FaissIndexFlatL2*>(index);
return 0;
}
CATCH_AND_HANDLE
}
int faiss_IndexFlatL2_new_with(FaissIndexFlatL2** p_index, idx_t d) {
try {
IndexFlatL2* index = new IndexFlatL2(d);
*p_index = reinterpret_cast<FaissIndexFlatL2*>(index);
return 0;
}
CATCH_AND_HANDLE
}
int faiss_IndexRefineFlat_new(
FaissIndexRefineFlat** p_index,
FaissIndex* base_index) {
try {
IndexRefineFlat* index = new IndexRefineFlat(
reinterpret_cast<faiss::Index*>(base_index));
*p_index = reinterpret_cast<FaissIndexRefineFlat*>(index);
return 0;
}
CATCH_AND_HANDLE
}
DEFINE_DESTRUCTOR(IndexRefineFlat)
DEFINE_INDEX_DOWNCAST(IndexRefineFlat)
DEFINE_GETTER(IndexRefineFlat, int, own_fields)
DEFINE_SETTER(IndexRefineFlat, int, own_fields)
DEFINE_GETTER(IndexRefineFlat, float, k_factor)
DEFINE_SETTER(IndexRefineFlat, float, k_factor)
DEFINE_DESTRUCTOR(IndexFlat1D)
DEFINE_INDEX_DOWNCAST(IndexFlat1D)
int faiss_IndexFlat1D_new(FaissIndexFlat1D** p_index) {
try {
IndexFlat1D* index = new IndexFlat1D();
*p_index = reinterpret_cast<FaissIndexFlat1D*>(index);
return 0;
}
CATCH_AND_HANDLE
}
int faiss_IndexFlat1D_new_with(
FaissIndexFlat1D** p_index,
int continuous_update) {
try {
IndexFlat1D* index =
new IndexFlat1D(static_cast<bool>(continuous_update));
*p_index = reinterpret_cast<FaissIndexFlat1D*>(index);
return 0;
}
CATCH_AND_HANDLE
}
int faiss_IndexFlat1D_update_permutation(FaissIndexFlat1D* index) {
try {
reinterpret_cast<IndexFlat1D*>(index)->update_permutation();
return 0;
}
CATCH_AND_HANDLE
}
}

View File

@@ -0,0 +1,129 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c -*-
#ifndef FAISS_INDEX_FLAT_C_H
#define FAISS_INDEX_FLAT_C_H
#include "Index_c.h"
#include "faiss_c.h"
#ifdef __cplusplus
extern "C" {
#endif
// forward declaration
typedef enum FaissMetricType FaissMetricType;
/** Opaque type for IndexFlat */
FAISS_DECLARE_CLASS_INHERITED(IndexFlat, Index)
int faiss_IndexFlat_new(FaissIndexFlat** p_index);
int faiss_IndexFlat_new_with(
FaissIndexFlat** p_index,
idx_t d,
FaissMetricType metric);
/** get a pointer to the index's internal data (the `xb` field). The outputs
* become invalid after any data addition or removal operation.
*
* @param index opaque pointer to index object
* @param p_xb output, the pointer to the beginning of `xb`.
 * @param p_size output, the current size of `xb` in number of float values.
*/
void faiss_IndexFlat_xb(FaissIndexFlat* index, float** p_xb, size_t* p_size);
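/* Editor's sketch of reading the raw storage (assumes a populated
 * FaissIndexFlat* index of dimension d):
 *
 *   float* xb = NULL;
 *   size_t nfloats = 0;
 *   faiss_IndexFlat_xb(index, &xb, &nfloats);  // nfloats == ntotal * d
 *   float first_component = xb[0];
 *   // NOTE: xb is invalidated by any later add or remove on the index
 */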
/** attempt a dynamic cast to a flat index, thus checking
 * whether the underlying index type is `IndexFlat`.
*
* @param index opaque pointer to index object
* @return the same pointer if the index is a flat index, NULL otherwise
*/
FAISS_DECLARE_INDEX_DOWNCAST(IndexFlat)
FAISS_DECLARE_DESTRUCTOR(IndexFlat)
/** compute distance with a subset of vectors
*
* @param index opaque pointer to index object
* @param x query vectors, size n * d
* @param labels indices of the vectors that should be compared
* for each query vector, size n * k
* @param distances
* corresponding output distances, size n * k
*/
int faiss_IndexFlat_compute_distance_subset(
FaissIndex* index,
idx_t n,
const float* x,
idx_t k,
float* distances,
const idx_t* labels);
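/* Editor's sketch (assumes a flat index held as FaissIndex* and a query
 * buffer `x` of size n * d). For each of the n queries, pass k candidate ids
 * in `labels` and read back the corresponding distances:
 *
 *   idx_t labels[2] = {12, 34};  // n = 1 query, k = 2 candidates
 *   float distances[2];
 *   faiss_IndexFlat_compute_distance_subset(index, 1, x, 2, distances, labels);
 */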
/** Opaque type for IndexFlatIP */
FAISS_DECLARE_CLASS_INHERITED(IndexFlatIP, Index)
FAISS_DECLARE_INDEX_DOWNCAST(IndexFlatIP)
FAISS_DECLARE_DESTRUCTOR(IndexFlatIP)
int faiss_IndexFlatIP_new(FaissIndexFlatIP** p_index);
int faiss_IndexFlatIP_new_with(FaissIndexFlatIP** p_index, idx_t d);
/** Opaque type for IndexFlatL2 */
FAISS_DECLARE_CLASS_INHERITED(IndexFlatL2, Index)
FAISS_DECLARE_INDEX_DOWNCAST(IndexFlatL2)
FAISS_DECLARE_DESTRUCTOR(IndexFlatL2)
int faiss_IndexFlatL2_new(FaissIndexFlatL2** p_index);
int faiss_IndexFlatL2_new_with(FaissIndexFlatL2** p_index, idx_t d);
/** Opaque type for IndexRefineFlat
*
* Index that queries in a base_index (a fast one) and refines the
* results with an exact search, hopefully improving the results.
*/
FAISS_DECLARE_CLASS_INHERITED(IndexRefineFlat, Index)
int faiss_IndexRefineFlat_new(
FaissIndexRefineFlat** p_index,
FaissIndex* base_index);
FAISS_DECLARE_DESTRUCTOR(IndexRefineFlat)
FAISS_DECLARE_INDEX_DOWNCAST(IndexRefineFlat)
FAISS_DECLARE_GETTER_SETTER(IndexRefineFlat, int, own_fields)
/// factor between k requested in search and the k requested from
/// the base_index (should be >= 1)
FAISS_DECLARE_GETTER_SETTER(IndexRefineFlat, float, k_factor)
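/* Editor's sketch of the refinement wrapper (assumes `base_index` is an
 * already-trained FaissIndex*; the setter name follows the
 * FAISS_DECLARE_GETTER_SETTER convention and error codes are elided):
 *
 *   FaissIndexRefineFlat* refine = NULL;
 *   faiss_IndexRefineFlat_new(&refine, base_index);
 *   faiss_IndexRefineFlat_set_k_factor(refine, 4.0f);  // re-rank 4*k candidates
 *   // searching through (FaissIndex*)refine refines base results exactly
 */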
/** Opaque type for IndexFlat1D
*
* optimized version for 1D "vectors"
*/
FAISS_DECLARE_CLASS_INHERITED(IndexFlat1D, Index)
FAISS_DECLARE_INDEX_DOWNCAST(IndexFlat1D)
FAISS_DECLARE_DESTRUCTOR(IndexFlat1D)
int faiss_IndexFlat1D_new(FaissIndexFlat1D** p_index);
int faiss_IndexFlat1D_new_with(
FaissIndexFlat1D** p_index,
int continuous_update);
int faiss_IndexFlat1D_update_permutation(FaissIndexFlat1D* index);
#ifdef __cplusplus
}
#endif
#endif

View File

@@ -0,0 +1,100 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#include "IndexIVFFlat_c.h"
#include <faiss/IndexIVFFlat.h>
#include "Clustering_c.h"
#include "Index_c.h"
#include "macros_impl.h"
using faiss::Index;
using faiss::IndexIVFFlat;
using faiss::MetricType;
DEFINE_DESTRUCTOR(IndexIVFFlat)
DEFINE_INDEX_DOWNCAST(IndexIVFFlat)
/// number of possible key values
DEFINE_GETTER(IndexIVFFlat, size_t, nlist)
/// number of probes at query time
DEFINE_GETTER(IndexIVFFlat, size_t, nprobe)
DEFINE_SETTER(IndexIVFFlat, size_t, nprobe)
/// quantizer that maps vectors to inverted lists
DEFINE_GETTER_PERMISSIVE(IndexIVFFlat, FaissIndex*, quantizer)
/**
* = 0: use the quantizer as index in a kmeans training
* = 1: just pass on the training set to the train() of the quantizer
* = 2: kmeans training on a flat index + add the centroids to the quantizer
*/
DEFINE_GETTER(IndexIVFFlat, char, quantizer_trains_alone)
/// whether object owns the quantizer
DEFINE_GETTER(IndexIVFFlat, int, own_fields)
DEFINE_SETTER(IndexIVFFlat, int, own_fields)
int faiss_IndexIVFFlat_new(FaissIndexIVFFlat** p_index) {
try {
*p_index = reinterpret_cast<FaissIndexIVFFlat*>(new IndexIVFFlat());
}
CATCH_AND_HANDLE
}
int faiss_IndexIVFFlat_new_with(
FaissIndexIVFFlat** p_index,
FaissIndex* quantizer,
size_t d,
size_t nlist) {
try {
auto q = reinterpret_cast<Index*>(quantizer);
*p_index = reinterpret_cast<FaissIndexIVFFlat*>(
new IndexIVFFlat(q, d, nlist));
}
CATCH_AND_HANDLE
}
int faiss_IndexIVFFlat_new_with_metric(
FaissIndexIVFFlat** p_index,
FaissIndex* quantizer,
size_t d,
size_t nlist,
FaissMetricType metric) {
try {
auto q = reinterpret_cast<Index*>(quantizer);
auto m = static_cast<MetricType>(metric);
*p_index = reinterpret_cast<FaissIndexIVFFlat*>(
new IndexIVFFlat(q, d, nlist, m));
}
CATCH_AND_HANDLE
}
int faiss_IndexIVFFlat_add_core(
FaissIndexIVFFlat* index,
idx_t n,
const float* x,
const idx_t* xids,
const int64_t* precomputed_idx) {
try {
reinterpret_cast<IndexIVFFlat*>(index)->add_core(
n, x, xids, precomputed_idx);
}
CATCH_AND_HANDLE
}
int faiss_IndexIVFFlat_update_vectors(
FaissIndexIVFFlat* index,
int nv,
idx_t* idx,
const float* v) {
try {
reinterpret_cast<IndexIVFFlat*>(index)->update_vectors(nv, idx, v);
}
CATCH_AND_HANDLE
}

Some files were not shown because too many files have changed in this diff.