Initial commit

commit 46f6cc100b
Author: yichuan520030910320
Date:   2025-06-30 09:05:05 +00:00

1231 changed files with 278432 additions and 0 deletions

@@ -0,0 +1,6 @@
# The Torch contrib

This contrib directory contains a few PyTorch routines that
are useful for similarity search. They do not necessarily depend on Faiss.
The code is designed to work with both CPU and GPU tensors.

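A minimal usage sketch (not part of this commit's diff): importing faiss.contrib.torch_utils patches the standard Faiss index methods so they accept torch tensors directly, on CPU or GPU. The index type and sizes below are illustrative.

    import torch
    import faiss
    import faiss.contrib.torch_utils  # patches Faiss indexes to accept torch tensors

    xb = torch.rand(1000, 64)   # database vectors (illustrative sizes)
    xq = torch.rand(10, 64)     # query vectors

    index = faiss.IndexFlatL2(64)
    index.add(xb)               # torch tensor passed directly to Faiss
    D, I = index.search(xq, 5)  # distances and ids come back as torch tensors
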
@@ -0,0 +1,60 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
"""
This contrib module contains Pytorch code for k-means clustering
"""
import faiss
import faiss.contrib.torch_utils
import torch
# the kmeans can produce both torch and numpy centroids
from faiss.contrib.clustering import kmeans
class DatasetAssign:
"""Wrapper for a tensor that offers a function to assign the vectors
to centroids. All other implementations offer the same interface"""
def __init__(self, x):
self.x = x
def count(self):
return self.x.shape[0]
def dim(self):
return self.x.shape[1]
def get_subset(self, indices):
return self.x[indices]
def perform_search(self, centroids):
return faiss.knn(self.x, centroids, 1)
def assign_to(self, centroids, weights=None):
D, I = self.perform_search(centroids)
I = I.ravel()
D = D.ravel()
nc, d = centroids.shape
sum_per_centroid = torch.zeros_like(centroids)
if weights is None:
sum_per_centroid.index_add_(0, I, self.x)
else:
sum_per_centroid.index_add_(0, I, self.x * weights[:, None])
# the indices are still in numpy.
return I.cpu().numpy(), D, sum_per_centroid
class DatasetAssignGPU(DatasetAssign):
def __init__(self, res, x):
DatasetAssign.__init__(self, x)
self.res = res
def perform_search(self, centroids):
return faiss.knn_gpu(self.res, self.x, centroids, 1)
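
A hypothetical usage sketch (not part of the diff), assuming this module is importable as faiss.contrib.torch.clustering, the path the quantization module below uses:

    import torch
    from faiss.contrib.torch.clustering import DatasetAssign, kmeans

    x = torch.rand(10000, 32)      # data to cluster (illustrative sizes)
    data = DatasetAssign(x)
    centroids = kmeans(256, data)  # 256 centroids; torch in, torch out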

@@ -0,0 +1,96 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
"""
This contrib module contains Pytorch code for quantization.
"""
import torch
import faiss
import math
from faiss.contrib.torch import clustering
# the kmeans can produce both torch and numpy centroids
class Quantizer:
def __init__(self, d, code_size):
"""
d: dimension of vectors
code_size: nb of bytes of the code (per vector)
"""
self.d = d
self.code_size = code_size
def train(self, x):
"""
takes a n-by-d array and peforms training
"""
pass
def encode(self, x):
"""
takes a n-by-d float array, encodes to an n-by-code_size uint8 array
"""
pass
def decode(self, codes):
"""
takes a n-by-code_size uint8 array, returns a n-by-d array
"""
pass
class VectorQuantizer(Quantizer):
def __init__(self, d, k):
code_size = int(math.ceil(torch.log2(k) / 8))
Quantizer.__init__(d, code_size)
self.k = k
def train(self, x):
pass
class ProductQuantizer(Quantizer):
def __init__(self, d, M, nbits):
""" M: number of subvectors, d%M == 0
nbits: number of bits that each vector is encoded into
"""
assert d % M == 0
assert nbits == 8 # todo: implement other nbits values
code_size = int(math.ceil(M * nbits / 8))
Quantizer.__init__(self, d, code_size)
self.M = M
self.nbits = nbits
self.code_size = code_size
def train(self, x):
nc = 2 ** self.nbits
sd = self.d // self.M
dev = x.device
dtype = x.dtype
self.codebook = torch.zeros((self.M, nc, sd), device=dev, dtype=dtype)
for m in range(self.M):
xsub = x[:, m * self.d // self.M: (m + 1) * self.d // self.M]
data = clustering.DatasetAssign(xsub.contiguous())
self.codebook[m] = clustering.kmeans(2 ** self.nbits, data)
def encode(self, x):
codes = torch.zeros((x.shape[0], self.code_size), dtype=torch.uint8)
for m in range(self.M):
xsub = x[:, m * self.d // self.M:(m + 1) * self.d // self.M]
_, I = faiss.knn(xsub.contiguous(), self.codebook[m], 1)
codes[:, m] = I.ravel()
return codes
def decode(self, codes):
idxs = [codes[:, m].long() for m in range(self.M)]
vectors = [self.codebook[m, idxs[m], :] for m in range(self.M)]
stacked_vectors = torch.stack(vectors, dim=1)
cbd = self.codebook.shape[-1]
x_rec = stacked_vectors.reshape(-1, cbd * self.M)
return x_rec
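
A hypothetical round-trip sketch (not part of the diff), assuming this file is importable as faiss.contrib.torch.quantization; the sizes are illustrative:

    import torch
    from faiss.contrib.torch.quantization import ProductQuantizer

    x = torch.rand(10000, 64)
    pq = ProductQuantizer(64, 8, 8)  # 8 sub-quantizers of 8 bits -> 8-byte codes
    pq.train(x)
    codes = pq.encode(x)             # (10000, 8) uint8 codes
    x_rec = pq.decode(codes)         # (10000, 64) approximation of x
    mse = ((x - x_rec) ** 2).mean()  # reconstruction error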