Initial commit

2025-06-30 09:05:05 +00:00
commit 46f6cc100b
1231 changed files with 278432 additions and 0 deletions
--- a/packages/leann-backend-hnsw/third_party/faiss/tutorial/python/1-Flat.py
+++ b/packages/leann-backend-hnsw/third_party/faiss/tutorial/python/1-Flat.py
@@ -0,0 +1,29 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import numpy as np
+
+d = 64                           # dimension of every vector
+nb = 100000                      # database size (number of indexed vectors)
+nq = 10000                       # number of query vectors
+np.random.seed(1234)             # fixed seed so the random data is reproducible
+xb = np.random.random((nb, d)).astype('float32')  # (nb, d) float32 database
+xb[:, 0] += np.arange(nb) / 1000.  # offset column 0 by row_number / 1000
+xq = np.random.random((nq, d)).astype('float32')  # (nq, d) float32 queries
+xq[:, 0] += np.arange(nq) / 1000.  # same offset applied to the queries
+
+import faiss                   # make faiss available
+index = faiss.IndexFlatL2(d)   # build the index
+print(index.is_trained)        # show the trained flag before anything is added
+index.add(xb)                  # add vectors to the index
+print(index.ntotal)            # total reported by the index after the add
+
+k = 4                          # we want to see 4 nearest neighbors
+D, I = index.search(xb[:5], k) # sanity check: query with the first 5 database vectors
+print(I)                       # neighbors found for those 5 vectors
+print(D)                       # matching D output (presumably distances; see faiss docs)
+D, I = index.search(xq, k)     # actual search
+print(I[:5])                   # neighbors of the 5 first queries
+print(I[-5:])                  # neighbors of the 5 last queries
--- a/packages/leann-backend-hnsw/third_party/faiss/tutorial/python/2-IVFFlat.py
+++ b/packages/leann-backend-hnsw/third_party/faiss/tutorial/python/2-IVFFlat.py
@@ -0,0 +1,34 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import numpy as np
+
+d = 64                           # dimension of every vector
+nb = 100000                      # database size (number of indexed vectors)
+nq = 10000                       # number of query vectors
+np.random.seed(1234)             # fixed seed so the random data is reproducible
+xb = np.random.random((nb, d)).astype('float32')  # (nb, d) float32 database
+xb[:, 0] += np.arange(nb) / 1000.  # offset column 0 by row_number / 1000
+xq = np.random.random((nq, d)).astype('float32')  # (nq, d) float32 queries
+xq[:, 0] += np.arange(nq) / 1000.  # same offset applied to the queries
+
+import faiss
+
+nlist = 100                       # nlist argument handed to IndexIVFFlat below
+k = 4                             # number of nearest neighbors requested per query
+quantizer = faiss.IndexFlatL2(d)  # the other index
+index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_L2)
+# METRIC_L2 is passed explicitly; NOTE(review): confirm the constructor's default metric
+
+assert not index.is_trained       # this index needs training before use
+index.train(xb)                   # train on the database vectors themselves
+assert index.is_trained
+
+index.add(xb)                  # add may be a bit slower as well
+D, I = index.search(xq, k)     # actual search
+print(I[-5:])                  # neighbors of the 5 last queries
+index.nprobe = 10              # default nprobe is 1, try a few more
+D, I = index.search(xq, k)     # same search again with the larger nprobe
+print(I[-5:])                  # neighbors of the 5 last queries
--- a/packages/leann-backend-hnsw/third_party/faiss/tutorial/python/3-IVFPQ.py
+++ b/packages/leann-backend-hnsw/third_party/faiss/tutorial/python/3-IVFPQ.py
@@ -0,0 +1,32 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import numpy as np
+
+d = 64                           # dimension of every vector
+nb = 100000                      # database size (number of indexed vectors)
+nq = 10000                       # number of query vectors
+np.random.seed(1234)             # fixed seed so the random data is reproducible
+xb = np.random.random((nb, d)).astype('float32')  # (nb, d) float32 database
+xb[:, 0] += np.arange(nb) / 1000.  # offset column 0 by row_number / 1000
+xq = np.random.random((nq, d)).astype('float32')  # (nq, d) float32 queries
+xq[:, 0] += np.arange(nq) / 1000.  # same offset applied to the queries
+
+import faiss
+
+nlist = 100                       # nlist argument handed to IndexIVFPQ below
+m = 8                             # m argument of IndexIVFPQ (presumably the sub-vector count)
+k = 4                             # number of nearest neighbors requested per query
+quantizer = faiss.IndexFlatL2(d)  # this remains the same
+index = faiss.IndexIVFPQ(quantizer, d, nlist, m, 8)
+                                  # 8 specifies that each sub-vector is encoded as 8 bits
+index.train(xb)                # train on the database vectors themselves
+index.add(xb)                  # then add the same vectors to the index
+D, I = index.search(xb[:5], k) # sanity check: query with the first 5 database vectors
+print(I)
+print(D)
+index.nprobe = 10              # make comparable with experiment above
+D, I = index.search(xq, k)     # search
+print(I[-5:])                  # neighbors of the 5 last queries
--- a/packages/leann-backend-hnsw/third_party/faiss/tutorial/python/4-GPU.py
+++ b/packages/leann-backend-hnsw/third_party/faiss/tutorial/python/4-GPU.py
@@ -0,0 +1,57 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import numpy as np
+
+d = 64                           # dimension of every vector
+nb = 100000                      # database size (number of indexed vectors)
+nq = 10000                       # number of query vectors
+np.random.seed(1234)             # fixed seed so the random data is reproducible
+xb = np.random.random((nb, d)).astype('float32')  # (nb, d) float32 database
+xb[:, 0] += np.arange(nb) / 1000.  # offset column 0 by row_number / 1000
+xq = np.random.random((nq, d)).astype('float32')  # (nq, d) float32 queries
+xq[:, 0] += np.arange(nq) / 1000.  # same offset applied to the queries
+
+import faiss                     # make faiss available
+
+res = faiss.StandardGpuResources()  # use a single GPU
+
+## Using a flat index
+
+index_flat = faiss.IndexFlatL2(d)  # build a flat (CPU) index
+
+# make it a flat GPU index (second argument 0 is the GPU device number)
+gpu_index_flat = faiss.index_cpu_to_gpu(res, 0, index_flat)
+
+gpu_index_flat.add(xb)         # add vectors to the index
+print(gpu_index_flat.ntotal)   # total reported by the index after the add
+
+k = 4                          # we want to see 4 nearest neighbors
+D, I = gpu_index_flat.search(xq, k)  # actual search
+print(I[:5])                   # neighbors of the 5 first queries
+print(I[-5:])                  # neighbors of the 5 last queries
+
+
+## Using an IVF index
+
+nlist = 100                       # nlist argument handed to IndexIVFFlat below
+quantizer = faiss.IndexFlatL2(d)  # the other index
+index_ivf = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_L2)
+# METRIC_L2 is passed explicitly; NOTE(review): confirm the constructor's default metric
+
+# make it an IVF GPU index
+gpu_index_ivf = faiss.index_cpu_to_gpu(res, 0, index_ivf)
+
+assert not gpu_index_ivf.is_trained
+gpu_index_ivf.train(xb)        # train the index on the database vectors
+assert gpu_index_ivf.is_trained
+
+gpu_index_ivf.add(xb)          # add vectors to the index
+print(gpu_index_ivf.ntotal)    # total reported by the index after the add
+
+k = 4                          # we want to see 4 nearest neighbors
+D, I = gpu_index_ivf.search(xq, k)  # actual search
+print(I[:5])                   # neighbors of the 5 first queries
+print(I[-5:])                  # neighbors of the 5 last queries
--- a/packages/leann-backend-hnsw/third_party/faiss/tutorial/python/5-Multiple-GPUs.py
+++ b/packages/leann-backend-hnsw/third_party/faiss/tutorial/python/5-Multiple-GPUs.py
@@ -0,0 +1,35 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import numpy as np
+
+d = 64                           # dimension of every vector
+nb = 100000                      # database size (number of indexed vectors)
+nq = 10000                       # number of query vectors
+np.random.seed(1234)             # fixed seed so the random data is reproducible
+xb = np.random.random((nb, d)).astype('float32')  # (nb, d) float32 database
+xb[:, 0] += np.arange(nb) / 1000.  # offset column 0 by row_number / 1000
+xq = np.random.random((nq, d)).astype('float32')  # (nq, d) float32 queries
+xq[:, 0] += np.arange(nq) / 1000.  # same offset applied to the queries
+
+import faiss                     # make faiss available
+
+ngpus = faiss.get_num_gpus()     # GPU count as reported by faiss
+
+print("number of GPUs:", ngpus)
+
+cpu_index = faiss.IndexFlatL2(d)  # flat L2 index built on the CPU first
+
+gpu_index = faiss.index_cpu_to_all_gpus(  # build the index
+    cpu_index
+)
+
+gpu_index.add(xb)              # add vectors to the index
+print(gpu_index.ntotal)        # total reported by the index after the add
+
+k = 4                          # we want to see 4 nearest neighbors
+D, I = gpu_index.search(xq, k) # actual search
+print(I[:5])                   # neighbors of the 5 first queries
+print(I[-5:])                  # neighbors of the 5 last queries
--- a/packages/leann-backend-hnsw/third_party/faiss/tutorial/python/7-PQFastScan.py
+++ b/packages/leann-backend-hnsw/third_party/faiss/tutorial/python/7-PQFastScan.py
@@ -0,0 +1,35 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import faiss
+import numpy as np
+
+d = 64                           # dimension of every vector
+nb = 100000                      # database size
+nq = 10000                       # number of queries
+np.random.seed(1234)             # make reproducible
+xb = np.random.random((nb, d)).astype('float32')    # (nb, d) database vectors
+xb[:, 0] += np.arange(nb) / 1000.  # offset column 0 by row_number / 1000
+xq = np.random.random((nq, d)).astype('float32')    # (nq, d) query vectors
+xq[:, 0] += np.arange(nq) / 1000.  # same offset applied to the queries
+
+m = 8   # number of sub-vectors each vector is split into
+k = 4   # number of nearest neighbors returned per query
+n_bit = 4   # each sub-vector is encoded on 4 bits
+bbs = 32    # build block size ( bbs % 32 == 0 ) for PQ
+index = faiss.IndexPQFastScan(d, m, n_bit, faiss.METRIC_L2, bbs)
+# construct the FastScan index
+
+assert not index.is_trained
+index.train(xb)     # train the index on the database vectors
+assert index.is_trained
+
+index.add(xb)
+D, I = index.search(xb[:5], k)  # sanity check: query with the first 5 database vectors
+print(I)
+print(D)
+# no nprobe is set: IndexPQFastScan is not an IVF index, so there are no lists to probe
+D, I = index.search(xq, k)     # search
+print(I[-5:])               # neighbors of the 5 last queries
--- a/packages/leann-backend-hnsw/third_party/faiss/tutorial/python/8-PQFastScanRefine.py
+++ b/packages/leann-backend-hnsw/third_party/faiss/tutorial/python/8-PQFastScanRefine.py
@@ -0,0 +1,38 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import faiss
+import numpy as np
+
+d = 64                           # dimension of every vector
+nb = 100000                      # database size
+nq = 10000                       # number of queries
+np.random.seed(1234)             # make reproducible
+xb = np.random.random((nb, d)).astype('float32')    # (nb, d) database vectors
+xb[:, 0] += np.arange(nb) / 1000.  # offset column 0 by row_number / 1000
+xq = np.random.random((nq, d)).astype('float32')    # (nq, d) query vectors
+xq[:, 0] += np.arange(nq) / 1000.  # same offset applied to the queries
+
+m = 8  # number of sub-vectors each vector is split into
+k = 4  # number of nearest neighbors returned by the unrefined search below
+n_bit = 4  # each sub-vector is encoded on 4 bits
+bbs = 32  # build block size ( bbs % 32 == 0 ) for PQ
+
+index = faiss.IndexPQFastScan(d, m, n_bit, faiss.METRIC_L2, bbs)
+index_refine = faiss.IndexRefineFlat(index)
+# construct FastScan and wrap it in a refining index
+
+assert not index_refine.is_trained
+index_refine.train(xb)  # train through the refine wrapper on the database vectors
+assert index_refine.is_trained
+
+index_refine.add(xb)
+params = faiss.IndexRefineSearchParameters(k_factor=3)
+D, I = index_refine.search(xq[:5], 10, params=params)  # refined search, 10 neighbors
+print(I)
+print(D)
+# no nprobe is set: IndexPQFastScan is not an IVF index, so there are no lists to probe
+D, I = index.search(xq[:5], k)  # same 5 queries on the wrapped index, without refinement
+print(I[-5:])  # only 5 queries were searched, so this prints every row
--- a/packages/leann-backend-hnsw/third_party/faiss/tutorial/python/9-RefineComparison.py
+++ b/packages/leann-backend-hnsw/third_party/faiss/tutorial/python/9-RefineComparison.py
@@ -0,0 +1,42 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import faiss
+
+from faiss.contrib.evaluation import knn_intersection_measure
+from faiss.contrib import datasets
+
+# 64-dim vectors, 50000 vectors in the training, 100000 in database,
+# 10000 in queries, dtype ('float32')
+ds = datasets.SyntheticDataset(64, 50000, 100000, 10000)
+d = 64                           # dimension (same value as given to the dataset)
+
+# Build a PQ fast-scan index refined with SQfp16 through the index factory
+index_fp16 = faiss.index_factory(d, 'PQ32x4fs,Refine(SQfp16)')
+index_fp16.train(ds.get_train())
+index_fp16.add(ds.get_database())
+
+# Build the same PQ fast-scan index, refined with SQ8 instead
+index_sq8 = faiss.index_factory(d, 'PQ32x4fs,Refine(SQ8)')
+index_sq8.train(ds.get_train())
+index_sq8.add(ds.get_database())
+
+# k_factor is the refinement search parameter shared by both searches below
+k_factor = 3.0
+params = faiss.IndexRefineSearchParameters(k_factor=k_factor)
+
+# Search both indexes with the same queries, asking for 100 neighbors each
+D_fp16, I_fp16 = index_fp16.search(ds.get_queries(), 100, params=params)
+D_sq8, I_sq8 = index_sq8.search(ds.get_queries(), 100, params=params)
+
+# Compare each result set with the ground truth via the knn intersection measure
+KIM_fp16 = knn_intersection_measure(I_fp16, ds.get_groundtruth())
+KIM_sq8 = knn_intersection_measure(I_sq8, ds.get_groundtruth())
+
+# The fp16 refinement is expected to score strictly higher than SQ8
+assert (KIM_fp16 > KIM_sq8)
+
+print(I_sq8[:5])                 # SQ8-refined neighbors of the 5 first queries
+print(I_fp16[:5])                # fp16-refined neighbors of the 5 first queries