From e588100674394a0cd45b454802e149967c99ac20 Mon Sep 17 00:00:00 2001 From: Andy Lee Date: Mon, 29 Sep 2025 20:43:16 -0700 Subject: [PATCH 1/2] fix: set ntotal for storage as well (#129) --- packages/leann-core/src/leann/api.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/packages/leann-core/src/leann/api.py b/packages/leann-core/src/leann/api.py index 1c8ab55..07d8373 100644 --- a/packages/leann-core/src/leann/api.py +++ b/packages/leann-core/src/leann/api.py @@ -735,6 +735,20 @@ class LeannBuilder: storage_index = faiss.IndexFlatL2(index.d) index.storage = storage_index index.own_fields = True + # Faiss expects storage.ntotal to reflect the existing graph's + # population (even if the vectors themselves were pruned from disk + # for recompute mode). When we attach a fresh IndexFlat here its + # ntotal starts at zero, which later causes IndexHNSW::add to + # believe new "preset" levels were provided and trips the + # `n0 + n == levels.size()` assertion. Seed the temporary storage + # with the current ntotal so Faiss maintains the proper offset for + # incoming vectors. + try: + storage_index.ntotal = index.ntotal + except AttributeError: + # Older Faiss builds may not expose ntotal as a writable + # attribute; in that case we fall back to the default behaviour. + pass if index.d != embedding_dim: raise ValueError( f"Existing index dimension ({index.d}) does not match new embeddings ({embedding_dim})." 
From e2b37914ce4ae0fbbe87eefd08eba8cb22131631 Mon Sep 17 00:00:00 2001 From: yichuan520030910320 Date: Tue, 30 Sep 2025 00:48:46 -0700 Subject: [PATCH 2/2] add dynamic add test --- examples/dynamic_update_no_recompute.py | 6 +++++- packages/leann-backend-hnsw/third_party/faiss | 2 +- packages/leann-core/src/leann/api.py | 11 ++++++++++- 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/examples/dynamic_update_no_recompute.py b/examples/dynamic_update_no_recompute.py index 8641d38..84362dc 100644 --- a/examples/dynamic_update_no_recompute.py +++ b/examples/dynamic_update_no_recompute.py @@ -43,7 +43,11 @@ from apps.chunking import create_text_chunks REPO_ROOT = Path(__file__).resolve().parents[1] DEFAULT_QUERY = "What's LEANN?" -DEFAULT_INITIAL_FILES = [REPO_ROOT / "data" / "2501.14312v1 (1).pdf"] +DEFAULT_INITIAL_FILES = [ + REPO_ROOT / "data" / "2501.14312v1 (1).pdf", + REPO_ROOT / "data" / "huawei_pangu.md", + REPO_ROOT / "data" / "PrideandPrejudice.txt", +] DEFAULT_UPDATE_FILES = [REPO_ROOT / "data" / "2506.08276v1.pdf"] diff --git a/packages/leann-backend-hnsw/third_party/faiss b/packages/leann-backend-hnsw/third_party/faiss index ed96ff7..1d51f0c 160000 --- a/packages/leann-backend-hnsw/third_party/faiss +++ b/packages/leann-backend-hnsw/third_party/faiss @@ -1 +1 @@ -Subproject commit ed96ff7dbaea0562b994f8ce7823af41884b1010 +Subproject commit 1d51f0c07420808a18f85a4db6636fd25e4a1daa diff --git a/packages/leann-core/src/leann/api.py b/packages/leann-core/src/leann/api.py index 07d8373..0c18526 100644 --- a/packages/leann-core/src/leann/api.py +++ b/packages/leann-core/src/leann/api.py @@ -728,6 +728,7 @@ class LeannBuilder: index = faiss.read_index(str(index_file)) if hasattr(index, "is_recompute"): index.is_recompute = needs_recompute + print(f"index.is_recompute: {index.is_recompute}") if getattr(index, "storage", None) is None: if index.metric_type == faiss.METRIC_INNER_PRODUCT: storage_index = faiss.IndexFlatIP(index.d) @@ -760,7 +761,15 @@ 
class LeannBuilder: chunk.setdefault("metadata", {})["id"] = new_id chunk["id"] = new_id - index.add(embeddings.shape[0], faiss.swig_ptr(embeddings)) + if needs_recompute: + # sequentially add embeddings + for i in range(embeddings.shape[0]): + print(f"add {i} embeddings") + index.add(1, faiss.swig_ptr(embeddings[i : i + 1])) + else: + index.add(embeddings.shape[0], faiss.swig_ptr(embeddings)) + + # index.add(embeddings.shape[0], faiss.swig_ptr(embeddings)) faiss.write_index(index, str(index_file)) with open(passages_file, "a", encoding="utf-8") as f: