From f1aca0f75604aedc1616d771de81aca3e25ea15a Mon Sep 17 00:00:00 2001
From: Andy Lee <andylizf@outlook.com>
Date: Fri, 15 Aug 2025 17:28:35 -0700
Subject: [PATCH] fix(core): skip empty/invalid chunks before embedding; guard
 OpenAI embeddings

Avoid 400 errors from OpenAI when the chunker yields empty strings by filtering
out invalid texts in LeannBuilder.build_index. Add fail-fast validation to the
OpenAI embedding path so that upstream issues surface earlier. Filtering before
embedding keeps passages and embeddings aligned during the build.

Refs #54
---
 packages/leann-core/src/leann/api.py            | 17 +++++++++++++++++
 .../leann-core/src/leann/embedding_compute.py   | 10 ++++++++++
 2 files changed, 27 insertions(+)

diff --git a/packages/leann-core/src/leann/api.py b/packages/leann-core/src/leann/api.py
index 144e858..cfcf1ad 100644
--- a/packages/leann-core/src/leann/api.py
+++ b/packages/leann-core/src/leann/api.py
@@ -306,6 +306,23 @@ class LeannBuilder:
     def build_index(self, index_path: str):
         if not self.chunks:
             raise ValueError("No chunks added.")
+
+        # Filter out invalid/empty text chunks early to keep passage and embedding counts aligned
+        valid_chunks: list[dict[str, Any]] = []
+        skipped = 0
+        for chunk in self.chunks:
+            text = chunk.get("text", "")
+            if isinstance(text, str) and text.strip():
+                valid_chunks.append(chunk)
+            else:
+                skipped += 1
+        if skipped > 0:
+            print(
+                f"Warning: Skipping {skipped} empty/invalid text chunk(s). Processing {len(valid_chunks)} valid chunks"
+            )
+            self.chunks = valid_chunks
+            if not self.chunks:
+                raise ValueError("All provided chunks are empty or invalid. Nothing to index.")
         if self.dimensions is None:
             self.dimensions = len(
                 compute_embeddings(
diff --git a/packages/leann-core/src/leann/embedding_compute.py b/packages/leann-core/src/leann/embedding_compute.py
index 9cce58c..ee038d5 100644
--- a/packages/leann-core/src/leann/embedding_compute.py
+++ b/packages/leann-core/src/leann/embedding_compute.py
@@ -244,6 +244,16 @@ def compute_embeddings_openai(texts: list[str], model_name: str) -> np.ndarray:
     except ImportError as e:
         raise ImportError(f"OpenAI package not installed: {e}")
 
+    # Validate input list
+    if not texts:
+        raise ValueError("Cannot compute embeddings for empty text list")
+    # Extra validation: abort early if any item is not a string or is empty/whitespace-only
+    invalid_count = sum(1 for t in texts if not isinstance(t, str) or not t.strip())
+    if invalid_count > 0:
+        raise ValueError(
+            f"Found {invalid_count} empty/invalid text(s) in input. Upstream should filter before calling OpenAI."
+        )
+
     api_key = os.getenv("OPENAI_API_KEY")
     if not api_key:
         raise RuntimeError("OPENAI_API_KEY environment variable not set")