From f1aca0f75604aedc1616d771de81aca3e25ea15a Mon Sep 17 00:00:00 2001 From: Andy Lee Date: Fri, 15 Aug 2025 17:28:35 -0700 Subject: [PATCH] fix(core): skip empty/invalid chunks before embedding; guard OpenAI embeddings Avoid 400 errors from OpenAI when chunker yields empty strings by filtering invalid texts in LeannBuilder.build_index. Add validation fail-fast in OpenAI embedding path to surface upstream issues earlier. Keeps passages and embeddings aligned during build. Refs #54 --- packages/leann-core/src/leann/api.py | 17 +++++++++++++++++ .../leann-core/src/leann/embedding_compute.py | 10 ++++++++++ 2 files changed, 27 insertions(+) diff --git a/packages/leann-core/src/leann/api.py b/packages/leann-core/src/leann/api.py index 144e858..cfcf1ad 100644 --- a/packages/leann-core/src/leann/api.py +++ b/packages/leann-core/src/leann/api.py @@ -306,6 +306,23 @@ class LeannBuilder: def build_index(self, index_path: str): if not self.chunks: raise ValueError("No chunks added.") + + # Filter out invalid/empty text chunks early to keep passage and embedding counts aligned + valid_chunks: list[dict[str, Any]] = [] + skipped = 0 + for chunk in self.chunks: + text = chunk.get("text", "") + if isinstance(text, str) and text.strip(): + valid_chunks.append(chunk) + else: + skipped += 1 + if skipped > 0: + print( + f"Warning: Skipping {skipped} empty/invalid text chunk(s). Processing {len(valid_chunks)} valid chunks" + ) + self.chunks = valid_chunks + if not self.chunks: + raise ValueError("All provided chunks are empty or invalid. Nothing to index.") if self.dimensions is None: self.dimensions = len( compute_embeddings( diff --git a/packages/leann-core/src/leann/embedding_compute.py b/packages/leann-core/src/leann/embedding_compute.py index 9cce58c..ee038d5 100644 --- a/packages/leann-core/src/leann/embedding_compute.py +++ b/packages/leann-core/src/leann/embedding_compute.py @@ -244,6 +244,16 @@ def compute_embeddings_openai(texts: list[str], model_name: str) -> np.ndarray: except ImportError as e: raise ImportError(f"OpenAI package not installed: {e}") + # Validate input list + if not texts: + raise ValueError("Cannot compute embeddings for empty text list") + # Extra validation: abort early if any item is empty/whitespace + invalid_count = sum(1 for t in texts if not isinstance(t, str) or not t.strip()) + if invalid_count > 0: + raise ValueError( + f"Found {invalid_count} empty/invalid text(s) in input. Upstream should filter before calling OpenAI." + ) + api_key = os.getenv("OPENAI_API_KEY") if not api_key: raise RuntimeError("OPENAI_API_KEY environment variable not set")