diff --git a/packages/leann-core/src/leann/api.py b/packages/leann-core/src/leann/api.py index 144e858..cfcf1ad 100644 --- a/packages/leann-core/src/leann/api.py +++ b/packages/leann-core/src/leann/api.py @@ -306,6 +306,23 @@ class LeannBuilder: def build_index(self, index_path: str): if not self.chunks: raise ValueError("No chunks added.") + + # Filter out invalid/empty text chunks early to keep passage and embedding counts aligned + valid_chunks: list[dict[str, Any]] = [] + skipped = 0 + for chunk in self.chunks: + text = chunk.get("text", "") + if isinstance(text, str) and text.strip(): + valid_chunks.append(chunk) + else: + skipped += 1 + if skipped > 0: + print( + f"Warning: Skipping {skipped} empty/invalid text chunk(s). Processing {len(valid_chunks)} valid chunks" + ) + self.chunks = valid_chunks + if not self.chunks: + raise ValueError("All provided chunks are empty or invalid. Nothing to index.") if self.dimensions is None: self.dimensions = len( compute_embeddings( diff --git a/packages/leann-core/src/leann/embedding_compute.py b/packages/leann-core/src/leann/embedding_compute.py index 9cce58c..ee038d5 100644 --- a/packages/leann-core/src/leann/embedding_compute.py +++ b/packages/leann-core/src/leann/embedding_compute.py @@ -244,6 +244,16 @@ def compute_embeddings_openai(texts: list[str], model_name: str) -> np.ndarray: except ImportError as e: raise ImportError(f"OpenAI package not installed: {e}") + # Validate input list + if not texts: + raise ValueError("Cannot compute embeddings for empty text list") + # Extra validation: abort early if any item is empty/whitespace + invalid_count = sum(1 for t in texts if not isinstance(t, str) or not t.strip()) + if invalid_count > 0: + raise ValueError( + f"Found {invalid_count} empty/invalid text(s) in input. Upstream should filter before calling OpenAI." + ) + api_key = os.getenv("OPENAI_API_KEY") if not api_key: raise RuntimeError("OPENAI_API_KEY environment variable not set")