Fix: handle dict format from create_text_chunks (introduced in PR #157)

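PR #157 changed create_text_chunks() to return list[dict] instead of list[str] to preserve metadata, but base_rag_example.py was not updated to handle the new format, so every chunk failed validation with the error "All provided chunks are empty or invalid". This commit makes build_index() accept both plain strings and dicts (passing each dict's "text" and "metadata" values to builder.add_text()) and widens the load_data()/build_index() type hints to match.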
2025-12-23 08:50:31 +00:00
parent 7ddb4772c0
commit 8a2ea37871
2 changed files with 15 additions and 8 deletions
--- a/apps/base_rag_example.py
+++ b/apps/base_rag_example.py
@@ -6,7 +6,7 @@ Provides common parameters and functionality for all RAG examples.
 import argparse
 from abc import ABC, abstractmethod
 from pathlib import Path
-from typing import Any
+from typing import Any, Union

 import dotenv
 from leann.api import LeannBuilder, LeannChat
@@ -257,8 +257,8 @@ class BaseRAGExample(ABC):
        pass

    @abstractmethod
-    async def load_data(self, args) -> list[str]:
-        """Load data from the source. Returns list of text chunks."""
+    async def load_data(self, args) -> list[Union[str, dict[str, Any]]]:
+        """Load data from the source. Returns list of text chunks (strings or dicts with 'text' key)."""
        pass

    def get_llm_config(self, args) -> dict[str, Any]:
@@ -282,8 +282,8 @@ class BaseRAGExample(ABC):

        return config

-    async def build_index(self, args, texts: list[str]) -> str:
-        """Build LEANN index from texts."""
+    async def build_index(self, args, texts: list[Union[str, dict[str, Any]]]) -> str:
+        """Build LEANN index from texts (accepts strings or dicts with 'text' key)."""
        index_path = str(Path(args.index_dir) / f"{self.default_index_name}.leann")

        print(f"\n[Building Index] Creating {self.name} index...")
@@ -314,8 +314,14 @@ class BaseRAGExample(ABC):
        batch_size = 1000
        for i in range(0, len(texts), batch_size):
            batch = texts[i : i + batch_size]
-            for text in batch:
-                builder.add_text(text)
+            for item in batch:
+                # Handle both dict format (from create_text_chunks) and plain strings
+                if isinstance(item, dict):
+                    text = item.get("text", "")
+                    metadata = item.get("metadata")
+                    builder.add_text(text, metadata)
+                else:
+                    builder.add_text(item)
            print(f"Added {min(i + batch_size, len(texts))}/{len(texts)} texts...")

        print("Building index structure...")
--- a/apps/document_rag.py
+++ b/apps/document_rag.py
@@ -5,6 +5,7 @@ Supports PDF, TXT, MD, and other document formats.

 import sys
 from pathlib import Path
+from typing import Any, Union

 # Add parent directory to path for imports
 sys.path.insert(0, str(Path(__file__).parent))
@@ -51,7 +52,7 @@ class DocumentRAG(BaseRAGExample):
            help="Enable AST-aware chunking for code files in the data directory",
        )

-    async def load_data(self, args) -> list[str]:
+    async def load_data(self, args) -> list[Union[str, dict[str, Any]]]:
        """Load documents and convert to text chunks."""
        print(f"Loading documents from: {args.data_dir}")
        if args.file_types: