From 8a2ea37871c6466788ac5e24dc28ea0415ba6d10 Mon Sep 17 00:00:00 2001
From: Andy Lee <andylizf@outlook.com>
Date: Tue, 23 Dec 2025 08:50:31 +0000
Subject: [PATCH] Fix: handle dict format from create_text_chunks (introduced
 in PR #157)

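PR #157 changed create_text_chunks() to return list[dict] instead of
list[str] to preserve metadata, but base_rag_example.py was not updated
to handle the new format: build_index() kept passing each item straight
to builder.add_text() as if it were a plain string. This caused all
chunks to fail validation with "All provided chunks are empty or invalid".

build_index() now accepts both plain strings and dicts; for dicts it
passes the "text" and "metadata" values to builder.add_text(). The
load_data() return annotations in base_rag_example.py and
document_rag.py are widened to match.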
---
 apps/base_rag_example.py | 20 +++++++++++++-------
 apps/document_rag.py     |  3 ++-
 2 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/apps/base_rag_example.py b/apps/base_rag_example.py
index e67ee56..f695610 100644
--- a/apps/base_rag_example.py
+++ b/apps/base_rag_example.py
@@ -6,7 +6,7 @@ Provides common parameters and functionality for all RAG examples.
 import argparse
 from abc import ABC, abstractmethod
 from pathlib import Path
-from typing import Any
+from typing import Any, Union
 
 import dotenv
 from leann.api import LeannBuilder, LeannChat
@@ -257,8 +257,8 @@ class BaseRAGExample(ABC):
         pass
 
     @abstractmethod
-    async def load_data(self, args) -> list[str]:
-        """Load data from the source. Returns list of text chunks."""
+    async def load_data(self, args) -> list[Union[str, dict[str, Any]]]:
+        """Load data from the source. Returns list of text chunks (strings or dicts with 'text' key)."""
         pass
 
     def get_llm_config(self, args) -> dict[str, Any]:
@@ -282,8 +282,8 @@ class BaseRAGExample(ABC):
 
         return config
 
-    async def build_index(self, args, texts: list[str]) -> str:
-        """Build LEANN index from texts."""
+    async def build_index(self, args, texts: list[Union[str, dict[str, Any]]]) -> str:
+        """Build LEANN index from texts (accepts strings or dicts with 'text' key)."""
         index_path = str(Path(args.index_dir) / f"{self.default_index_name}.leann")
 
         print(f"\n[Building Index] Creating {self.name} index...")
@@ -314,8 +314,14 @@ class BaseRAGExample(ABC):
         batch_size = 1000
         for i in range(0, len(texts), batch_size):
             batch = texts[i : i + batch_size]
-            for text in batch:
-                builder.add_text(text)
+            for item in batch:
+                # Handle both dict format (from create_text_chunks) and plain strings
+                if isinstance(item, dict):
+                    text = item.get("text", "")
+                    metadata = item.get("metadata")
+                    builder.add_text(text, metadata)
+                else:
+                    builder.add_text(item)
             print(f"Added {min(i + batch_size, len(texts))}/{len(texts)} texts...")
 
         print("Building index structure...")
diff --git a/apps/document_rag.py b/apps/document_rag.py
index 8472f6f..280d0fb 100644
--- a/apps/document_rag.py
+++ b/apps/document_rag.py
@@ -5,6 +5,7 @@ Supports PDF, TXT, MD, and other document formats.
 
 import sys
 from pathlib import Path
+from typing import Any, Union
 
 # Add parent directory to path for imports
 sys.path.insert(0, str(Path(__file__).parent))
@@ -51,7 +52,7 @@ class DocumentRAG(BaseRAGExample):
             help="Enable AST-aware chunking for code files in the data directory",
         )
 
-    async def load_data(self, args) -> list[str]:
+    async def load_data(self, args) -> list[Union[str, dict[str, Any]]]:
         """Load documents and convert to text chunks."""
         print(f"Loading documents from: {args.data_dir}")
         if args.file_types: