From 8a2ea37871c6466788ac5e24dc28ea0415ba6d10 Mon Sep 17 00:00:00 2001 From: Andy Lee Date: Tue, 23 Dec 2025 08:50:31 +0000 Subject: [PATCH] Fix: handle dict format from create_text_chunks (introduced in PR #157) PR #157 changed create_text_chunks() to return list[dict] instead of list[str] to preserve metadata, but base_rag_example.py was not updated to handle the new format. This caused all chunks to fail validation with "All provided chunks are empty or invalid". --- apps/base_rag_example.py | 20 +++++++++++++------- apps/document_rag.py | 3 ++- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/apps/base_rag_example.py b/apps/base_rag_example.py index e67ee56..f695610 100644 --- a/apps/base_rag_example.py +++ b/apps/base_rag_example.py @@ -6,7 +6,7 @@ Provides common parameters and functionality for all RAG examples. import argparse from abc import ABC, abstractmethod from pathlib import Path -from typing import Any +from typing import Any, Union import dotenv from leann.api import LeannBuilder, LeannChat @@ -257,8 +257,8 @@ class BaseRAGExample(ABC): pass @abstractmethod - async def load_data(self, args) -> list[str]: - """Load data from the source. Returns list of text chunks.""" + async def load_data(self, args) -> list[Union[str, dict[str, Any]]]: + """Load data from the source. Returns list of text chunks (strings or dicts with 'text' key).""" pass def get_llm_config(self, args) -> dict[str, Any]: @@ -282,8 +282,8 @@ class BaseRAGExample(ABC): return config - async def build_index(self, args, texts: list[str]) -> str: - """Build LEANN index from texts.""" + async def build_index(self, args, texts: list[Union[str, dict[str, Any]]]) -> str: + """Build LEANN index from texts (accepts strings or dicts with 'text' key).""" index_path = str(Path(args.index_dir) / f"{self.default_index_name}.leann") print(f"\n[Building Index] Creating {self.name} index...") @@ -314,8 +314,14 @@ class BaseRAGExample(ABC): batch_size = 1000 for i in range(0, len(texts), batch_size): batch = texts[i : i + batch_size] - for text in batch: - builder.add_text(text) + for item in batch: + # Handle both dict format (from create_text_chunks) and plain strings + if isinstance(item, dict): + text = item.get("text", "") + metadata = item.get("metadata") + builder.add_text(text, metadata) + else: + builder.add_text(item) print(f"Added {min(i + batch_size, len(texts))}/{len(texts)} texts...") print("Building index structure...") diff --git a/apps/document_rag.py b/apps/document_rag.py index 8472f6f..280d0fb 100644 --- a/apps/document_rag.py +++ b/apps/document_rag.py @@ -5,6 +5,7 @@ Supports PDF, TXT, MD, and other document formats. import sys from pathlib import Path +from typing import Any, Union # Add parent directory to path for imports sys.path.insert(0, str(Path(__file__).parent)) @@ -51,7 +52,7 @@ class DocumentRAG(BaseRAGExample): help="Enable AST-aware chunking for code files in the data directory", ) - async def load_data(self, args) -> list[str]: + async def load_data(self, args) -> list[Union[str, dict[str, Any]]]: """Load documents and convert to text chunks.""" print(f"Loading documents from: {args.data_dir}") if args.file_types: