Fix: handle dict format from create_text_chunks (introduced in PR #157)

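PR #157 changed create_text_chunks() to return list[dict] instead of
list[str] to preserve metadata, but base_rag_example.py was not updated
to handle the new format. This caused all chunks to fail validation
with "All provided chunks are empty or invalid".

The fix: build_index() now accepts both plain strings and dicts. For a
dict item it passes the "text" value and the "metadata" value to
builder.add_text(); a plain string is passed through unchanged. The
load_data() and build_index() type hints are widened to
list[Union[str, dict[str, Any]]] to match, in both BaseRAGExample and
DocumentRAG.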
This commit is contained in:
Andy Lee
2025-12-23 08:50:31 +00:00
parent 7ddb4772c0
commit 8a2ea37871
2 changed files with 15 additions and 8 deletions

View File

@@ -6,7 +6,7 @@ Provides common parameters and functionality for all RAG examples.
import argparse import argparse
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from pathlib import Path from pathlib import Path
from typing import Any from typing import Any, Union
import dotenv import dotenv
from leann.api import LeannBuilder, LeannChat from leann.api import LeannBuilder, LeannChat
@@ -257,8 +257,8 @@ class BaseRAGExample(ABC):
pass pass
@abstractmethod @abstractmethod
async def load_data(self, args) -> list[str]: async def load_data(self, args) -> list[Union[str, dict[str, Any]]]:
"""Load data from the source. Returns list of text chunks.""" """Load data from the source. Returns list of text chunks (strings or dicts with 'text' key)."""
pass pass
def get_llm_config(self, args) -> dict[str, Any]: def get_llm_config(self, args) -> dict[str, Any]:
@@ -282,8 +282,8 @@ class BaseRAGExample(ABC):
return config return config
async def build_index(self, args, texts: list[str]) -> str: async def build_index(self, args, texts: list[Union[str, dict[str, Any]]]) -> str:
"""Build LEANN index from texts.""" """Build LEANN index from texts (accepts strings or dicts with 'text' key)."""
index_path = str(Path(args.index_dir) / f"{self.default_index_name}.leann") index_path = str(Path(args.index_dir) / f"{self.default_index_name}.leann")
print(f"\n[Building Index] Creating {self.name} index...") print(f"\n[Building Index] Creating {self.name} index...")
@@ -314,8 +314,14 @@ class BaseRAGExample(ABC):
batch_size = 1000 batch_size = 1000
for i in range(0, len(texts), batch_size): for i in range(0, len(texts), batch_size):
batch = texts[i : i + batch_size] batch = texts[i : i + batch_size]
for text in batch: for item in batch:
builder.add_text(text) # Handle both dict format (from create_text_chunks) and plain strings
if isinstance(item, dict):
text = item.get("text", "")
metadata = item.get("metadata")
builder.add_text(text, metadata)
else:
builder.add_text(item)
print(f"Added {min(i + batch_size, len(texts))}/{len(texts)} texts...") print(f"Added {min(i + batch_size, len(texts))}/{len(texts)} texts...")
print("Building index structure...") print("Building index structure...")

View File

@@ -5,6 +5,7 @@ Supports PDF, TXT, MD, and other document formats.
import sys import sys
from pathlib import Path from pathlib import Path
from typing import Any, Union
# Add parent directory to path for imports # Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent)) sys.path.insert(0, str(Path(__file__).parent))
@@ -51,7 +52,7 @@ class DocumentRAG(BaseRAGExample):
help="Enable AST-aware chunking for code files in the data directory", help="Enable AST-aware chunking for code files in the data directory",
) )
async def load_data(self, args) -> list[str]: async def load_data(self, args) -> list[Union[str, dict[str, Any]]]:
"""Load documents and convert to text chunks.""" """Load documents and convert to text chunks."""
print(f"Loading documents from: {args.data_dir}") print(f"Loading documents from: {args.data_dir}")
if args.file_types: if args.file_types: