Fix: handle dict format from create_text_chunks (introduced in PR #157)
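PR #157 changed create_text_chunks() to return list[dict] instead of list[str] in order to preserve metadata, but base_rag_example.py was not updated to handle the new format. As a result, every chunk failed validation with "All provided chunks are empty or invalid". This commit updates build_index() to accept both plain strings and dicts: for a dict, the 'text' and 'metadata' values are passed to builder.add_text(); a plain string is passed through unchanged. The load_data() type hints are widened to match.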
This commit is contained in:
@@ -6,7 +6,7 @@ Provides common parameters and functionality for all RAG examples.
|
|||||||
import argparse
|
import argparse
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any
|
from typing import Any, Union
|
||||||
|
|
||||||
import dotenv
|
import dotenv
|
||||||
from leann.api import LeannBuilder, LeannChat
|
from leann.api import LeannBuilder, LeannChat
|
||||||
@@ -257,8 +257,8 @@ class BaseRAGExample(ABC):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
async def load_data(self, args) -> list[str]:
|
async def load_data(self, args) -> list[Union[str, dict[str, Any]]]:
|
||||||
"""Load data from the source. Returns list of text chunks."""
|
"""Load data from the source. Returns list of text chunks (strings or dicts with 'text' key)."""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def get_llm_config(self, args) -> dict[str, Any]:
|
def get_llm_config(self, args) -> dict[str, Any]:
|
||||||
@@ -282,8 +282,8 @@ class BaseRAGExample(ABC):
|
|||||||
|
|
||||||
return config
|
return config
|
||||||
|
|
||||||
async def build_index(self, args, texts: list[str]) -> str:
|
async def build_index(self, args, texts: list[Union[str, dict[str, Any]]]) -> str:
|
||||||
"""Build LEANN index from texts."""
|
"""Build LEANN index from texts (accepts strings or dicts with 'text' key)."""
|
||||||
index_path = str(Path(args.index_dir) / f"{self.default_index_name}.leann")
|
index_path = str(Path(args.index_dir) / f"{self.default_index_name}.leann")
|
||||||
|
|
||||||
print(f"\n[Building Index] Creating {self.name} index...")
|
print(f"\n[Building Index] Creating {self.name} index...")
|
||||||
@@ -314,8 +314,14 @@ class BaseRAGExample(ABC):
|
|||||||
batch_size = 1000
|
batch_size = 1000
|
||||||
for i in range(0, len(texts), batch_size):
|
for i in range(0, len(texts), batch_size):
|
||||||
batch = texts[i : i + batch_size]
|
batch = texts[i : i + batch_size]
|
||||||
for text in batch:
|
for item in batch:
|
||||||
builder.add_text(text)
|
# Handle both dict format (from create_text_chunks) and plain strings
|
||||||
|
if isinstance(item, dict):
|
||||||
|
text = item.get("text", "")
|
||||||
|
metadata = item.get("metadata")
|
||||||
|
builder.add_text(text, metadata)
|
||||||
|
else:
|
||||||
|
builder.add_text(item)
|
||||||
print(f"Added {min(i + batch_size, len(texts))}/{len(texts)} texts...")
|
print(f"Added {min(i + batch_size, len(texts))}/{len(texts)} texts...")
|
||||||
|
|
||||||
print("Building index structure...")
|
print("Building index structure...")
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ Supports PDF, TXT, MD, and other document formats.
|
|||||||
|
|
||||||
import sys
|
import sys
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
from typing import Any, Union
|
||||||
|
|
||||||
# Add parent directory to path for imports
|
# Add parent directory to path for imports
|
||||||
sys.path.insert(0, str(Path(__file__).parent))
|
sys.path.insert(0, str(Path(__file__).parent))
|
||||||
@@ -51,7 +52,7 @@ class DocumentRAG(BaseRAGExample):
|
|||||||
help="Enable AST-aware chunking for code files in the data directory",
|
help="Enable AST-aware chunking for code files in the data directory",
|
||||||
)
|
)
|
||||||
|
|
||||||
async def load_data(self, args) -> list[str]:
|
async def load_data(self, args) -> list[Union[str, dict[str, Any]]]:
|
||||||
"""Load documents and convert to text chunks."""
|
"""Load documents and convert to text chunks."""
|
||||||
print(f"Loading documents from: {args.data_dir}")
|
print(f"Loading documents from: {args.data_dir}")
|
||||||
if args.file_types:
|
if args.file_types:
|
||||||
|
|||||||
Reference in New Issue
Block a user