Add ty type checker to CI and fix type errors

- Add ty (Astral's fast Python type checker) to GitHub CI workflow
- Fix type annotations across all RAG apps:
  - Update load_data return types from list[str] (or list[Union[str, dict[str, Any]]]) to list[dict[str, Any]]
  - Fix base_rag_example.py to properly handle dict format from create_text_chunks
- Fix type errors in leann-core:
  - chunking_utils.py: Add explicit type annotations
  - cli.py: Fix return type annotations for PDF extraction functions
  - interactive_utils.py: Fix readline import type handling
- Fix type errors in apps:
  - wechat_history.py: Fix return types (coerce a check result to bool; annotate _create_concatenated_content as returning tuple[str, str])
  - document_rag.py, code_rag.py: Replace **reader_kwargs dict unpacking with explicit keyword arguments to SimpleDirectoryReader
- Add ty configuration to pyproject.toml

This resolves the bug introduced in PR #157, where create_text_chunks()
was changed to return list[dict] but its callers were not updated.
This commit is contained in:
Andy Lee
2025-12-23 09:04:20 +00:00
parent 8a2ea37871
commit d83a463c26
18 changed files with 83 additions and 47 deletions

View File

@@ -6,7 +6,7 @@ Provides common parameters and functionality for all RAG examples.
import argparse
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Any, Union
from typing import Any
import dotenv
from leann.api import LeannBuilder, LeannChat
@@ -257,8 +257,8 @@ class BaseRAGExample(ABC):
pass
@abstractmethod
async def load_data(self, args) -> list[Union[str, dict[str, Any]]]:
"""Load data from the source. Returns list of text chunks (strings or dicts with 'text' key)."""
async def load_data(self, args) -> list[dict[str, Any]]:
"""Load data from the source. Returns list of text chunks as dicts with 'text' and 'metadata' keys."""
pass
def get_llm_config(self, args) -> dict[str, Any]:
@@ -282,8 +282,8 @@ class BaseRAGExample(ABC):
return config
async def build_index(self, args, texts: list[Union[str, dict[str, Any]]]) -> str:
"""Build LEANN index from texts (accepts strings or dicts with 'text' key)."""
async def build_index(self, args, texts: list[dict[str, Any]]) -> str:
"""Build LEANN index from text chunks (dicts with 'text' and 'metadata' keys)."""
index_path = str(Path(args.index_dir) / f"{self.default_index_name}.leann")
print(f"\n[Building Index] Creating {self.name} index...")

View File

@@ -6,6 +6,7 @@ Supports Chrome browser history.
import os
import sys
from pathlib import Path
from typing import Any
# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent))
@@ -85,7 +86,7 @@ class BrowserRAG(BaseRAGExample):
return profiles
async def load_data(self, args) -> list[str]:
async def load_data(self, args) -> list[dict[str, Any]]:
"""Load browser history and convert to text chunks."""
# Determine Chrome profiles
if args.chrome_profile and not args.auto_find_profiles:

View File

@@ -5,6 +5,7 @@ Supports ChatGPT export data from chat.html files.
import sys
from pathlib import Path
from typing import Any
# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent))
@@ -80,7 +81,7 @@ class ChatGPTRAG(BaseRAGExample):
return export_files
async def load_data(self, args) -> list[str]:
async def load_data(self, args) -> list[dict[str, Any]]:
"""Load ChatGPT export data and convert to text chunks."""
export_path = Path(args.export_path)

View File

@@ -5,6 +5,7 @@ Supports Claude export data from JSON files.
import sys
from pathlib import Path
from typing import Any
# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent))
@@ -80,7 +81,7 @@ class ClaudeRAG(BaseRAGExample):
return export_files
async def load_data(self, args) -> list[str]:
async def load_data(self, args) -> list[dict[str, Any]]:
"""Load Claude export data and convert to text chunks."""
export_path = Path(args.export_path)

View File

@@ -6,6 +6,7 @@ optimized chunking parameters.
import sys
from pathlib import Path
from typing import Any
# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent))
@@ -77,7 +78,7 @@ class CodeRAG(BaseRAGExample):
help="Try to preserve import statements in chunks (default: True)",
)
async def load_data(self, args) -> list[str]:
async def load_data(self, args) -> list[dict[str, Any]]:
"""Load code files and convert to AST-aware chunks."""
print(f"🔍 Scanning code repository: {args.repo_dir}")
print(f"📁 Including extensions: {args.include_extensions}")
@@ -88,14 +89,6 @@ class CodeRAG(BaseRAGExample):
if not repo_path.exists():
raise ValueError(f"Repository directory not found: {args.repo_dir}")
# Load code files with filtering
reader_kwargs = {
"recursive": True,
"encoding": "utf-8",
"required_exts": args.include_extensions,
"exclude_hidden": True,
}
# Create exclusion filter
def file_filter(file_path: str) -> bool:
"""Filter out unwanted files and directories."""
@@ -120,8 +113,11 @@ class CodeRAG(BaseRAGExample):
# Load documents with file filtering
documents = SimpleDirectoryReader(
args.repo_dir,
file_extractor=None, # Use default extractors
**reader_kwargs,
file_extractor=None,
recursive=True,
encoding="utf-8",
required_exts=args.include_extensions,
exclude_hidden=True,
).load_data(show_progress=True)
# Apply custom filtering

View File

@@ -5,7 +5,7 @@ Supports PDF, TXT, MD, and other document formats.
import sys
from pathlib import Path
from typing import Any, Union
from typing import Any
# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent))
@@ -52,7 +52,7 @@ class DocumentRAG(BaseRAGExample):
help="Enable AST-aware chunking for code files in the data directory",
)
async def load_data(self, args) -> list[Union[str, dict[str, Any]]]:
async def load_data(self, args) -> list[dict[str, Any]]:
"""Load documents and convert to text chunks."""
print(f"Loading documents from: {args.data_dir}")
if args.file_types:
@@ -66,16 +66,12 @@ class DocumentRAG(BaseRAGExample):
raise ValueError(f"Data directory not found: {args.data_dir}")
# Load documents
reader_kwargs = {
"recursive": True,
"encoding": "utf-8",
}
if args.file_types:
reader_kwargs["required_exts"] = args.file_types
documents = SimpleDirectoryReader(args.data_dir, **reader_kwargs).load_data(
show_progress=True
)
documents = SimpleDirectoryReader(
args.data_dir,
recursive=True,
encoding="utf-8",
required_exts=args.file_types if args.file_types else None,
).load_data(show_progress=True)
if not documents:
print(f"No documents found in {args.data_dir} with extensions {args.file_types}")

View File

@@ -5,6 +5,7 @@ Supports Apple Mail on macOS.
import sys
from pathlib import Path
from typing import Any
# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent))
@@ -64,7 +65,7 @@ class EmailRAG(BaseRAGExample):
return messages_dirs
async def load_data(self, args) -> list[str]:
async def load_data(self, args) -> list[dict[str, Any]]:
"""Load emails and convert to text chunks."""
# Determine mail directories
if args.mail_path:

View File

@@ -86,7 +86,7 @@ class WeChatHistoryReader(BaseReader):
text=True,
timeout=5,
)
return result.returncode == 0 and result.stdout.strip()
return result.returncode == 0 and bool(result.stdout.strip())
except Exception:
return False
@@ -314,7 +314,7 @@ class WeChatHistoryReader(BaseReader):
return concatenated_groups
def _create_concatenated_content(self, message_group: dict, contact_name: str) -> str:
def _create_concatenated_content(self, message_group: dict, contact_name: str) -> tuple[str, str]:
"""
Create concatenated content from a group of messages.

View File

@@ -14,6 +14,7 @@ import argparse
import pickle
import tempfile
from pathlib import Path
from typing import Any
import numpy as np
from PIL import Image
@@ -65,7 +66,7 @@ class ImageRAG(BaseRAGExample):
help="Batch size for CLIP embedding generation (default: 32)",
)
async def load_data(self, args) -> list[str]:
async def load_data(self, args) -> list[dict[str, Any]]:
"""Load images, generate CLIP embeddings, and return text descriptions."""
self._image_data = self._load_images_and_embeddings(args)
return [entry["text"] for entry in self._image_data]

View File

@@ -6,6 +6,7 @@ This example demonstrates how to build a RAG system on your iMessage conversatio
import asyncio
from pathlib import Path
from typing import Any
from leann.chunking_utils import create_text_chunks
@@ -56,7 +57,7 @@ class IMessageRAG(BaseRAGExample):
help="Overlap between text chunks (default: 200)",
)
async def load_data(self, args) -> list[str]:
async def load_data(self, args) -> list[dict[str, Any]]:
"""Load iMessage history and convert to text chunks."""
print("Loading iMessage conversation history...")

View File

@@ -11,6 +11,7 @@ Usage:
import argparse
import asyncio
from typing import Any
from apps.base_rag_example import BaseRAGExample
from apps.slack_data.slack_mcp_reader import SlackMCPReader
@@ -139,7 +140,7 @@ class SlackMCPRAG(BaseRAGExample):
print("4. Try running the MCP server command directly to test it")
return False
async def load_data(self, args) -> list[str]:
async def load_data(self, args) -> list[dict[str, Any]]:
"""Load Slack messages via MCP server."""
print(f"Connecting to Slack MCP server: {args.mcp_server}")

View File

@@ -11,6 +11,7 @@ Usage:
import argparse
import asyncio
from typing import Any
from apps.base_rag_example import BaseRAGExample
from apps.twitter_data.twitter_mcp_reader import TwitterMCPReader
@@ -116,7 +117,7 @@ class TwitterMCPRAG(BaseRAGExample):
print("5. Try running the MCP server command directly to test it")
return False
async def load_data(self, args) -> list[str]:
async def load_data(self, args) -> list[dict[str, Any]]:
"""Load Twitter bookmarks via MCP server."""
print(f"Connecting to Twitter MCP server: {args.mcp_server}")

View File

@@ -6,6 +6,7 @@ Supports WeChat chat history export and search.
import subprocess
import sys
from pathlib import Path
from typing import Any
# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent))
@@ -91,7 +92,7 @@ class WeChatRAG(BaseRAGExample):
print(f"Export error: {e}")
return False
async def load_data(self, args) -> list[str]:
async def load_data(self, args) -> list[dict[str, Any]]:
"""Load WeChat history and convert to text chunks."""
# Initialize WeChat reader with export capabilities
reader = WeChatHistoryReader()