Add ty type checker to CI and fix type errors

- Add ty (Astral's fast Python type checker) to GitHub CI workflow
- Fix type annotations across all RAG apps:
  - Update load_data return types from list[str] to list[dict[str, Any]]
  - Fix base_rag_example.py to properly handle dict format from create_text_chunks
- Fix type errors in leann-core:
  - chunking_utils.py: Add explicit type annotations
  - cli.py: Fix return type annotations for PDF extraction functions
  - interactive_utils.py: Fix readline import type handling
- Fix type errors in apps:
  - wechat_history.py: Fix return type annotations
  - document_rag.py, code_rag.py: Replace **kwargs with explicit arguments
- Add ty configuration to pyproject.toml

This resolves the bug introduced in PR #157 where create_text_chunks()
changed to return list[dict] but callers were not updated.
This commit is contained in:
Andy Lee
2025-12-23 09:04:20 +00:00
parent 8a2ea37871
commit d83a463c26
18 changed files with 83 additions and 47 deletions

View File

@@ -28,9 +28,30 @@ jobs:
run: |
uv run --only-group lint pre-commit run --all-files --show-diff-on-failure
type-check:
name: Type Check with ty
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
with:
ref: ${{ inputs.ref }}
submodules: recursive
- name: Install uv and Python
uses: astral-sh/setup-uv@v6
with:
python-version: '3.11'
- name: Install ty
run: uv tool install ty
- name: Run ty type checker
run: |
# Run ty on core packages and apps, excluding multimodal and tests
ty check --exclude "apps/multimodal/**" --exclude "tests/**" packages/leann-core/src apps
build:
needs: lint
needs: [lint, type-check]
name: Build ${{ matrix.os }} Python ${{ matrix.python }}
strategy:
matrix:

View File

@@ -6,7 +6,7 @@ Provides common parameters and functionality for all RAG examples.
import argparse
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Any, Union
from typing import Any
import dotenv
from leann.api import LeannBuilder, LeannChat
@@ -257,8 +257,8 @@ class BaseRAGExample(ABC):
pass
@abstractmethod
async def load_data(self, args) -> list[Union[str, dict[str, Any]]]:
"""Load data from the source. Returns list of text chunks (strings or dicts with 'text' key)."""
async def load_data(self, args) -> list[dict[str, Any]]:
"""Load data from the source. Returns list of text chunks as dicts with 'text' and 'metadata' keys."""
pass
def get_llm_config(self, args) -> dict[str, Any]:
@@ -282,8 +282,8 @@ class BaseRAGExample(ABC):
return config
async def build_index(self, args, texts: list[Union[str, dict[str, Any]]]) -> str:
"""Build LEANN index from texts (accepts strings or dicts with 'text' key)."""
async def build_index(self, args, texts: list[dict[str, Any]]) -> str:
"""Build LEANN index from text chunks (dicts with 'text' and 'metadata' keys)."""
index_path = str(Path(args.index_dir) / f"{self.default_index_name}.leann")
print(f"\n[Building Index] Creating {self.name} index...")

View File

@@ -6,6 +6,7 @@ Supports Chrome browser history.
import os
import sys
from pathlib import Path
from typing import Any
# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent))
@@ -85,7 +86,7 @@ class BrowserRAG(BaseRAGExample):
return profiles
async def load_data(self, args) -> list[str]:
async def load_data(self, args) -> list[dict[str, Any]]:
"""Load browser history and convert to text chunks."""
# Determine Chrome profiles
if args.chrome_profile and not args.auto_find_profiles:

View File

@@ -5,6 +5,7 @@ Supports ChatGPT export data from chat.html files.
import sys
from pathlib import Path
from typing import Any
# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent))
@@ -80,7 +81,7 @@ class ChatGPTRAG(BaseRAGExample):
return export_files
async def load_data(self, args) -> list[str]:
async def load_data(self, args) -> list[dict[str, Any]]:
"""Load ChatGPT export data and convert to text chunks."""
export_path = Path(args.export_path)

View File

@@ -5,6 +5,7 @@ Supports Claude export data from JSON files.
import sys
from pathlib import Path
from typing import Any
# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent))
@@ -80,7 +81,7 @@ class ClaudeRAG(BaseRAGExample):
return export_files
async def load_data(self, args) -> list[str]:
async def load_data(self, args) -> list[dict[str, Any]]:
"""Load Claude export data and convert to text chunks."""
export_path = Path(args.export_path)

View File

@@ -6,6 +6,7 @@ optimized chunking parameters.
import sys
from pathlib import Path
from typing import Any
# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent))
@@ -77,7 +78,7 @@ class CodeRAG(BaseRAGExample):
help="Try to preserve import statements in chunks (default: True)",
)
async def load_data(self, args) -> list[str]:
async def load_data(self, args) -> list[dict[str, Any]]:
"""Load code files and convert to AST-aware chunks."""
print(f"🔍 Scanning code repository: {args.repo_dir}")
print(f"📁 Including extensions: {args.include_extensions}")
@@ -88,14 +89,6 @@ class CodeRAG(BaseRAGExample):
if not repo_path.exists():
raise ValueError(f"Repository directory not found: {args.repo_dir}")
# Load code files with filtering
reader_kwargs = {
"recursive": True,
"encoding": "utf-8",
"required_exts": args.include_extensions,
"exclude_hidden": True,
}
# Create exclusion filter
def file_filter(file_path: str) -> bool:
"""Filter out unwanted files and directories."""
@@ -120,8 +113,11 @@ class CodeRAG(BaseRAGExample):
# Load documents with file filtering
documents = SimpleDirectoryReader(
args.repo_dir,
file_extractor=None, # Use default extractors
**reader_kwargs,
file_extractor=None,
recursive=True,
encoding="utf-8",
required_exts=args.include_extensions,
exclude_hidden=True,
).load_data(show_progress=True)
# Apply custom filtering

View File

@@ -5,7 +5,7 @@ Supports PDF, TXT, MD, and other document formats.
import sys
from pathlib import Path
from typing import Any, Union
from typing import Any
# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent))
@@ -52,7 +52,7 @@ class DocumentRAG(BaseRAGExample):
help="Enable AST-aware chunking for code files in the data directory",
)
async def load_data(self, args) -> list[Union[str, dict[str, Any]]]:
async def load_data(self, args) -> list[dict[str, Any]]:
"""Load documents and convert to text chunks."""
print(f"Loading documents from: {args.data_dir}")
if args.file_types:
@@ -66,16 +66,12 @@ class DocumentRAG(BaseRAGExample):
raise ValueError(f"Data directory not found: {args.data_dir}")
# Load documents
reader_kwargs = {
"recursive": True,
"encoding": "utf-8",
}
if args.file_types:
reader_kwargs["required_exts"] = args.file_types
documents = SimpleDirectoryReader(args.data_dir, **reader_kwargs).load_data(
show_progress=True
)
documents = SimpleDirectoryReader(
args.data_dir,
recursive=True,
encoding="utf-8",
required_exts=args.file_types if args.file_types else None,
).load_data(show_progress=True)
if not documents:
print(f"No documents found in {args.data_dir} with extensions {args.file_types}")

View File

@@ -5,6 +5,7 @@ Supports Apple Mail on macOS.
import sys
from pathlib import Path
from typing import Any
# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent))
@@ -64,7 +65,7 @@ class EmailRAG(BaseRAGExample):
return messages_dirs
async def load_data(self, args) -> list[str]:
async def load_data(self, args) -> list[dict[str, Any]]:
"""Load emails and convert to text chunks."""
# Determine mail directories
if args.mail_path:

View File

@@ -86,7 +86,7 @@ class WeChatHistoryReader(BaseReader):
text=True,
timeout=5,
)
return result.returncode == 0 and result.stdout.strip()
return result.returncode == 0 and bool(result.stdout.strip())
except Exception:
return False
@@ -314,7 +314,7 @@ class WeChatHistoryReader(BaseReader):
return concatenated_groups
def _create_concatenated_content(self, message_group: dict, contact_name: str) -> str:
def _create_concatenated_content(self, message_group: dict, contact_name: str) -> tuple[str, str]:
"""
Create concatenated content from a group of messages.

View File

@@ -14,6 +14,7 @@ import argparse
import pickle
import tempfile
from pathlib import Path
from typing import Any
import numpy as np
from PIL import Image
@@ -65,7 +66,7 @@ class ImageRAG(BaseRAGExample):
help="Batch size for CLIP embedding generation (default: 32)",
)
async def load_data(self, args) -> list[str]:
async def load_data(self, args) -> list[dict[str, Any]]:
"""Load images, generate CLIP embeddings, and return text descriptions."""
self._image_data = self._load_images_and_embeddings(args)
return [entry["text"] for entry in self._image_data]

View File

@@ -6,6 +6,7 @@ This example demonstrates how to build a RAG system on your iMessage conversatio
import asyncio
from pathlib import Path
from typing import Any
from leann.chunking_utils import create_text_chunks
@@ -56,7 +57,7 @@ class IMessageRAG(BaseRAGExample):
help="Overlap between text chunks (default: 200)",
)
async def load_data(self, args) -> list[str]:
async def load_data(self, args) -> list[dict[str, Any]]:
"""Load iMessage history and convert to text chunks."""
print("Loading iMessage conversation history...")

View File

@@ -11,6 +11,7 @@ Usage:
import argparse
import asyncio
from typing import Any
from apps.base_rag_example import BaseRAGExample
from apps.slack_data.slack_mcp_reader import SlackMCPReader
@@ -139,7 +140,7 @@ class SlackMCPRAG(BaseRAGExample):
print("4. Try running the MCP server command directly to test it")
return False
async def load_data(self, args) -> list[str]:
async def load_data(self, args) -> list[dict[str, Any]]:
"""Load Slack messages via MCP server."""
print(f"Connecting to Slack MCP server: {args.mcp_server}")

View File

@@ -11,6 +11,7 @@ Usage:
import argparse
import asyncio
from typing import Any
from apps.base_rag_example import BaseRAGExample
from apps.twitter_data.twitter_mcp_reader import TwitterMCPReader
@@ -116,7 +117,7 @@ class TwitterMCPRAG(BaseRAGExample):
print("5. Try running the MCP server command directly to test it")
return False
async def load_data(self, args) -> list[str]:
async def load_data(self, args) -> list[dict[str, Any]]:
"""Load Twitter bookmarks via MCP server."""
print(f"Connecting to Twitter MCP server: {args.mcp_server}")

View File

@@ -6,6 +6,7 @@ Supports WeChat chat history export and search.
import subprocess
import sys
from pathlib import Path
from typing import Any
# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent))
@@ -91,7 +92,7 @@ class WeChatRAG(BaseRAGExample):
print(f"Export error: {e}")
return False
async def load_data(self, args) -> list[str]:
async def load_data(self, args) -> list[dict[str, Any]]:
"""Load WeChat history and convert to text chunks."""
# Initialize WeChat reader with export capabilities
reader = WeChatHistoryReader()

View File

@@ -239,8 +239,8 @@ def create_ast_chunks(
chunks = chunk_builder.chunkify(code_content)
for chunk in chunks:
chunk_text = None
astchunk_metadata = {}
chunk_text: str | None = None
astchunk_metadata: dict[str, Any] = {}
if hasattr(chunk, "text"):
chunk_text = chunk.text

View File

@@ -19,7 +19,7 @@ from .settings import (
)
def extract_pdf_text_with_pymupdf(file_path: str) -> str:
def extract_pdf_text_with_pymupdf(file_path: str) -> str | None:
"""Extract text from PDF using PyMuPDF for better quality."""
try:
import fitz # PyMuPDF
@@ -35,7 +35,7 @@ def extract_pdf_text_with_pymupdf(file_path: str) -> str:
return None
def extract_pdf_text_with_pdfplumber(file_path: str) -> str:
def extract_pdf_text_with_pdfplumber(file_path: str) -> str | None:
"""Extract text from PDF using pdfplumber for better quality."""
try:
import pdfplumber

View File

@@ -11,14 +11,15 @@ from pathlib import Path
from typing import Callable, Optional
# Try to import readline with fallback for Windows
HAS_READLINE = False
readline = None # type: ignore[assignment]
try:
import readline
import readline # type: ignore[no-redef]
HAS_READLINE = True
except ImportError:
# Windows doesn't have readline by default
HAS_READLINE = False
readline = None
pass
class InteractiveSession:

View File

@@ -157,6 +157,19 @@ exclude = ["localhost", "127.0.0.1", "example.com"]
exclude_path = [".git/", ".venv/", "__pycache__/", "third_party/"]
scheme = ["https", "http"]
[tool.ty]
# Type checking with ty (Astral's fast Python type checker)
# ty is 10-100x faster than mypy. See: https://docs.astral.sh/ty/
[tool.ty.environment]
python-version = "3.11"
extra-paths = ["apps", "packages/leann-core/src"]
[tool.ty.rules]
# Disable some noisy rules that have many false positives
possibly-missing-attribute = "ignore"
unresolved-import = "ignore" # Many optional dependencies
[tool.pytest.ini_options]
testpaths = ["tests"]
python_files = ["test_*.py"]