Add ty type checker to CI and fix type errors

- Add ty (Astral's fast Python type checker) to GitHub CI workflow
- Fix type annotations across all RAG apps:
  - Update load_data return types from list[str] to list[dict[str, Any]]
  - Fix base_rag_example.py to properly handle dict format from create_text_chunks
- Fix type errors in leann-core:
  - chunking_utils.py: Add explicit type annotations
  - cli.py: Fix return type annotations for PDF extraction functions
    (str -> str | None, since they return None on failure)
  - interactive_utils.py: Fix readline import type handling
- Fix type errors in apps:
  - wechat_history.py: Fix return types (coerce the check result with
    bool(); annotate _create_concatenated_content as tuple[str, str])
  - document_rag.py, code_rag.py: Replace **reader_kwargs dict unpacking
    with explicit keyword arguments to SimpleDirectoryReader
- Add ty configuration to pyproject.toml

This resolves the bug introduced in PR #157, where create_text_chunks()
was changed to return list[dict] but its callers were not updated.
This commit is contained in:
Andy Lee
2025-12-23 09:04:20 +00:00
parent 8a2ea37871
commit d83a463c26
18 changed files with 83 additions and 47 deletions

View File

@@ -28,9 +28,30 @@ jobs:
run: | run: |
uv run --only-group lint pre-commit run --all-files --show-diff-on-failure uv run --only-group lint pre-commit run --all-files --show-diff-on-failure
type-check:
name: Type Check with ty
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
with:
ref: ${{ inputs.ref }}
submodules: recursive
- name: Install uv and Python
uses: astral-sh/setup-uv@v6
with:
python-version: '3.11'
- name: Install ty
run: uv tool install ty
- name: Run ty type checker
run: |
# Run ty on core packages and apps, excluding multimodal and tests
ty check --exclude "apps/multimodal/**" --exclude "tests/**" packages/leann-core/src apps
build: build:
needs: lint needs: [lint, type-check]
name: Build ${{ matrix.os }} Python ${{ matrix.python }} name: Build ${{ matrix.os }} Python ${{ matrix.python }}
strategy: strategy:
matrix: matrix:

View File

@@ -6,7 +6,7 @@ Provides common parameters and functionality for all RAG examples.
import argparse import argparse
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from pathlib import Path from pathlib import Path
from typing import Any, Union from typing import Any
import dotenv import dotenv
from leann.api import LeannBuilder, LeannChat from leann.api import LeannBuilder, LeannChat
@@ -257,8 +257,8 @@ class BaseRAGExample(ABC):
pass pass
@abstractmethod @abstractmethod
async def load_data(self, args) -> list[Union[str, dict[str, Any]]]: async def load_data(self, args) -> list[dict[str, Any]]:
"""Load data from the source. Returns list of text chunks (strings or dicts with 'text' key).""" """Load data from the source. Returns list of text chunks as dicts with 'text' and 'metadata' keys."""
pass pass
def get_llm_config(self, args) -> dict[str, Any]: def get_llm_config(self, args) -> dict[str, Any]:
@@ -282,8 +282,8 @@ class BaseRAGExample(ABC):
return config return config
async def build_index(self, args, texts: list[Union[str, dict[str, Any]]]) -> str: async def build_index(self, args, texts: list[dict[str, Any]]) -> str:
"""Build LEANN index from texts (accepts strings or dicts with 'text' key).""" """Build LEANN index from text chunks (dicts with 'text' and 'metadata' keys)."""
index_path = str(Path(args.index_dir) / f"{self.default_index_name}.leann") index_path = str(Path(args.index_dir) / f"{self.default_index_name}.leann")
print(f"\n[Building Index] Creating {self.name} index...") print(f"\n[Building Index] Creating {self.name} index...")

View File

@@ -6,6 +6,7 @@ Supports Chrome browser history.
import os import os
import sys import sys
from pathlib import Path from pathlib import Path
from typing import Any
# Add parent directory to path for imports # Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent)) sys.path.insert(0, str(Path(__file__).parent))
@@ -85,7 +86,7 @@ class BrowserRAG(BaseRAGExample):
return profiles return profiles
async def load_data(self, args) -> list[str]: async def load_data(self, args) -> list[dict[str, Any]]:
"""Load browser history and convert to text chunks.""" """Load browser history and convert to text chunks."""
# Determine Chrome profiles # Determine Chrome profiles
if args.chrome_profile and not args.auto_find_profiles: if args.chrome_profile and not args.auto_find_profiles:

View File

@@ -5,6 +5,7 @@ Supports ChatGPT export data from chat.html files.
import sys import sys
from pathlib import Path from pathlib import Path
from typing import Any
# Add parent directory to path for imports # Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent)) sys.path.insert(0, str(Path(__file__).parent))
@@ -80,7 +81,7 @@ class ChatGPTRAG(BaseRAGExample):
return export_files return export_files
async def load_data(self, args) -> list[str]: async def load_data(self, args) -> list[dict[str, Any]]:
"""Load ChatGPT export data and convert to text chunks.""" """Load ChatGPT export data and convert to text chunks."""
export_path = Path(args.export_path) export_path = Path(args.export_path)

View File

@@ -5,6 +5,7 @@ Supports Claude export data from JSON files.
import sys import sys
from pathlib import Path from pathlib import Path
from typing import Any
# Add parent directory to path for imports # Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent)) sys.path.insert(0, str(Path(__file__).parent))
@@ -80,7 +81,7 @@ class ClaudeRAG(BaseRAGExample):
return export_files return export_files
async def load_data(self, args) -> list[str]: async def load_data(self, args) -> list[dict[str, Any]]:
"""Load Claude export data and convert to text chunks.""" """Load Claude export data and convert to text chunks."""
export_path = Path(args.export_path) export_path = Path(args.export_path)

View File

@@ -6,6 +6,7 @@ optimized chunking parameters.
import sys import sys
from pathlib import Path from pathlib import Path
from typing import Any
# Add parent directory to path for imports # Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent)) sys.path.insert(0, str(Path(__file__).parent))
@@ -77,7 +78,7 @@ class CodeRAG(BaseRAGExample):
help="Try to preserve import statements in chunks (default: True)", help="Try to preserve import statements in chunks (default: True)",
) )
async def load_data(self, args) -> list[str]: async def load_data(self, args) -> list[dict[str, Any]]:
"""Load code files and convert to AST-aware chunks.""" """Load code files and convert to AST-aware chunks."""
print(f"🔍 Scanning code repository: {args.repo_dir}") print(f"🔍 Scanning code repository: {args.repo_dir}")
print(f"📁 Including extensions: {args.include_extensions}") print(f"📁 Including extensions: {args.include_extensions}")
@@ -88,14 +89,6 @@ class CodeRAG(BaseRAGExample):
if not repo_path.exists(): if not repo_path.exists():
raise ValueError(f"Repository directory not found: {args.repo_dir}") raise ValueError(f"Repository directory not found: {args.repo_dir}")
# Load code files with filtering
reader_kwargs = {
"recursive": True,
"encoding": "utf-8",
"required_exts": args.include_extensions,
"exclude_hidden": True,
}
# Create exclusion filter # Create exclusion filter
def file_filter(file_path: str) -> bool: def file_filter(file_path: str) -> bool:
"""Filter out unwanted files and directories.""" """Filter out unwanted files and directories."""
@@ -120,8 +113,11 @@ class CodeRAG(BaseRAGExample):
# Load documents with file filtering # Load documents with file filtering
documents = SimpleDirectoryReader( documents = SimpleDirectoryReader(
args.repo_dir, args.repo_dir,
file_extractor=None, # Use default extractors file_extractor=None,
**reader_kwargs, recursive=True,
encoding="utf-8",
required_exts=args.include_extensions,
exclude_hidden=True,
).load_data(show_progress=True) ).load_data(show_progress=True)
# Apply custom filtering # Apply custom filtering

View File

@@ -5,7 +5,7 @@ Supports PDF, TXT, MD, and other document formats.
import sys import sys
from pathlib import Path from pathlib import Path
from typing import Any, Union from typing import Any
# Add parent directory to path for imports # Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent)) sys.path.insert(0, str(Path(__file__).parent))
@@ -52,7 +52,7 @@ class DocumentRAG(BaseRAGExample):
help="Enable AST-aware chunking for code files in the data directory", help="Enable AST-aware chunking for code files in the data directory",
) )
async def load_data(self, args) -> list[Union[str, dict[str, Any]]]: async def load_data(self, args) -> list[dict[str, Any]]:
"""Load documents and convert to text chunks.""" """Load documents and convert to text chunks."""
print(f"Loading documents from: {args.data_dir}") print(f"Loading documents from: {args.data_dir}")
if args.file_types: if args.file_types:
@@ -66,16 +66,12 @@ class DocumentRAG(BaseRAGExample):
raise ValueError(f"Data directory not found: {args.data_dir}") raise ValueError(f"Data directory not found: {args.data_dir}")
# Load documents # Load documents
reader_kwargs = { documents = SimpleDirectoryReader(
"recursive": True, args.data_dir,
"encoding": "utf-8", recursive=True,
} encoding="utf-8",
if args.file_types: required_exts=args.file_types if args.file_types else None,
reader_kwargs["required_exts"] = args.file_types ).load_data(show_progress=True)
documents = SimpleDirectoryReader(args.data_dir, **reader_kwargs).load_data(
show_progress=True
)
if not documents: if not documents:
print(f"No documents found in {args.data_dir} with extensions {args.file_types}") print(f"No documents found in {args.data_dir} with extensions {args.file_types}")

View File

@@ -5,6 +5,7 @@ Supports Apple Mail on macOS.
import sys import sys
from pathlib import Path from pathlib import Path
from typing import Any
# Add parent directory to path for imports # Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent)) sys.path.insert(0, str(Path(__file__).parent))
@@ -64,7 +65,7 @@ class EmailRAG(BaseRAGExample):
return messages_dirs return messages_dirs
async def load_data(self, args) -> list[str]: async def load_data(self, args) -> list[dict[str, Any]]:
"""Load emails and convert to text chunks.""" """Load emails and convert to text chunks."""
# Determine mail directories # Determine mail directories
if args.mail_path: if args.mail_path:

View File

@@ -86,7 +86,7 @@ class WeChatHistoryReader(BaseReader):
text=True, text=True,
timeout=5, timeout=5,
) )
return result.returncode == 0 and result.stdout.strip() return result.returncode == 0 and bool(result.stdout.strip())
except Exception: except Exception:
return False return False
@@ -314,7 +314,7 @@ class WeChatHistoryReader(BaseReader):
return concatenated_groups return concatenated_groups
def _create_concatenated_content(self, message_group: dict, contact_name: str) -> str: def _create_concatenated_content(self, message_group: dict, contact_name: str) -> tuple[str, str]:
""" """
Create concatenated content from a group of messages. Create concatenated content from a group of messages.

View File

@@ -14,6 +14,7 @@ import argparse
import pickle import pickle
import tempfile import tempfile
from pathlib import Path from pathlib import Path
from typing import Any
import numpy as np import numpy as np
from PIL import Image from PIL import Image
@@ -65,7 +66,7 @@ class ImageRAG(BaseRAGExample):
help="Batch size for CLIP embedding generation (default: 32)", help="Batch size for CLIP embedding generation (default: 32)",
) )
async def load_data(self, args) -> list[str]: async def load_data(self, args) -> list[dict[str, Any]]:
"""Load images, generate CLIP embeddings, and return text descriptions.""" """Load images, generate CLIP embeddings, and return text descriptions."""
self._image_data = self._load_images_and_embeddings(args) self._image_data = self._load_images_and_embeddings(args)
return [entry["text"] for entry in self._image_data] return [entry["text"] for entry in self._image_data]

View File

@@ -6,6 +6,7 @@ This example demonstrates how to build a RAG system on your iMessage conversatio
import asyncio import asyncio
from pathlib import Path from pathlib import Path
from typing import Any
from leann.chunking_utils import create_text_chunks from leann.chunking_utils import create_text_chunks
@@ -56,7 +57,7 @@ class IMessageRAG(BaseRAGExample):
help="Overlap between text chunks (default: 200)", help="Overlap between text chunks (default: 200)",
) )
async def load_data(self, args) -> list[str]: async def load_data(self, args) -> list[dict[str, Any]]:
"""Load iMessage history and convert to text chunks.""" """Load iMessage history and convert to text chunks."""
print("Loading iMessage conversation history...") print("Loading iMessage conversation history...")

View File

@@ -11,6 +11,7 @@ Usage:
import argparse import argparse
import asyncio import asyncio
from typing import Any
from apps.base_rag_example import BaseRAGExample from apps.base_rag_example import BaseRAGExample
from apps.slack_data.slack_mcp_reader import SlackMCPReader from apps.slack_data.slack_mcp_reader import SlackMCPReader
@@ -139,7 +140,7 @@ class SlackMCPRAG(BaseRAGExample):
print("4. Try running the MCP server command directly to test it") print("4. Try running the MCP server command directly to test it")
return False return False
async def load_data(self, args) -> list[str]: async def load_data(self, args) -> list[dict[str, Any]]:
"""Load Slack messages via MCP server.""" """Load Slack messages via MCP server."""
print(f"Connecting to Slack MCP server: {args.mcp_server}") print(f"Connecting to Slack MCP server: {args.mcp_server}")

View File

@@ -11,6 +11,7 @@ Usage:
import argparse import argparse
import asyncio import asyncio
from typing import Any
from apps.base_rag_example import BaseRAGExample from apps.base_rag_example import BaseRAGExample
from apps.twitter_data.twitter_mcp_reader import TwitterMCPReader from apps.twitter_data.twitter_mcp_reader import TwitterMCPReader
@@ -116,7 +117,7 @@ class TwitterMCPRAG(BaseRAGExample):
print("5. Try running the MCP server command directly to test it") print("5. Try running the MCP server command directly to test it")
return False return False
async def load_data(self, args) -> list[str]: async def load_data(self, args) -> list[dict[str, Any]]:
"""Load Twitter bookmarks via MCP server.""" """Load Twitter bookmarks via MCP server."""
print(f"Connecting to Twitter MCP server: {args.mcp_server}") print(f"Connecting to Twitter MCP server: {args.mcp_server}")

View File

@@ -6,6 +6,7 @@ Supports WeChat chat history export and search.
import subprocess import subprocess
import sys import sys
from pathlib import Path from pathlib import Path
from typing import Any
# Add parent directory to path for imports # Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent)) sys.path.insert(0, str(Path(__file__).parent))
@@ -91,7 +92,7 @@ class WeChatRAG(BaseRAGExample):
print(f"Export error: {e}") print(f"Export error: {e}")
return False return False
async def load_data(self, args) -> list[str]: async def load_data(self, args) -> list[dict[str, Any]]:
"""Load WeChat history and convert to text chunks.""" """Load WeChat history and convert to text chunks."""
# Initialize WeChat reader with export capabilities # Initialize WeChat reader with export capabilities
reader = WeChatHistoryReader() reader = WeChatHistoryReader()

View File

@@ -239,8 +239,8 @@ def create_ast_chunks(
chunks = chunk_builder.chunkify(code_content) chunks = chunk_builder.chunkify(code_content)
for chunk in chunks: for chunk in chunks:
chunk_text = None chunk_text: str | None = None
astchunk_metadata = {} astchunk_metadata: dict[str, Any] = {}
if hasattr(chunk, "text"): if hasattr(chunk, "text"):
chunk_text = chunk.text chunk_text = chunk.text

View File

@@ -19,7 +19,7 @@ from .settings import (
) )
def extract_pdf_text_with_pymupdf(file_path: str) -> str: def extract_pdf_text_with_pymupdf(file_path: str) -> str | None:
"""Extract text from PDF using PyMuPDF for better quality.""" """Extract text from PDF using PyMuPDF for better quality."""
try: try:
import fitz # PyMuPDF import fitz # PyMuPDF
@@ -35,7 +35,7 @@ def extract_pdf_text_with_pymupdf(file_path: str) -> str:
return None return None
def extract_pdf_text_with_pdfplumber(file_path: str) -> str: def extract_pdf_text_with_pdfplumber(file_path: str) -> str | None:
"""Extract text from PDF using pdfplumber for better quality.""" """Extract text from PDF using pdfplumber for better quality."""
try: try:
import pdfplumber import pdfplumber

View File

@@ -11,14 +11,15 @@ from pathlib import Path
from typing import Callable, Optional from typing import Callable, Optional
# Try to import readline with fallback for Windows # Try to import readline with fallback for Windows
HAS_READLINE = False
readline = None # type: ignore[assignment]
try: try:
import readline import readline # type: ignore[no-redef]
HAS_READLINE = True HAS_READLINE = True
except ImportError: except ImportError:
# Windows doesn't have readline by default # Windows doesn't have readline by default
HAS_READLINE = False pass
readline = None
class InteractiveSession: class InteractiveSession:

View File

@@ -157,6 +157,19 @@ exclude = ["localhost", "127.0.0.1", "example.com"]
exclude_path = [".git/", ".venv/", "__pycache__/", "third_party/"] exclude_path = [".git/", ".venv/", "__pycache__/", "third_party/"]
scheme = ["https", "http"] scheme = ["https", "http"]
[tool.ty]
# Type checking with ty (Astral's fast Python type checker)
# ty is 10-100x faster than mypy. See: https://docs.astral.sh/ty/
[tool.ty.environment]
python-version = "3.11"
extra-paths = ["apps", "packages/leann-core/src"]
[tool.ty.rules]
# Disable some noisy rules that have many false positives
possibly-missing-attribute = "ignore"
unresolved-import = "ignore" # Many optional dependencies
[tool.pytest.ini_options] [tool.pytest.ini_options]
testpaths = ["tests"] testpaths = ["tests"]
python_files = ["test_*.py"] python_files = ["test_*.py"]