From d83a463c266377b3a8cb9db0852034bbbb8e715f Mon Sep 17 00:00:00 2001
From: Andy Lee <andylizf@outlook.com>
Date: Tue, 23 Dec 2025 09:04:20 +0000
Subject: [PATCH] Add ty type checker to CI and fix type errors

- Add ty (Astral's fast Python type checker) to GitHub CI workflow
- Fix type annotations across all RAG apps:
  - Update load_data return types from list[str] (or
    list[Union[str, dict[str, Any]]]) to list[dict[str, Any]]
  - Fix base_rag_example.py to properly handle dict format from create_text_chunks
- Fix type errors in leann-core:
  - chunking_utils.py: Add explicit type annotations
  - cli.py: Fix return type annotations for PDF extraction functions
  - interactive_utils.py: Fix readline import type handling
- Fix type errors in apps:
  - wechat_history.py: Coerce a str-or-bool return value to bool and
    correct the _create_concatenated_content return annotation to
    tuple[str, str]
  - document_rag.py, code_rag.py: Replace **reader_kwargs unpacking with
    explicit keyword arguments to SimpleDirectoryReader
- Add ty configuration to pyproject.toml

This resolves the bug introduced in PR #157, where create_text_chunks()
was changed to return list[dict] but its callers were not updated.
---
 .github/workflows/build-reusable.yml          | 23 ++++++++++++++++++-
 apps/base_rag_example.py                      | 10 ++++----
 apps/browser_rag.py                           |  3 ++-
 apps/chatgpt_rag.py                           |  3 ++-
 apps/claude_rag.py                            |  3 ++-
 apps/code_rag.py                              | 18 ++++++---------
 apps/document_rag.py                          | 20 +++++++---------
 apps/email_rag.py                             |  3 ++-
 apps/history_data/wechat_history.py           |  4 ++--
 apps/image_rag.py                             |  3 ++-
 apps/imessage_rag.py                          |  3 ++-
 apps/slack_rag.py                             |  3 ++-
 apps/twitter_rag.py                           |  3 ++-
 apps/wechat_rag.py                            |  3 ++-
 .../leann-core/src/leann/chunking_utils.py    |  4 ++--
 packages/leann-core/src/leann/cli.py          |  4 ++--
 .../leann-core/src/leann/interactive_utils.py |  7 +++---
 pyproject.toml                                | 13 +++++++++++
 18 files changed, 83 insertions(+), 47 deletions(-)

diff --git a/.github/workflows/build-reusable.yml b/.github/workflows/build-reusable.yml
index 6dfb43d..9f7dfee 100644
--- a/.github/workflows/build-reusable.yml
+++ b/.github/workflows/build-reusable.yml
@@ -28,9 +28,30 @@ jobs:
         run: |
           uv run --only-group lint pre-commit run --all-files --show-diff-on-failure
 
+  type-check:
+    name: Type Check with ty
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.ref }}
+          submodules: recursive
+
+      - name: Install uv and Python
+        uses: astral-sh/setup-uv@v6
+        with:
+          python-version: '3.11'
+
+      - name: Install ty
+        run: uv tool install ty
+
+      - name: Run ty type checker
+        run: |
+          # Run ty on core packages and apps, excluding multimodal and tests
+          ty check --exclude "apps/multimodal/**" --exclude "tests/**" packages/leann-core/src apps
 
   build:
-    needs: lint
+    needs: [lint, type-check]
     name: Build ${{ matrix.os }} Python ${{ matrix.python }}
     strategy:
       matrix:
diff --git a/apps/base_rag_example.py b/apps/base_rag_example.py
index f695610..1517191 100644
--- a/apps/base_rag_example.py
+++ b/apps/base_rag_example.py
@@ -6,7 +6,7 @@ Provides common parameters and functionality for all RAG examples.
 import argparse
 from abc import ABC, abstractmethod
 from pathlib import Path
-from typing import Any, Union
+from typing import Any
 
 import dotenv
 from leann.api import LeannBuilder, LeannChat
@@ -257,8 +257,8 @@ class BaseRAGExample(ABC):
         pass
 
     @abstractmethod
-    async def load_data(self, args) -> list[Union[str, dict[str, Any]]]:
-        """Load data from the source. Returns list of text chunks (strings or dicts with 'text' key)."""
+    async def load_data(self, args) -> list[dict[str, Any]]:
+        """Load data from the source. Returns list of text chunks as dicts with 'text' and 'metadata' keys."""
         pass
 
     def get_llm_config(self, args) -> dict[str, Any]:
@@ -282,8 +282,8 @@ class BaseRAGExample(ABC):
 
         return config
 
-    async def build_index(self, args, texts: list[Union[str, dict[str, Any]]]) -> str:
-        """Build LEANN index from texts (accepts strings or dicts with 'text' key)."""
+    async def build_index(self, args, texts: list[dict[str, Any]]) -> str:
+        """Build LEANN index from text chunks (dicts with 'text' and 'metadata' keys)."""
         index_path = str(Path(args.index_dir) / f"{self.default_index_name}.leann")
 
         print(f"\n[Building Index] Creating {self.name} index...")
diff --git a/apps/browser_rag.py b/apps/browser_rag.py
index 6d21964..00bb3f5 100644
--- a/apps/browser_rag.py
+++ b/apps/browser_rag.py
@@ -6,6 +6,7 @@ Supports Chrome browser history.
 import os
 import sys
 from pathlib import Path
+from typing import Any
 
 # Add parent directory to path for imports
 sys.path.insert(0, str(Path(__file__).parent))
@@ -85,7 +86,7 @@ class BrowserRAG(BaseRAGExample):
 
         return profiles
 
-    async def load_data(self, args) -> list[str]:
+    async def load_data(self, args) -> list[dict[str, Any]]:
         """Load browser history and convert to text chunks."""
         # Determine Chrome profiles
         if args.chrome_profile and not args.auto_find_profiles:
diff --git a/apps/chatgpt_rag.py b/apps/chatgpt_rag.py
index 3c92d04..c97d2cd 100644
--- a/apps/chatgpt_rag.py
+++ b/apps/chatgpt_rag.py
@@ -5,6 +5,7 @@ Supports ChatGPT export data from chat.html files.
 
 import sys
 from pathlib import Path
+from typing import Any
 
 # Add parent directory to path for imports
 sys.path.insert(0, str(Path(__file__).parent))
@@ -80,7 +81,7 @@ class ChatGPTRAG(BaseRAGExample):
 
         return export_files
 
-    async def load_data(self, args) -> list[str]:
+    async def load_data(self, args) -> list[dict[str, Any]]:
         """Load ChatGPT export data and convert to text chunks."""
         export_path = Path(args.export_path)
 
diff --git a/apps/claude_rag.py b/apps/claude_rag.py
index 43b499e..2cc80dd 100644
--- a/apps/claude_rag.py
+++ b/apps/claude_rag.py
@@ -5,6 +5,7 @@ Supports Claude export data from JSON files.
 
 import sys
 from pathlib import Path
+from typing import Any
 
 # Add parent directory to path for imports
 sys.path.insert(0, str(Path(__file__).parent))
@@ -80,7 +81,7 @@ class ClaudeRAG(BaseRAGExample):
 
         return export_files
 
-    async def load_data(self, args) -> list[str]:
+    async def load_data(self, args) -> list[dict[str, Any]]:
         """Load Claude export data and convert to text chunks."""
         export_path = Path(args.export_path)
 
diff --git a/apps/code_rag.py b/apps/code_rag.py
index 7518bb9..452e0a6 100644
--- a/apps/code_rag.py
+++ b/apps/code_rag.py
@@ -6,6 +6,7 @@ optimized chunking parameters.
 
 import sys
 from pathlib import Path
+from typing import Any
 
 # Add parent directory to path for imports
 sys.path.insert(0, str(Path(__file__).parent))
@@ -77,7 +78,7 @@ class CodeRAG(BaseRAGExample):
             help="Try to preserve import statements in chunks (default: True)",
         )
 
-    async def load_data(self, args) -> list[str]:
+    async def load_data(self, args) -> list[dict[str, Any]]:
         """Load code files and convert to AST-aware chunks."""
         print(f"🔍 Scanning code repository: {args.repo_dir}")
         print(f"📁 Including extensions: {args.include_extensions}")
@@ -88,14 +89,6 @@ class CodeRAG(BaseRAGExample):
         if not repo_path.exists():
             raise ValueError(f"Repository directory not found: {args.repo_dir}")
 
-        # Load code files with filtering
-        reader_kwargs = {
-            "recursive": True,
-            "encoding": "utf-8",
-            "required_exts": args.include_extensions,
-            "exclude_hidden": True,
-        }
-
         # Create exclusion filter
         def file_filter(file_path: str) -> bool:
             """Filter out unwanted files and directories."""
@@ -120,8 +113,11 @@ class CodeRAG(BaseRAGExample):
             # Load documents with file filtering
             documents = SimpleDirectoryReader(
                 args.repo_dir,
-                file_extractor=None,  # Use default extractors
-                **reader_kwargs,
+                file_extractor=None,
+                recursive=True,
+                encoding="utf-8",
+                required_exts=args.include_extensions,
+                exclude_hidden=True,
             ).load_data(show_progress=True)
 
             # Apply custom filtering
diff --git a/apps/document_rag.py b/apps/document_rag.py
index 280d0fb..f8e0c66 100644
--- a/apps/document_rag.py
+++ b/apps/document_rag.py
@@ -5,7 +5,7 @@ Supports PDF, TXT, MD, and other document formats.
 
 import sys
 from pathlib import Path
-from typing import Any, Union
+from typing import Any
 
 # Add parent directory to path for imports
 sys.path.insert(0, str(Path(__file__).parent))
@@ -52,7 +52,7 @@ class DocumentRAG(BaseRAGExample):
             help="Enable AST-aware chunking for code files in the data directory",
         )
 
-    async def load_data(self, args) -> list[Union[str, dict[str, Any]]]:
+    async def load_data(self, args) -> list[dict[str, Any]]:
         """Load documents and convert to text chunks."""
         print(f"Loading documents from: {args.data_dir}")
         if args.file_types:
@@ -66,16 +66,12 @@ class DocumentRAG(BaseRAGExample):
             raise ValueError(f"Data directory not found: {args.data_dir}")
 
         # Load documents
-        reader_kwargs = {
-            "recursive": True,
-            "encoding": "utf-8",
-        }
-        if args.file_types:
-            reader_kwargs["required_exts"] = args.file_types
-
-        documents = SimpleDirectoryReader(args.data_dir, **reader_kwargs).load_data(
-            show_progress=True
-        )
+        documents = SimpleDirectoryReader(
+            args.data_dir,
+            recursive=True,
+            encoding="utf-8",
+            required_exts=args.file_types if args.file_types else None,
+        ).load_data(show_progress=True)
 
         if not documents:
             print(f"No documents found in {args.data_dir} with extensions {args.file_types}")
diff --git a/apps/email_rag.py b/apps/email_rag.py
index ec87bb1..0558678 100644
--- a/apps/email_rag.py
+++ b/apps/email_rag.py
@@ -5,6 +5,7 @@ Supports Apple Mail on macOS.
 
 import sys
 from pathlib import Path
+from typing import Any
 
 # Add parent directory to path for imports
 sys.path.insert(0, str(Path(__file__).parent))
@@ -64,7 +65,7 @@ class EmailRAG(BaseRAGExample):
 
         return messages_dirs
 
-    async def load_data(self, args) -> list[str]:
+    async def load_data(self, args) -> list[dict[str, Any]]:
         """Load emails and convert to text chunks."""
         # Determine mail directories
         if args.mail_path:
diff --git a/apps/history_data/wechat_history.py b/apps/history_data/wechat_history.py
index e985bd4..f65f77c 100644
--- a/apps/history_data/wechat_history.py
+++ b/apps/history_data/wechat_history.py
@@ -86,7 +86,7 @@ class WeChatHistoryReader(BaseReader):
                 text=True,
                 timeout=5,
             )
-            return result.returncode == 0 and result.stdout.strip()
+            return result.returncode == 0 and bool(result.stdout.strip())
         except Exception:
             return False
 
@@ -314,7 +314,7 @@ class WeChatHistoryReader(BaseReader):
 
         return concatenated_groups
 
-    def _create_concatenated_content(self, message_group: dict, contact_name: str) -> str:
+    def _create_concatenated_content(self, message_group: dict, contact_name: str) -> tuple[str, str]:
         """
         Create concatenated content from a group of messages.
 
diff --git a/apps/image_rag.py b/apps/image_rag.py
index 4c33b69..2a1d110 100644
--- a/apps/image_rag.py
+++ b/apps/image_rag.py
@@ -14,6 +14,7 @@ import argparse
 import pickle
 import tempfile
 from pathlib import Path
+from typing import Any
 
 import numpy as np
 from PIL import Image
@@ -65,7 +66,7 @@ class ImageRAG(BaseRAGExample):
             help="Batch size for CLIP embedding generation (default: 32)",
         )
 
-    async def load_data(self, args) -> list[str]:
+    async def load_data(self, args) -> list[dict[str, Any]]:
         """Load images, generate CLIP embeddings, and return text descriptions."""
         self._image_data = self._load_images_and_embeddings(args)
         return [entry["text"] for entry in self._image_data]
diff --git a/apps/imessage_rag.py b/apps/imessage_rag.py
index 50032ec..bd4ab68 100644
--- a/apps/imessage_rag.py
+++ b/apps/imessage_rag.py
@@ -6,6 +6,7 @@ This example demonstrates how to build a RAG system on your iMessage conversatio
 
 import asyncio
 from pathlib import Path
+from typing import Any
 
 from leann.chunking_utils import create_text_chunks
 
@@ -56,7 +57,7 @@ class IMessageRAG(BaseRAGExample):
             help="Overlap between text chunks (default: 200)",
         )
 
-    async def load_data(self, args) -> list[str]:
+    async def load_data(self, args) -> list[dict[str, Any]]:
         """Load iMessage history and convert to text chunks."""
         print("Loading iMessage conversation history...")
 
diff --git a/apps/slack_rag.py b/apps/slack_rag.py
index 1135a59..cf29aa6 100644
--- a/apps/slack_rag.py
+++ b/apps/slack_rag.py
@@ -11,6 +11,7 @@ Usage:
 
 import argparse
 import asyncio
+from typing import Any
 
 from apps.base_rag_example import BaseRAGExample
 from apps.slack_data.slack_mcp_reader import SlackMCPReader
@@ -139,7 +140,7 @@ class SlackMCPRAG(BaseRAGExample):
             print("4. Try running the MCP server command directly to test it")
             return False
 
-    async def load_data(self, args) -> list[str]:
+    async def load_data(self, args) -> list[dict[str, Any]]:
         """Load Slack messages via MCP server."""
         print(f"Connecting to Slack MCP server: {args.mcp_server}")
 
diff --git a/apps/twitter_rag.py b/apps/twitter_rag.py
index a7fd3a4..15abf24 100644
--- a/apps/twitter_rag.py
+++ b/apps/twitter_rag.py
@@ -11,6 +11,7 @@ Usage:
 
 import argparse
 import asyncio
+from typing import Any
 
 from apps.base_rag_example import BaseRAGExample
 from apps.twitter_data.twitter_mcp_reader import TwitterMCPReader
@@ -116,7 +117,7 @@ class TwitterMCPRAG(BaseRAGExample):
             print("5. Try running the MCP server command directly to test it")
             return False
 
-    async def load_data(self, args) -> list[str]:
+    async def load_data(self, args) -> list[dict[str, Any]]:
         """Load Twitter bookmarks via MCP server."""
         print(f"Connecting to Twitter MCP server: {args.mcp_server}")
 
diff --git a/apps/wechat_rag.py b/apps/wechat_rag.py
index 7355c6f..1e5dd31 100644
--- a/apps/wechat_rag.py
+++ b/apps/wechat_rag.py
@@ -6,6 +6,7 @@ Supports WeChat chat history export and search.
 import subprocess
 import sys
 from pathlib import Path
+from typing import Any
 
 # Add parent directory to path for imports
 sys.path.insert(0, str(Path(__file__).parent))
@@ -91,7 +92,7 @@ class WeChatRAG(BaseRAGExample):
             print(f"Export error: {e}")
             return False
 
-    async def load_data(self, args) -> list[str]:
+    async def load_data(self, args) -> list[dict[str, Any]]:
         """Load WeChat history and convert to text chunks."""
         # Initialize WeChat reader with export capabilities
         reader = WeChatHistoryReader()
diff --git a/packages/leann-core/src/leann/chunking_utils.py b/packages/leann-core/src/leann/chunking_utils.py
index 34e0779..e7f0a39 100644
--- a/packages/leann-core/src/leann/chunking_utils.py
+++ b/packages/leann-core/src/leann/chunking_utils.py
@@ -239,8 +239,8 @@ def create_ast_chunks(
 
             chunks = chunk_builder.chunkify(code_content)
             for chunk in chunks:
-                chunk_text = None
-                astchunk_metadata = {}
+                chunk_text: str | None = None
+                astchunk_metadata: dict[str, Any] = {}
 
                 if hasattr(chunk, "text"):
                     chunk_text = chunk.text
diff --git a/packages/leann-core/src/leann/cli.py b/packages/leann-core/src/leann/cli.py
index 708892a..ce51637 100644
--- a/packages/leann-core/src/leann/cli.py
+++ b/packages/leann-core/src/leann/cli.py
@@ -19,7 +19,7 @@ from .settings import (
 )
 
 
-def extract_pdf_text_with_pymupdf(file_path: str) -> str:
+def extract_pdf_text_with_pymupdf(file_path: str) -> str | None:
     """Extract text from PDF using PyMuPDF for better quality."""
     try:
         import fitz  # PyMuPDF
@@ -35,7 +35,7 @@ def extract_pdf_text_with_pymupdf(file_path: str) -> str:
         return None
 
 
-def extract_pdf_text_with_pdfplumber(file_path: str) -> str:
+def extract_pdf_text_with_pdfplumber(file_path: str) -> str | None:
     """Extract text from PDF using pdfplumber for better quality."""
     try:
         import pdfplumber
diff --git a/packages/leann-core/src/leann/interactive_utils.py b/packages/leann-core/src/leann/interactive_utils.py
index 56f7731..ac803d2 100644
--- a/packages/leann-core/src/leann/interactive_utils.py
+++ b/packages/leann-core/src/leann/interactive_utils.py
@@ -11,14 +11,15 @@ from pathlib import Path
 from typing import Callable, Optional
 
 # Try to import readline with fallback for Windows
+HAS_READLINE = False
+readline = None  # type: ignore[assignment]
 try:
-    import readline
+    import readline  # type: ignore[no-redef]
 
     HAS_READLINE = True
 except ImportError:
     # Windows doesn't have readline by default
-    HAS_READLINE = False
-    readline = None
+    pass
 
 
 class InteractiveSession:
diff --git a/pyproject.toml b/pyproject.toml
index 91d4322..3f31ed1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -157,6 +157,19 @@ exclude = ["localhost", "127.0.0.1", "example.com"]
 exclude_path = [".git/", ".venv/", "__pycache__/", "third_party/"]
 scheme = ["https", "http"]
 
+[tool.ty]
+# Type checking with ty (Astral's fast Python type checker)
+# ty is 10-100x faster than mypy. See: https://docs.astral.sh/ty/
+
+[tool.ty.environment]
+python-version = "3.11"
+extra-paths = ["apps", "packages/leann-core/src"]
+
+[tool.ty.rules]
+# Disable some noisy rules that have many false positives
+possibly-missing-attribute = "ignore"
+unresolved-import = "ignore"  # Many optional dependencies
+
 [tool.pytest.ini_options]
 testpaths = ["tests"]
 python_files = ["test_*.py"]