From d83a463c266377b3a8cb9db0852034bbbb8e715f Mon Sep 17 00:00:00 2001 From: Andy Lee Date: Tue, 23 Dec 2025 09:04:20 +0000 Subject: [PATCH] Add ty type checker to CI and fix type errors - Add ty (Astral's fast Python type checker) to GitHub CI workflow - Fix type annotations across all RAG apps: - Update load_data return types from list[str] to list[dict[str, Any]] - Fix base_rag_example.py to properly handle dict format from create_text_chunks - Fix type errors in leann-core: - chunking_utils.py: Add explicit type annotations - cli.py: Fix return type annotations for PDF extraction functions - interactive_utils.py: Fix readline import type handling - Fix type errors in apps: - wechat_history.py: Fix return type annotations - document_rag.py, code_rag.py: Replace **kwargs with explicit arguments - Add ty configuration to pyproject.toml This resolves the bug introduced in PR #157 where create_text_chunks() changed to return list[dict] but callers were not updated. --- .github/workflows/build-reusable.yml | 23 ++++++++++++++++++- apps/base_rag_example.py | 10 ++++---- apps/browser_rag.py | 3 ++- apps/chatgpt_rag.py | 3 ++- apps/claude_rag.py | 3 ++- apps/code_rag.py | 18 ++++++--------- apps/document_rag.py | 20 +++++++--------- apps/email_rag.py | 3 ++- apps/history_data/wechat_history.py | 4 ++-- apps/image_rag.py | 3 ++- apps/imessage_rag.py | 3 ++- apps/slack_rag.py | 3 ++- apps/twitter_rag.py | 3 ++- apps/wechat_rag.py | 3 ++- .../leann-core/src/leann/chunking_utils.py | 4 ++-- packages/leann-core/src/leann/cli.py | 4 ++-- .../leann-core/src/leann/interactive_utils.py | 7 +++--- pyproject.toml | 13 +++++++++++ 18 files changed, 83 insertions(+), 47 deletions(-) diff --git a/.github/workflows/build-reusable.yml b/.github/workflows/build-reusable.yml index 6dfb43d..9f7dfee 100644 --- a/.github/workflows/build-reusable.yml +++ b/.github/workflows/build-reusable.yml @@ -28,9 +28,30 @@ jobs: run: | uv run --only-group lint pre-commit run --all-files --show-diff-on-failure + type-check: + name: Type Check with ty + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + ref: ${{ inputs.ref }} + submodules: recursive + + - name: Install uv and Python + uses: astral-sh/setup-uv@v6 + with: + python-version: '3.11' + + - name: Install ty + run: uv tool install ty + + - name: Run ty type checker + run: | + # Run ty on core packages and apps, excluding multimodal and tests + ty check --exclude "apps/multimodal/**" --exclude "tests/**" packages/leann-core/src apps build: - needs: lint + needs: [lint, type-check] name: Build ${{ matrix.os }} Python ${{ matrix.python }} strategy: matrix: diff --git a/apps/base_rag_example.py b/apps/base_rag_example.py index f695610..1517191 100644 --- a/apps/base_rag_example.py +++ b/apps/base_rag_example.py @@ -6,7 +6,7 @@ Provides common parameters and functionality for all RAG examples. import argparse from abc import ABC, abstractmethod from pathlib import Path -from typing import Any, Union +from typing import Any import dotenv from leann.api import LeannBuilder, LeannChat @@ -257,8 +257,8 @@ class BaseRAGExample(ABC): pass @abstractmethod - async def load_data(self, args) -> list[Union[str, dict[str, Any]]]: - """Load data from the source. Returns list of text chunks (strings or dicts with 'text' key).""" + async def load_data(self, args) -> list[dict[str, Any]]: + """Load data from the source. Returns list of text chunks as dicts with 'text' and 'metadata' keys.""" pass def get_llm_config(self, args) -> dict[str, Any]: @@ -282,8 +282,8 @@ class BaseRAGExample(ABC): return config - async def build_index(self, args, texts: list[Union[str, dict[str, Any]]]) -> str: - """Build LEANN index from texts (accepts strings or dicts with 'text' key).""" + async def build_index(self, args, texts: list[dict[str, Any]]) -> str: + """Build LEANN index from text chunks (dicts with 'text' and 'metadata' keys).""" index_path = str(Path(args.index_dir) / f"{self.default_index_name}.leann") print(f"\n[Building Index] Creating {self.name} index...") diff --git a/apps/browser_rag.py b/apps/browser_rag.py index 6d21964..00bb3f5 100644 --- a/apps/browser_rag.py +++ b/apps/browser_rag.py @@ -6,6 +6,7 @@ Supports Chrome browser history. import os import sys from pathlib import Path +from typing import Any # Add parent directory to path for imports sys.path.insert(0, str(Path(__file__).parent)) @@ -85,7 +86,7 @@ class BrowserRAG(BaseRAGExample): return profiles - async def load_data(self, args) -> list[str]: + async def load_data(self, args) -> list[dict[str, Any]]: """Load browser history and convert to text chunks.""" # Determine Chrome profiles if args.chrome_profile and not args.auto_find_profiles: diff --git a/apps/chatgpt_rag.py b/apps/chatgpt_rag.py index 3c92d04..c97d2cd 100644 --- a/apps/chatgpt_rag.py +++ b/apps/chatgpt_rag.py @@ -5,6 +5,7 @@ Supports ChatGPT export data from chat.html files. import sys from pathlib import Path +from typing import Any # Add parent directory to path for imports sys.path.insert(0, str(Path(__file__).parent)) @@ -80,7 +81,7 @@ class ChatGPTRAG(BaseRAGExample): return export_files - async def load_data(self, args) -> list[str]: + async def load_data(self, args) -> list[dict[str, Any]]: """Load ChatGPT export data and convert to text chunks.""" export_path = Path(args.export_path) diff --git a/apps/claude_rag.py b/apps/claude_rag.py index 43b499e..2cc80dd 100644 --- a/apps/claude_rag.py +++ b/apps/claude_rag.py @@ -5,6 +5,7 @@ Supports Claude export data from JSON files. import sys from pathlib import Path +from typing import Any # Add parent directory to path for imports sys.path.insert(0, str(Path(__file__).parent)) @@ -80,7 +81,7 @@ class ClaudeRAG(BaseRAGExample): return export_files - async def load_data(self, args) -> list[str]: + async def load_data(self, args) -> list[dict[str, Any]]: """Load Claude export data and convert to text chunks.""" export_path = Path(args.export_path) diff --git a/apps/code_rag.py b/apps/code_rag.py index 7518bb9..452e0a6 100644 --- a/apps/code_rag.py +++ b/apps/code_rag.py @@ -6,6 +6,7 @@ optimized chunking parameters. import sys from pathlib import Path +from typing import Any # Add parent directory to path for imports sys.path.insert(0, str(Path(__file__).parent)) @@ -77,7 +78,7 @@ class CodeRAG(BaseRAGExample): help="Try to preserve import statements in chunks (default: True)", ) - async def load_data(self, args) -> list[str]: + async def load_data(self, args) -> list[dict[str, Any]]: """Load code files and convert to AST-aware chunks.""" print(f"🔍 Scanning code repository: {args.repo_dir}") print(f"📁 Including extensions: {args.include_extensions}") @@ -88,14 +89,6 @@ class CodeRAG(BaseRAGExample): if not repo_path.exists(): raise ValueError(f"Repository directory not found: {args.repo_dir}") - # Load code files with filtering - reader_kwargs = { - "recursive": True, - "encoding": "utf-8", - "required_exts": args.include_extensions, - "exclude_hidden": True, - } - # Create exclusion filter def file_filter(file_path: str) -> bool: """Filter out unwanted files and directories.""" @@ -120,8 +113,11 @@ class CodeRAG(BaseRAGExample): # Load documents with file filtering documents = SimpleDirectoryReader( args.repo_dir, - file_extractor=None, # Use default extractors - **reader_kwargs, + file_extractor=None, + recursive=True, + encoding="utf-8", + required_exts=args.include_extensions, + exclude_hidden=True, ).load_data(show_progress=True) # Apply custom filtering diff --git a/apps/document_rag.py b/apps/document_rag.py index 280d0fb..f8e0c66 100644 --- a/apps/document_rag.py +++ b/apps/document_rag.py @@ -5,7 +5,7 @@ Supports PDF, TXT, MD, and other document formats. import sys from pathlib import Path -from typing import Any, Union +from typing import Any # Add parent directory to path for imports sys.path.insert(0, str(Path(__file__).parent)) @@ -52,7 +52,7 @@ class DocumentRAG(BaseRAGExample): help="Enable AST-aware chunking for code files in the data directory", ) - async def load_data(self, args) -> list[Union[str, dict[str, Any]]]: + async def load_data(self, args) -> list[dict[str, Any]]: """Load documents and convert to text chunks.""" print(f"Loading documents from: {args.data_dir}") if args.file_types: @@ -66,16 +66,12 @@ class DocumentRAG(BaseRAGExample): raise ValueError(f"Data directory not found: {args.data_dir}") # Load documents - reader_kwargs = { - "recursive": True, - "encoding": "utf-8", - } - if args.file_types: - reader_kwargs["required_exts"] = args.file_types - - documents = SimpleDirectoryReader(args.data_dir, **reader_kwargs).load_data( - show_progress=True - ) + documents = SimpleDirectoryReader( + args.data_dir, + recursive=True, + encoding="utf-8", + required_exts=args.file_types if args.file_types else None, + ).load_data(show_progress=True) if not documents: print(f"No documents found in {args.data_dir} with extensions {args.file_types}") diff --git a/apps/email_rag.py b/apps/email_rag.py index ec87bb1..0558678 100644 --- a/apps/email_rag.py +++ b/apps/email_rag.py @@ -5,6 +5,7 @@ Supports Apple Mail on macOS. import sys from pathlib import Path +from typing import Any # Add parent directory to path for imports sys.path.insert(0, str(Path(__file__).parent)) @@ -64,7 +65,7 @@ class EmailRAG(BaseRAGExample): return messages_dirs - async def load_data(self, args) -> list[str]: + async def load_data(self, args) -> list[dict[str, Any]]: """Load emails and convert to text chunks.""" # Determine mail directories if args.mail_path: diff --git a/apps/history_data/wechat_history.py b/apps/history_data/wechat_history.py index e985bd4..f65f77c 100644 --- a/apps/history_data/wechat_history.py +++ b/apps/history_data/wechat_history.py @@ -86,7 +86,7 @@ class WeChatHistoryReader(BaseReader): text=True, timeout=5, ) - return result.returncode == 0 and result.stdout.strip() + return result.returncode == 0 and bool(result.stdout.strip()) except Exception: return False @@ -314,7 +314,7 @@ class WeChatHistoryReader(BaseReader): return concatenated_groups - def _create_concatenated_content(self, message_group: dict, contact_name: str) -> str: + def _create_concatenated_content(self, message_group: dict, contact_name: str) -> tuple[str, str]: """ Create concatenated content from a group of messages. diff --git a/apps/image_rag.py b/apps/image_rag.py index 4c33b69..2a1d110 100644 --- a/apps/image_rag.py +++ b/apps/image_rag.py @@ -14,6 +14,7 @@ import argparse import pickle import tempfile from pathlib import Path +from typing import Any import numpy as np from PIL import Image @@ -65,7 +66,7 @@ class ImageRAG(BaseRAGExample): help="Batch size for CLIP embedding generation (default: 32)", ) - async def load_data(self, args) -> list[str]: + async def load_data(self, args) -> list[dict[str, Any]]: """Load images, generate CLIP embeddings, and return text descriptions.""" self._image_data = self._load_images_and_embeddings(args) return [entry["text"] for entry in self._image_data] diff --git a/apps/imessage_rag.py b/apps/imessage_rag.py index 50032ec..bd4ab68 100644 --- a/apps/imessage_rag.py +++ b/apps/imessage_rag.py @@ -6,6 +6,7 @@ This example demonstrates how to build a RAG system on your iMessage conversatio import asyncio from pathlib import Path +from typing import Any from leann.chunking_utils import create_text_chunks @@ -56,7 +57,7 @@ class IMessageRAG(BaseRAGExample): help="Overlap between text chunks (default: 200)", ) - async def load_data(self, args) -> list[str]: + async def load_data(self, args) -> list[dict[str, Any]]: """Load iMessage history and convert to text chunks.""" print("Loading iMessage conversation history...") diff --git a/apps/slack_rag.py b/apps/slack_rag.py index 1135a59..cf29aa6 100644 --- a/apps/slack_rag.py +++ b/apps/slack_rag.py @@ -11,6 +11,7 @@ Usage: import argparse import asyncio +from typing import Any from apps.base_rag_example import BaseRAGExample from apps.slack_data.slack_mcp_reader import SlackMCPReader @@ -139,7 +140,7 @@ class SlackMCPRAG(BaseRAGExample): print("4. Try running the MCP server command directly to test it") return False - async def load_data(self, args) -> list[str]: + async def load_data(self, args) -> list[dict[str, Any]]: """Load Slack messages via MCP server.""" print(f"Connecting to Slack MCP server: {args.mcp_server}") diff --git a/apps/twitter_rag.py b/apps/twitter_rag.py index a7fd3a4..15abf24 100644 --- a/apps/twitter_rag.py +++ b/apps/twitter_rag.py @@ -11,6 +11,7 @@ Usage: import argparse import asyncio +from typing import Any from apps.base_rag_example import BaseRAGExample from apps.twitter_data.twitter_mcp_reader import TwitterMCPReader @@ -116,7 +117,7 @@ class TwitterMCPRAG(BaseRAGExample): print("5. Try running the MCP server command directly to test it") return False - async def load_data(self, args) -> list[str]: + async def load_data(self, args) -> list[dict[str, Any]]: """Load Twitter bookmarks via MCP server.""" print(f"Connecting to Twitter MCP server: {args.mcp_server}") diff --git a/apps/wechat_rag.py b/apps/wechat_rag.py index 7355c6f..1e5dd31 100644 --- a/apps/wechat_rag.py +++ b/apps/wechat_rag.py @@ -6,6 +6,7 @@ Supports WeChat chat history export and search. import subprocess import sys from pathlib import Path +from typing import Any # Add parent directory to path for imports sys.path.insert(0, str(Path(__file__).parent)) @@ -91,7 +92,7 @@ class WeChatRAG(BaseRAGExample): print(f"Export error: {e}") return False - async def load_data(self, args) -> list[str]: + async def load_data(self, args) -> list[dict[str, Any]]: """Load WeChat history and convert to text chunks.""" # Initialize WeChat reader with export capabilities reader = WeChatHistoryReader() diff --git a/packages/leann-core/src/leann/chunking_utils.py b/packages/leann-core/src/leann/chunking_utils.py index 34e0779..e7f0a39 100644 --- a/packages/leann-core/src/leann/chunking_utils.py +++ b/packages/leann-core/src/leann/chunking_utils.py @@ -239,8 +239,8 @@ def create_ast_chunks( chunks = chunk_builder.chunkify(code_content) for chunk in chunks: - chunk_text = None - astchunk_metadata = {} + chunk_text: str | None = None + astchunk_metadata: dict[str, Any] = {} if hasattr(chunk, "text"): chunk_text = chunk.text diff --git a/packages/leann-core/src/leann/cli.py b/packages/leann-core/src/leann/cli.py index 708892a..ce51637 100644 --- a/packages/leann-core/src/leann/cli.py +++ b/packages/leann-core/src/leann/cli.py @@ -19,7 +19,7 @@ from .settings import ( ) -def extract_pdf_text_with_pymupdf(file_path: str) -> str: +def extract_pdf_text_with_pymupdf(file_path: str) -> str | None: """Extract text from PDF using PyMuPDF for better quality.""" try: import fitz # PyMuPDF @@ -35,7 +35,7 @@ def extract_pdf_text_with_pymupdf(file_path: str) -> str: return None -def extract_pdf_text_with_pdfplumber(file_path: str) -> str: +def extract_pdf_text_with_pdfplumber(file_path: str) -> str | None: """Extract text from PDF using pdfplumber for better quality.""" try: import pdfplumber diff --git a/packages/leann-core/src/leann/interactive_utils.py b/packages/leann-core/src/leann/interactive_utils.py index 56f7731..ac803d2 100644 --- a/packages/leann-core/src/leann/interactive_utils.py +++ b/packages/leann-core/src/leann/interactive_utils.py @@ -11,14 +11,15 @@ from pathlib import Path from typing import Callable, Optional # Try to import readline with fallback for Windows +HAS_READLINE = False +readline = None # type: ignore[assignment] try: - import readline + import readline # type: ignore[no-redef] HAS_READLINE = True except ImportError: # Windows doesn't have readline by default - HAS_READLINE = False - readline = None + pass class InteractiveSession: diff --git a/pyproject.toml b/pyproject.toml index 91d4322..3f31ed1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -157,6 +157,19 @@ exclude = ["localhost", "127.0.0.1", "example.com"] exclude_path = [".git/", ".venv/", "__pycache__/", "third_party/"] scheme = ["https", "http"] +[tool.ty] +# Type checking with ty (Astral's fast Python type checker) +# ty is 10-100x faster than mypy. See: https://docs.astral.sh/ty/ + +[tool.ty.environment] +python-version = "3.11" +extra-paths = ["apps", "packages/leann-core/src"] + +[tool.ty.rules] +# Disable some noisy rules that have many false positives +possibly-missing-attribute = "ignore" +unresolved-import = "ignore" # Many optional dependencies + [tool.pytest.ini_options] testpaths = ["tests"] python_files = ["test_*.py"]