Add ty type checker to CI and fix type errors

- Add ty (Astral's fast Python type checker) to GitHub CI workflow
- Fix type annotations across all RAG apps:
  - Update load_data return types from list[str] to list[dict[str, Any]]
  - Fix base_rag_example.py to properly handle dict format from create_text_chunks
- Fix type errors in leann-core:
  - chunking_utils.py: Add explicit type annotations
  - cli.py: Fix return type annotations for PDF extraction functions
  - interactive_utils.py: Fix readline import type handling
- Fix type errors in apps:
  - wechat_history.py: Fix return type annotations
  - document_rag.py, code_rag.py: Replace **kwargs with explicit arguments
- Add ty configuration to pyproject.toml

This resolves the bug introduced in PR #157 where create_text_chunks() changed to return list[dict] but callers were not updated.
2025-12-23 09:04:20 +00:00
parent 8a2ea37871
commit d83a463c26
18 changed files with 83 additions and 47 deletions
--- a/apps/base_rag_example.py
+++ b/apps/base_rag_example.py
@@ -6,7 +6,7 @@ Provides common parameters and functionality for all RAG examples.
 import argparse
 from abc import ABC, abstractmethod
 from pathlib import Path
-from typing import Any, Union
+from typing import Any

 import dotenv
 from leann.api import LeannBuilder, LeannChat
@@ -257,8 +257,8 @@ class BaseRAGExample(ABC):
        pass

    @abstractmethod
-    async def load_data(self, args) -> list[Union[str, dict[str, Any]]]:
-        """Load data from the source. Returns list of text chunks (strings or dicts with 'text' key)."""
+    async def load_data(self, args) -> list[dict[str, Any]]:
+        """Load data from the source. Returns list of text chunks as dicts with 'text' and 'metadata' keys."""
        pass

    def get_llm_config(self, args) -> dict[str, Any]:
@@ -282,8 +282,8 @@ class BaseRAGExample(ABC):

        return config

-    async def build_index(self, args, texts: list[Union[str, dict[str, Any]]]) -> str:
-        """Build LEANN index from texts (accepts strings or dicts with 'text' key)."""
+    async def build_index(self, args, texts: list[dict[str, Any]]) -> str:
+        """Build LEANN index from text chunks (dicts with 'text' and 'metadata' keys)."""
        index_path = str(Path(args.index_dir) / f"{self.default_index_name}.leann")

        print(f"\n[Building Index] Creating {self.name} index...")
--- a/apps/browser_rag.py
+++ b/apps/browser_rag.py
@@ -6,6 +6,7 @@ Supports Chrome browser history.
 import os
 import sys
 from pathlib import Path
+from typing import Any

 # Add parent directory to path for imports
 sys.path.insert(0, str(Path(__file__).parent))
@@ -85,7 +86,7 @@ class BrowserRAG(BaseRAGExample):

        return profiles

-    async def load_data(self, args) -> list[str]:
+    async def load_data(self, args) -> list[dict[str, Any]]:
        """Load browser history and convert to text chunks."""
        # Determine Chrome profiles
        if args.chrome_profile and not args.auto_find_profiles:
--- a/apps/chatgpt_rag.py
+++ b/apps/chatgpt_rag.py
@@ -5,6 +5,7 @@ Supports ChatGPT export data from chat.html files.

 import sys
 from pathlib import Path
+from typing import Any

 # Add parent directory to path for imports
 sys.path.insert(0, str(Path(__file__).parent))
@@ -80,7 +81,7 @@ class ChatGPTRAG(BaseRAGExample):

        return export_files

-    async def load_data(self, args) -> list[str]:
+    async def load_data(self, args) -> list[dict[str, Any]]:
        """Load ChatGPT export data and convert to text chunks."""
        export_path = Path(args.export_path)

--- a/apps/claude_rag.py
+++ b/apps/claude_rag.py
@@ -5,6 +5,7 @@ Supports Claude export data from JSON files.

 import sys
 from pathlib import Path
+from typing import Any

 # Add parent directory to path for imports
 sys.path.insert(0, str(Path(__file__).parent))
@@ -80,7 +81,7 @@ class ClaudeRAG(BaseRAGExample):

        return export_files

-    async def load_data(self, args) -> list[str]:
+    async def load_data(self, args) -> list[dict[str, Any]]:
        """Load Claude export data and convert to text chunks."""
        export_path = Path(args.export_path)

--- a/apps/code_rag.py
+++ b/apps/code_rag.py
@@ -6,6 +6,7 @@ optimized chunking parameters.

 import sys
 from pathlib import Path
+from typing import Any

 # Add parent directory to path for imports
 sys.path.insert(0, str(Path(__file__).parent))
@@ -77,7 +78,7 @@ class CodeRAG(BaseRAGExample):
            help="Try to preserve import statements in chunks (default: True)",
        )

-    async def load_data(self, args) -> list[str]:
+    async def load_data(self, args) -> list[dict[str, Any]]:
        """Load code files and convert to AST-aware chunks."""
        print(f"🔍 Scanning code repository: {args.repo_dir}")
        print(f"📁 Including extensions: {args.include_extensions}")
@@ -88,14 +89,6 @@ class CodeRAG(BaseRAGExample):
        if not repo_path.exists():
            raise ValueError(f"Repository directory not found: {args.repo_dir}")

-        # Load code files with filtering
-        reader_kwargs = {
-            "recursive": True,
-            "encoding": "utf-8",
-            "required_exts": args.include_extensions,
-            "exclude_hidden": True,
-        }
-
        # Create exclusion filter
        def file_filter(file_path: str) -> bool:
            """Filter out unwanted files and directories."""
@@ -120,8 +113,11 @@ class CodeRAG(BaseRAGExample):
            # Load documents with file filtering
            documents = SimpleDirectoryReader(
                args.repo_dir,
-                file_extractor=None,  # Use default extractors
-                **reader_kwargs,
+                file_extractor=None,
+                recursive=True,
+                encoding="utf-8",
+                required_exts=args.include_extensions,
+                exclude_hidden=True,
            ).load_data(show_progress=True)

            # Apply custom filtering
--- a/apps/document_rag.py
+++ b/apps/document_rag.py
@@ -5,7 +5,7 @@ Supports PDF, TXT, MD, and other document formats.

 import sys
 from pathlib import Path
-from typing import Any, Union
+from typing import Any

 # Add parent directory to path for imports
 sys.path.insert(0, str(Path(__file__).parent))
@@ -52,7 +52,7 @@ class DocumentRAG(BaseRAGExample):
            help="Enable AST-aware chunking for code files in the data directory",
        )

-    async def load_data(self, args) -> list[Union[str, dict[str, Any]]]:
+    async def load_data(self, args) -> list[dict[str, Any]]:
        """Load documents and convert to text chunks."""
        print(f"Loading documents from: {args.data_dir}")
        if args.file_types:
@@ -66,16 +66,12 @@ class DocumentRAG(BaseRAGExample):
            raise ValueError(f"Data directory not found: {args.data_dir}")

        # Load documents
-        reader_kwargs = {
-            "recursive": True,
-            "encoding": "utf-8",
-        }
-        if args.file_types:
-            reader_kwargs["required_exts"] = args.file_types
-
-        documents = SimpleDirectoryReader(args.data_dir, **reader_kwargs).load_data(
-            show_progress=True
-        )
+        documents = SimpleDirectoryReader(
+            args.data_dir,
+            recursive=True,
+            encoding="utf-8",
+            required_exts=args.file_types if args.file_types else None,
+        ).load_data(show_progress=True)

        if not documents:
            print(f"No documents found in {args.data_dir} with extensions {args.file_types}")
--- a/apps/email_rag.py
+++ b/apps/email_rag.py
@@ -5,6 +5,7 @@ Supports Apple Mail on macOS.

 import sys
 from pathlib import Path
+from typing import Any

 # Add parent directory to path for imports
 sys.path.insert(0, str(Path(__file__).parent))
@@ -64,7 +65,7 @@ class EmailRAG(BaseRAGExample):

        return messages_dirs

-    async def load_data(self, args) -> list[str]:
+    async def load_data(self, args) -> list[dict[str, Any]]:
        """Load emails and convert to text chunks."""
        # Determine mail directories
        if args.mail_path:
--- a/apps/history_data/wechat_history.py
+++ b/apps/history_data/wechat_history.py
@@ -86,7 +86,7 @@ class WeChatHistoryReader(BaseReader):
                text=True,
                timeout=5,
            )
-            return result.returncode == 0 and result.stdout.strip()
+            return result.returncode == 0 and bool(result.stdout.strip())
        except Exception:
            return False

@@ -314,7 +314,7 @@ class WeChatHistoryReader(BaseReader):

        return concatenated_groups

-    def _create_concatenated_content(self, message_group: dict, contact_name: str) -> str:
+    def _create_concatenated_content(self, message_group: dict, contact_name: str) -> tuple[str, str]:
        """
        Create concatenated content from a group of messages.

--- a/apps/image_rag.py
+++ b/apps/image_rag.py
@@ -14,6 +14,7 @@ import argparse
 import pickle
 import tempfile
 from pathlib import Path
+from typing import Any

 import numpy as np
 from PIL import Image
@@ -65,7 +66,7 @@ class ImageRAG(BaseRAGExample):
            help="Batch size for CLIP embedding generation (default: 32)",
        )

-    async def load_data(self, args) -> list[str]:
+    async def load_data(self, args) -> list[dict[str, Any]]:
        """Load images, generate CLIP embeddings, and return text descriptions."""
        self._image_data = self._load_images_and_embeddings(args)
        return [entry["text"] for entry in self._image_data]
--- a/apps/imessage_rag.py
+++ b/apps/imessage_rag.py
@@ -6,6 +6,7 @@ This example demonstrates how to build a RAG system on your iMessage conversatio

 import asyncio
 from pathlib import Path
+from typing import Any

 from leann.chunking_utils import create_text_chunks

@@ -56,7 +57,7 @@ class IMessageRAG(BaseRAGExample):
            help="Overlap between text chunks (default: 200)",
        )

-    async def load_data(self, args) -> list[str]:
+    async def load_data(self, args) -> list[dict[str, Any]]:
        """Load iMessage history and convert to text chunks."""
        print("Loading iMessage conversation history...")

--- a/apps/slack_rag.py
+++ b/apps/slack_rag.py
@@ -11,6 +11,7 @@ Usage:

 import argparse
 import asyncio
+from typing import Any

 from apps.base_rag_example import BaseRAGExample
 from apps.slack_data.slack_mcp_reader import SlackMCPReader
@@ -139,7 +140,7 @@ class SlackMCPRAG(BaseRAGExample):
            print("4. Try running the MCP server command directly to test it")
            return False

-    async def load_data(self, args) -> list[str]:
+    async def load_data(self, args) -> list[dict[str, Any]]:
        """Load Slack messages via MCP server."""
        print(f"Connecting to Slack MCP server: {args.mcp_server}")

--- a/apps/twitter_rag.py
+++ b/apps/twitter_rag.py
@@ -11,6 +11,7 @@ Usage:

 import argparse
 import asyncio
+from typing import Any

 from apps.base_rag_example import BaseRAGExample
 from apps.twitter_data.twitter_mcp_reader import TwitterMCPReader
@@ -116,7 +117,7 @@ class TwitterMCPRAG(BaseRAGExample):
            print("5. Try running the MCP server command directly to test it")
            return False

-    async def load_data(self, args) -> list[str]:
+    async def load_data(self, args) -> list[dict[str, Any]]:
        """Load Twitter bookmarks via MCP server."""
        print(f"Connecting to Twitter MCP server: {args.mcp_server}")

--- a/apps/wechat_rag.py
+++ b/apps/wechat_rag.py
@@ -6,6 +6,7 @@ Supports WeChat chat history export and search.
 import subprocess
 import sys
 from pathlib import Path
+from typing import Any

 # Add parent directory to path for imports
 sys.path.insert(0, str(Path(__file__).parent))
@@ -91,7 +92,7 @@ class WeChatRAG(BaseRAGExample):
            print(f"Export error: {e}")
            return False

-    async def load_data(self, args) -> list[str]:
+    async def load_data(self, args) -> list[dict[str, Any]]:
        """Load WeChat history and convert to text chunks."""
        # Initialize WeChat reader with export capabilities
        reader = WeChatHistoryReader()