Add ty type checker to CI and fix type errors (fixes bug from PR #157) (#192)

* Add ty type checker to CI and fix type errors

- Add ty (Astral's fast Python type checker) to GitHub CI workflow
- Fix type annotations across all RAG apps:
  - Update load_data return types from list[str] to list[dict[str, Any]]
  - Fix base_rag_example.py to properly handle dict format from create_text_chunks
- Fix type errors in leann-core:
  - chunking_utils.py: Add explicit type annotations
  - cli.py: Fix return type annotations for PDF extraction functions
  - interactive_utils.py: Fix readline import type handling
- Fix type errors in apps:
  - wechat_history.py: Fix return type annotations
  - document_rag.py, code_rag.py: Replace **kwargs with explicit arguments
- Add ty configuration to pyproject.toml

This resolves the bug introduced in PR #157, where create_text_chunks() was
changed to return list[dict] but its callers were not updated.
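
For reference, a minimal sketch of the caller-side change (the helper below is a
stand-in for the real create_text_chunks; the "text"/"metadata" keys match the
document dicts used elsewhere in the repo):

    from typing import Any


    def create_text_chunks(documents: list[str]) -> list[dict[str, Any]]:
        """Stand-in for the real helper; since PR #157 it returns dicts, not plain strings."""
        return [{"text": doc, "metadata": {}} for doc in documents]


    chunks = create_text_chunks(["example document"])
    for chunk in chunks:
        text = chunk["text"]  # callers now unpack the dict; previously `chunk` itself was the text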

* Fix remaining ty type errors

- Fix slack_mcp_reader.py: handle the channel parameter being None (sketch below)
- Fix embedding_compute.py ContextProp type issue
- Fix searcher_base.py method override signatures
- Fix chunking_utils.py chunk_text assignment
- Fix slack_rag.py and twitter_rag.py return types
- Fix email.py and image_rag.py method overrides
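
The slack_mcp_reader.py change follows the usual optional-parameter pattern; a
rough sketch with a hypothetical function name:

    from typing import Any, Optional


    # Hypothetical reader entry point; the real function in slack_mcp_reader.py differs,
    # but the fix boils down to an honest Optional annotation plus an explicit None check.
    def read_channel_messages(channel: Optional[str] = None) -> list[dict[str, Any]]:
        if channel is None:
            return []  # or raise, depending on what the reader actually does
        return [{"text": f"message from #{channel}", "metadata": {"channel": channel}}]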

* Fix multimodal benchmark scripts type errors

- Fix undefined LeannRetriever -> LeannMultiVector
- Add proper type casts for HuggingFace Dataset iteration (sketch below)
- Cast task config values to correct types
- Add type annotations for dataset row dicts
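
The Dataset iteration fix boils down to an explicit typing.cast on each row; a
small runnable sketch (the column names are illustrative only):

    from typing import Any, cast

    # `dataset` stands in for a datasets.Dataset; iterating one yields loosely
    # typed rows, which is what tripped the type checker in the benchmark scripts.
    dataset: Any = [{"question": "what is leann?", "answer": "a vector index"}]

    for raw_row in dataset:
        row = cast(dict[str, Any], raw_row)  # tell the checker what a row actually is
        question: str = row["question"]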

* Enable ty check for multimodal scripts in CI

All type errors in multimodal scripts have been fixed, so we can now
include them in the CI type checking.

* Fix all test type errors and enable ty check on tests

- Fix test_basic.py: search() takes str not list
- Fix test_cli_prompt_template.py: add type: ignore for Mock assignments
- Fix test_prompt_template_persistence.py: match BaseSearcher.search signature
- Fix test_prompt_template_e2e.py: add type narrowing asserts after skip
- Fix test_readme_examples.py: use explicit kwargs instead of **model_args
- Fix metadata_filter.py: allow Optional[MetadataFilters] (sketch after this list)
- Update CI to run ty check on tests
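
The metadata_filter.py change just widens the annotation; a sketch with a
hypothetical helper name and a stand-in class:

    from typing import Any, Optional


    class MetadataFilters:  # stand-in for the real class in metadata_filter.py
        ...


    # Hypothetical helper; the real signature differs, but the fix is widening the
    # annotation so that passing no filters still type-checks.
    def apply_filters(
        results: list[dict[str, Any]],
        filters: Optional[MetadataFilters] = None,  # previously annotated without Optional
    ) -> list[dict[str, Any]]:
        if filters is None:
            return results
        return results  # the actual filtering logic lives in metadata_filter.py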

* Format code with ruff

* Format searcher_base.py
Author: Andy Lee
Date: 2025-12-24 23:58:06 -08:00
Committed by: GitHub
Commit: 198044d033 (parent a2e5f5294b)
32 changed files with 261 additions and 144 deletions

File: test_basic.py

@@ -91,7 +91,7 @@ def test_large_index():
     builder.build_index(index_path)
     searcher = LeannSearcher(index_path)
-    results = searcher.search(["word10 word20"], top_k=10)
-    assert len(results[0]) == 10
+    results = searcher.search("word10 word20", top_k=10)
+    assert len(results) == 10

     # Cleanup
     searcher.cleanup()

File: test_cli_prompt_template.py

@@ -123,7 +123,7 @@ class TestPromptTemplateStoredInEmbeddingOptions:
         cli = LeannCLI()
         # Mock load_documents to return a document so builder is created
-        cli.load_documents = Mock(return_value=[{"text": "test content", "metadata": {}}])
+        cli.load_documents = Mock(return_value=[{"text": "test content", "metadata": {}}])  # type: ignore[assignment]
         parser = cli.create_parser()
@@ -175,7 +175,7 @@ class TestPromptTemplateStoredInEmbeddingOptions:
         cli = LeannCLI()
         # Mock load_documents to return a document so builder is created
-        cli.load_documents = Mock(return_value=[{"text": "test content", "metadata": {}}])
+        cli.load_documents = Mock(return_value=[{"text": "test content", "metadata": {}}])  # type: ignore[assignment]
         parser = cli.create_parser()
@@ -230,7 +230,7 @@ class TestPromptTemplateStoredInEmbeddingOptions:
         cli = LeannCLI()
         # Mock load_documents to return a document so builder is created
-        cli.load_documents = Mock(return_value=[{"text": "test content", "metadata": {}}])
+        cli.load_documents = Mock(return_value=[{"text": "test content", "metadata": {}}])  # type: ignore[assignment]
         parser = cli.create_parser()
@@ -307,7 +307,7 @@ class TestPromptTemplateStoredInEmbeddingOptions:
         cli = LeannCLI()
         # Mock load_documents to return a document so builder is created
-        cli.load_documents = Mock(return_value=[{"text": "test content", "metadata": {}}])
+        cli.load_documents = Mock(return_value=[{"text": "test content", "metadata": {}}])  # type: ignore[assignment]
         parser = cli.create_parser()
@@ -376,7 +376,7 @@ class TestPromptTemplateStoredInEmbeddingOptions:
         cli = LeannCLI()
         # Mock load_documents to return a document so builder is created
-        cli.load_documents = Mock(return_value=[{"text": "test content", "metadata": {}}])
+        cli.load_documents = Mock(return_value=[{"text": "test content", "metadata": {}}])  # type: ignore[assignment]
         parser = cli.create_parser()
@@ -432,7 +432,7 @@ class TestPromptTemplateFlowsToComputeEmbeddings:
         cli = LeannCLI()
         # Mock load_documents to return a simple document
-        cli.load_documents = Mock(return_value=[{"text": "test content", "metadata": {}}])
+        cli.load_documents = Mock(return_value=[{"text": "test content", "metadata": {}}])  # type: ignore[assignment]
         parser = cli.create_parser()

File: test_prompt_template_e2e.py

@@ -67,7 +67,7 @@ def check_lmstudio_available() -> bool:
         return False


-def get_lmstudio_first_model() -> str:
+def get_lmstudio_first_model() -> str | None:
     """Get the first available model from LM Studio."""
     try:
         response = requests.get("http://localhost:1234/v1/models", timeout=5.0)
@@ -91,6 +91,7 @@ class TestPromptTemplateOpenAI:
         model_name = get_lmstudio_first_model()
         if not model_name:
             pytest.skip("No models loaded in LM Studio")
+        assert model_name is not None  # Type narrowing for type checker

         texts = ["artificial intelligence", "machine learning"]
         prompt_template = "search_query: "
@@ -120,6 +121,7 @@ class TestPromptTemplateOpenAI:
         model_name = get_lmstudio_first_model()
         if not model_name:
             pytest.skip("No models loaded in LM Studio")
+        assert model_name is not None  # Type narrowing for type checker

         text = "machine learning"
         base_url = "http://localhost:1234/v1"
@@ -271,6 +273,7 @@ class TestLMStudioSDK:
         model_name = get_lmstudio_first_model()
         if not model_name:
             pytest.skip("No models loaded in LM Studio")
+        assert model_name is not None  # Type narrowing for type checker

         try:
             from leann.embedding_compute import _query_lmstudio_context_limit

File: test_prompt_template_persistence.py

@@ -581,7 +581,18 @@ class TestQueryTemplateApplicationInComputeEmbedding:
         # Create a concrete implementation for testing
         class TestSearcher(BaseSearcher):
-            def search(self, query_vectors, top_k, complexity, beam_width=1, **kwargs):
+            def search(
+                self,
+                query,
+                top_k,
+                complexity=64,
+                beam_width=1,
+                prune_ratio=0.0,
+                recompute_embeddings=False,
+                pruning_strategy="global",
+                zmq_port=None,
+                **kwargs,
+            ):
                 return {"labels": [], "distances": []}

         searcher = object.__new__(TestSearcher)
@@ -625,7 +636,18 @@ class TestQueryTemplateApplicationInComputeEmbedding:
         # Create a concrete implementation for testing
         class TestSearcher(BaseSearcher):
-            def search(self, query_vectors, top_k, complexity, beam_width=1, **kwargs):
+            def search(
+                self,
+                query,
+                top_k,
+                complexity=64,
+                beam_width=1,
+                prune_ratio=0.0,
+                recompute_embeddings=False,
+                pruning_strategy="global",
+                zmq_port=None,
+                **kwargs,
+            ):
                 return {"labels": [], "distances": []}

         searcher = object.__new__(TestSearcher)
@@ -671,7 +693,18 @@ class TestQueryTemplateApplicationInComputeEmbedding:
         from leann.searcher_base import BaseSearcher

         class TestSearcher(BaseSearcher):
-            def search(self, query_vectors, top_k, complexity, beam_width=1, **kwargs):
+            def search(
+                self,
+                query,
+                top_k,
+                complexity=64,
+                beam_width=1,
+                prune_ratio=0.0,
+                recompute_embeddings=False,
+                pruning_strategy="global",
+                zmq_port=None,
+                **kwargs,
+            ):
                 return {"labels": [], "distances": []}

         searcher = object.__new__(TestSearcher)
@@ -710,7 +743,18 @@ class TestQueryTemplateApplicationInComputeEmbedding:
         from leann.searcher_base import BaseSearcher

         class TestSearcher(BaseSearcher):
-            def search(self, query_vectors, top_k, complexity, beam_width=1, **kwargs):
+            def search(
+                self,
+                query,
+                top_k,
+                complexity=64,
+                beam_width=1,
+                prune_ratio=0.0,
+                recompute_embeddings=False,
+                pruning_strategy="global",
+                zmq_port=None,
+                **kwargs,
+            ):
                 return {"labels": [], "distances": []}

         searcher = object.__new__(TestSearcher)
@@ -774,7 +818,18 @@ class TestQueryTemplateApplicationInComputeEmbedding:
         from leann.searcher_base import BaseSearcher

         class TestSearcher(BaseSearcher):
-            def search(self, query_vectors, top_k, complexity, beam_width=1, **kwargs):
+            def search(
+                self,
+                query,
+                top_k,
+                complexity=64,
+                beam_width=1,
+                prune_ratio=0.0,
+                recompute_embeddings=False,
+                pruning_strategy="global",
+                zmq_port=None,
+                **kwargs,
+            ):
                 return {"labels": [], "distances": []}

         searcher = object.__new__(TestSearcher)

File: test_readme_examples.py

@@ -97,17 +97,17 @@ def test_backend_options():
     with tempfile.TemporaryDirectory() as temp_dir:
         # Use smaller model in CI to avoid memory issues
-        if os.environ.get("CI") == "true":
-            model_args = {
-                "embedding_model": "sentence-transformers/all-MiniLM-L6-v2",
-                "dimensions": 384,
-            }
-        else:
-            model_args = {}
+        is_ci = os.environ.get("CI") == "true"
+        embedding_model = (
+            "sentence-transformers/all-MiniLM-L6-v2" if is_ci else "facebook/contriever"
+        )
+        dimensions = 384 if is_ci else None

         # Test HNSW backend (as shown in README)
         hnsw_path = str(Path(temp_dir) / "test_hnsw.leann")
-        builder_hnsw = LeannBuilder(backend_name="hnsw", **model_args)
+        builder_hnsw = LeannBuilder(
+            backend_name="hnsw", embedding_model=embedding_model, dimensions=dimensions
+        )
         builder_hnsw.add_text("Test document for HNSW backend")
         builder_hnsw.build_index(hnsw_path)
         assert Path(hnsw_path).parent.exists()
@@ -115,7 +115,9 @@ def test_backend_options():
         # Test DiskANN backend (mentioned as available option)
         diskann_path = str(Path(temp_dir) / "test_diskann.leann")
-        builder_diskann = LeannBuilder(backend_name="diskann", **model_args)
+        builder_diskann = LeannBuilder(
+            backend_name="diskann", embedding_model=embedding_model, dimensions=dimensions
+        )
         builder_diskann.add_text("Test document for DiskANN backend")
         builder_diskann.build_index(diskann_path)
         assert Path(diskann_path).parent.exists()