Files
Andy Lee 198044d033 Add ty type checker to CI and fix type errors (fixes bug from PR #157) (#192)
* Add ty type checker to CI and fix type errors

- Add ty (Astral's fast Python type checker) to GitHub CI workflow
- Fix type annotations across all RAG apps:
  - Update load_data return types from list[str] to list[dict[str, Any]]
  - Fix base_rag_example.py to properly handle dict format from create_text_chunks
- Fix type errors in leann-core:
  - chunking_utils.py: Add explicit type annotations
  - cli.py: Fix return type annotations for PDF extraction functions
  - interactive_utils.py: Fix readline import type handling
- Fix type errors in apps:
  - wechat_history.py: Fix return type annotations
  - document_rag.py, code_rag.py: Replace **kwargs with explicit arguments
- Add ty configuration to pyproject.toml

This resolves the bug introduced in PR #157 where create_text_chunks()
changed to return list[dict] but callers were not updated.

* Fix remaining ty type errors

- Fix slack_mcp_reader.py channel parameter can be None
- Fix embedding_compute.py ContextProp type issue
- Fix searcher_base.py method override signatures
- Fix chunking_utils.py chunk_text assignment
- Fix slack_rag.py and twitter_rag.py return types
- Fix email.py and image_rag.py method overrides

* Fix multimodal benchmark scripts type errors

- Fix undefined LeannRetriever -> LeannMultiVector
- Add proper type casts for HuggingFace Dataset iteration
- Cast task config values to correct types
- Add type annotations for dataset row dicts

* Enable ty check for multimodal scripts in CI

All type errors in multimodal scripts have been fixed, so we can now
include them in the CI type checking.

* Fix all test type errors and enable ty check on tests

- Fix test_basic.py: search() takes str not list
- Fix test_cli_prompt_template.py: add type: ignore for Mock assignments
- Fix test_prompt_template_persistence.py: match BaseSearcher.search signature
- Fix test_prompt_template_e2e.py: add type narrowing asserts after skip
- Fix test_readme_examples.py: use explicit kwargs instead of **model_args
- Fix metadata_filter.py: allow Optional[MetadataFilters]
- Update CI to run ty check on tests

* Format code with ruff

* Format searcher_base.py
2025-12-24 23:58:06 -08:00

98 lines
2.9 KiB
Python

"""
Basic functionality tests for CI pipeline using pytest.
"""
import os
import tempfile
from pathlib import Path
import pytest
def test_imports():
"""Test that all packages can be imported."""
# Test C++ extensions
@pytest.mark.skipif(
os.environ.get("CI") == "true", reason="Skip model tests in CI to avoid MPS memory issues"
)
@pytest.mark.parametrize("backend_name", ["hnsw", "diskann"])
def test_backend_basic(backend_name):
"""Test basic functionality for each backend."""
from leann.api import LeannBuilder, LeannSearcher, SearchResult
# Create temporary directory for index
with tempfile.TemporaryDirectory() as temp_dir:
index_path = str(Path(temp_dir) / f"test.{backend_name}")
# Test with small data
texts = [f"This is document {i} about topic {i % 5}" for i in range(100)]
# Configure builder based on backend
if backend_name == "hnsw":
builder = LeannBuilder(
backend_name="hnsw",
embedding_model="facebook/contriever",
embedding_mode="sentence-transformers",
M=16,
efConstruction=200,
)
else: # diskann
builder = LeannBuilder(
backend_name="diskann",
embedding_model="facebook/contriever",
embedding_mode="sentence-transformers",
num_neighbors=32,
search_list_size=50,
)
# Add texts
for text in texts:
builder.add_text(text)
# Build index
builder.build_index(index_path)
# Test search
searcher = LeannSearcher(index_path)
results = searcher.search("document about topic 2", top_k=5)
# Verify results
assert len(results) > 0
assert isinstance(results[0], SearchResult)
assert "topic 2" in results[0].text or "document" in results[0].text
# Ensure cleanup to avoid hanging background servers
searcher.cleanup()
@pytest.mark.skipif(
os.environ.get("CI") == "true", reason="Skip model tests in CI to avoid MPS memory issues"
)
def test_large_index():
"""Test with larger dataset."""
from leann.api import LeannBuilder, LeannSearcher
with tempfile.TemporaryDirectory() as temp_dir:
index_path = str(Path(temp_dir) / "test_large.hnsw")
texts = [f"Document {i}: {' '.join([f'word{j}' for j in range(50)])}" for i in range(1000)]
builder = LeannBuilder(
backend_name="hnsw",
embedding_model="facebook/contriever",
embedding_mode="sentence-transformers",
)
for text in texts:
builder.add_text(text)
builder.build_index(index_path)
searcher = LeannSearcher(index_path)
results = searcher.search("word10 word20", top_k=10)
assert len(results) == 10
# Cleanup
searcher.cleanup()