Files
LEANN/tests/test_readme_examples.py
Andy Lee 198044d033 Add ty type checker to CI and fix type errors (fixes bug from PR #157) (#192)
* Add ty type checker to CI and fix type errors

- Add ty (Astral's fast Python type checker) to GitHub CI workflow
- Fix type annotations across all RAG apps:
  - Update load_data return types from list[str] to list[dict[str, Any]]
  - Fix base_rag_example.py to properly handle dict format from create_text_chunks
- Fix type errors in leann-core:
  - chunking_utils.py: Add explicit type annotations
  - cli.py: Fix return type annotations for PDF extraction functions
  - interactive_utils.py: Fix readline import type handling
- Fix type errors in apps:
  - wechat_history.py: Fix return type annotations
  - document_rag.py, code_rag.py: Replace **kwargs with explicit arguments
- Add ty configuration to pyproject.toml

This resolves the bug introduced in PR #157 where create_text_chunks()
changed to return list[dict] but callers were not updated.

* Fix remaining ty type errors

- Fix slack_mcp_reader.py channel parameter can be None
- Fix embedding_compute.py ContextProp type issue
- Fix searcher_base.py method override signatures
- Fix chunking_utils.py chunk_text assignment
- Fix slack_rag.py and twitter_rag.py return types
- Fix email.py and image_rag.py method overrides

* Fix multimodal benchmark scripts type errors

- Fix undefined LeannRetriever -> LeannMultiVector
- Add proper type casts for HuggingFace Dataset iteration
- Cast task config values to correct types
- Add type annotations for dataset row dicts

* Enable ty check for multimodal scripts in CI

All type errors in multimodal scripts have been fixed, so we can now
include them in the CI type checking.

* Fix all test type errors and enable ty check on tests

- Fix test_basic.py: search() takes str not list
- Fix test_cli_prompt_template.py: add type: ignore for Mock assignments
- Fix test_prompt_template_persistence.py: match BaseSearcher.search signature
- Fix test_prompt_template_e2e.py: add type narrowing asserts after skip
- Fix test_readme_examples.py: use explicit kwargs instead of **model_args
- Fix metadata_filter.py: allow Optional[MetadataFilters]
- Update CI to run ty check on tests

* Format code with ruff

* Format searcher_base.py
2025-12-24 23:58:06 -08:00

185 lines
7.5 KiB
Python

"""
Test examples from README.md to ensure documentation is accurate.
"""
import os
import platform
import tempfile
from pathlib import Path
import pytest
@pytest.mark.parametrize("backend_name", ["hnsw", "diskann"])
def test_readme_basic_example(backend_name):
"""Test the basic example from README.md with both backends."""
# Skip on macOS CI due to MPS environment issues with all-MiniLM-L6-v2
if os.environ.get("CI") == "true" and platform.system() == "Darwin":
pytest.skip("Skipping on macOS CI due to MPS environment issues with all-MiniLM-L6-v2")
# Skip DiskANN on CI (Linux runners) due to C++ extension memory/hardware constraints
if os.environ.get("CI") == "true" and backend_name == "diskann":
pytest.skip("Skip DiskANN tests in CI due to resource constraints and instability")
# This is the exact code from README (with smaller model for CI)
from leann import LeannBuilder, LeannChat, LeannSearcher
from leann.api import SearchResult
with tempfile.TemporaryDirectory() as temp_dir:
INDEX_PATH = str(Path(temp_dir) / f"demo_{backend_name}.leann")
# Build an index
# In CI, use a smaller model to avoid memory issues
if os.environ.get("CI") == "true":
builder = LeannBuilder(
backend_name=backend_name,
embedding_model="sentence-transformers/all-MiniLM-L6-v2", # Smaller model
dimensions=384, # Smaller dimensions
)
else:
builder = LeannBuilder(backend_name=backend_name)
builder.add_text("LEANN saves 97% storage compared to traditional vector databases.")
builder.add_text("Tung Tung Tung Sahur called—they need their banana-crocodile hybrid back")
builder.build_index(INDEX_PATH)
# Verify index was created
# The index path should be a directory containing index files
index_dir = Path(INDEX_PATH).parent
assert index_dir.exists()
# Check that index files were created
index_files = list(index_dir.glob(f"{Path(INDEX_PATH).stem}.*"))
assert len(index_files) > 0
# Search
searcher = LeannSearcher(INDEX_PATH)
results = searcher.search("fantastical AI-generated creatures", top_k=1)
# Verify search results
assert len(results) > 0
assert isinstance(results[0], SearchResult)
assert results[0].score != float("-inf"), (
f"should return valid scores, got {results[0].score}"
)
# The second text about banana-crocodile should be more relevant
assert "banana" in results[0].text or "crocodile" in results[0].text
# Ensure we cleanup background embedding server
searcher.cleanup()
# Chat with your data (using simulated LLM to avoid external dependencies)
chat = LeannChat(INDEX_PATH, llm_config={"type": "simulated"})
response = chat.ask("How much storage does LEANN save?", top_k=1)
# Verify chat works
assert isinstance(response, str)
assert len(response) > 0
# Cleanup chat resources
chat.cleanup()
def test_readme_imports():
"""Test that the imports shown in README work correctly."""
# These are the imports shown in README
from leann import LeannBuilder, LeannChat, LeannSearcher
# Verify they are the correct types
assert callable(LeannBuilder)
assert callable(LeannSearcher)
assert callable(LeannChat)
def test_backend_options():
"""Test different backend options mentioned in documentation."""
# Skip on macOS CI due to MPS environment issues with all-MiniLM-L6-v2
if os.environ.get("CI") == "true" and platform.system() == "Darwin":
pytest.skip("Skipping on macOS CI due to MPS environment issues with all-MiniLM-L6-v2")
from leann import LeannBuilder
with tempfile.TemporaryDirectory() as temp_dir:
# Use smaller model in CI to avoid memory issues
is_ci = os.environ.get("CI") == "true"
embedding_model = (
"sentence-transformers/all-MiniLM-L6-v2" if is_ci else "facebook/contriever"
)
dimensions = 384 if is_ci else None
# Test HNSW backend (as shown in README)
hnsw_path = str(Path(temp_dir) / "test_hnsw.leann")
builder_hnsw = LeannBuilder(
backend_name="hnsw", embedding_model=embedding_model, dimensions=dimensions
)
builder_hnsw.add_text("Test document for HNSW backend")
builder_hnsw.build_index(hnsw_path)
assert Path(hnsw_path).parent.exists()
assert len(list(Path(hnsw_path).parent.glob(f"{Path(hnsw_path).stem}.*"))) > 0
# Test DiskANN backend (mentioned as available option)
diskann_path = str(Path(temp_dir) / "test_diskann.leann")
builder_diskann = LeannBuilder(
backend_name="diskann", embedding_model=embedding_model, dimensions=dimensions
)
builder_diskann.add_text("Test document for DiskANN backend")
builder_diskann.build_index(diskann_path)
assert Path(diskann_path).parent.exists()
assert len(list(Path(diskann_path).parent.glob(f"{Path(diskann_path).stem}.*"))) > 0
@pytest.mark.parametrize("backend_name", ["hnsw", "diskann"])
def test_llm_config_simulated(backend_name):
"""Test simulated LLM configuration option with both backends."""
# Skip on macOS CI due to MPS environment issues with all-MiniLM-L6-v2
if os.environ.get("CI") == "true" and platform.system() == "Darwin":
pytest.skip("Skipping on macOS CI due to MPS environment issues with all-MiniLM-L6-v2")
# Skip DiskANN tests in CI due to hardware requirements
if os.environ.get("CI") == "true" and backend_name == "diskann":
pytest.skip("Skip DiskANN tests in CI - requires specific hardware and large memory")
from leann import LeannBuilder, LeannChat
with tempfile.TemporaryDirectory() as temp_dir:
# Build a simple index
index_path = str(Path(temp_dir) / f"test_{backend_name}.leann")
# Use smaller model in CI to avoid memory issues
if os.environ.get("CI") == "true":
builder = LeannBuilder(
backend_name=backend_name,
embedding_model="sentence-transformers/all-MiniLM-L6-v2",
dimensions=384,
)
else:
builder = LeannBuilder(backend_name=backend_name)
builder.add_text("Test document for LLM testing")
builder.build_index(index_path)
# Test simulated LLM config
llm_config = {"type": "simulated"}
chat = LeannChat(index_path, llm_config=llm_config)
response = chat.ask("What is this document about?", top_k=1)
assert isinstance(response, str)
assert len(response) > 0
@pytest.mark.skip(reason="Requires HF model download and may timeout")
def test_llm_config_hf():
"""Test HuggingFace LLM configuration option."""
from leann import LeannBuilder, LeannChat
pytest.importorskip("transformers") # Skip if transformers not installed
with tempfile.TemporaryDirectory() as temp_dir:
# Build a simple index
index_path = str(Path(temp_dir) / "test.leann")
builder = LeannBuilder(backend_name="hnsw")
builder.add_text("Test document for LLM testing")
builder.build_index(index_path)
# Test HF LLM config
llm_config = {"type": "hf", "model": "Qwen/Qwen3-0.6B"}
chat = LeannChat(index_path, llm_config=llm_config)
response = chat.ask("What is this document about?", top_k=1)
assert isinstance(response, str)
assert len(response) > 0