* Add ty type checker to CI and fix type errors

  - Add ty (Astral's fast Python type checker) to GitHub CI workflow
  - Fix type annotations across all RAG apps:
    - Update load_data return types from list[str] to list[dict[str, Any]]
    - Fix base_rag_example.py to properly handle dict format from create_text_chunks
  - Fix type errors in leann-core:
    - chunking_utils.py: Add explicit type annotations
    - cli.py: Fix return type annotations for PDF extraction functions
    - interactive_utils.py: Fix readline import type handling
  - Fix type errors in apps:
    - wechat_history.py: Fix return type annotations
    - document_rag.py, code_rag.py: Replace **kwargs with explicit arguments
  - Add ty configuration to pyproject.toml

  This resolves the bug introduced in PR #157 where create_text_chunks() changed to return list[dict] but callers were not updated.

* Fix remaining ty type errors

  - Fix slack_mcp_reader.py: channel parameter can be None
  - Fix embedding_compute.py ContextProp type issue
  - Fix searcher_base.py method override signatures
  - Fix chunking_utils.py chunk_text assignment
  - Fix slack_rag.py and twitter_rag.py return types
  - Fix email.py and image_rag.py method overrides

* Fix multimodal benchmark scripts type errors

  - Fix undefined LeannRetriever -> LeannMultiVector
  - Add proper type casts for HuggingFace Dataset iteration
  - Cast task config values to correct types
  - Add type annotations for dataset row dicts

* Enable ty check for multimodal scripts in CI

  All type errors in multimodal scripts have been fixed, so we can now include them in the CI type checking.

* Fix all test type errors and enable ty check on tests

  - Fix test_basic.py: search() takes str not list
  - Fix test_cli_prompt_template.py: add type: ignore for Mock assignments
  - Fix test_prompt_template_persistence.py: match BaseSearcher.search signature
  - Fix test_prompt_template_e2e.py: add type narrowing asserts after skip
  - Fix test_readme_examples.py: use explicit kwargs instead of **model_args
  - Fix metadata_filter.py: allow Optional[MetadataFilters]
  - Update CI to run ty check on tests

* Format code with ruff

* Format searcher_base.py
98 lines
2.9 KiB
Python
98 lines
2.9 KiB
Python
"""
|
|
Basic functionality tests for CI pipeline using pytest.
|
|
"""
|
|
|
|
import os
|
|
import tempfile
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
|
|
|
|
def test_imports():
    """Smoke test intended to confirm that the packages import cleanly.

    The body currently contains no import statements, so the test passes
    trivially and returns ``None``.
    """
    # NOTE(review): a "Test C++ extensions" step was announced here, but no
    # import or check follows it in the visible code.
|
|
|
|
|
|
@pytest.mark.skipif(
    os.environ.get("CI") == "true", reason="Skip model tests in CI to avoid MPS memory issues"
)
@pytest.mark.parametrize("backend_name", ["hnsw", "diskann"])
def test_backend_basic(backend_name):
    """Build a small index with one backend and run a single search against it.

    Args:
        backend_name: Backend under test; the parametrize decorator supplies
            "hnsw" and "diskann".

    The test is skipped when the ``CI`` environment variable equals "true".
    """
    from leann.api import LeannBuilder, LeannSearcher, SearchResult

    # The index lives in a temporary directory that is removed on exit.
    with tempfile.TemporaryDirectory() as temp_dir:
        index_path = str(Path(temp_dir) / f"test.{backend_name}")

        # Small synthetic corpus: 100 documents spread over 5 topics.
        texts = [f"This is document {i} about topic {i % 5}" for i in range(100)]

        # Each backend takes its own tuning keywords, hence two explicit calls.
        if backend_name == "hnsw":
            builder = LeannBuilder(
                backend_name="hnsw",
                embedding_model="facebook/contriever",
                embedding_mode="sentence-transformers",
                M=16,
                efConstruction=200,
            )
        else:  # diskann
            builder = LeannBuilder(
                backend_name="diskann",
                embedding_model="facebook/contriever",
                embedding_mode="sentence-transformers",
                num_neighbors=32,
                search_list_size=50,
            )

        for text in texts:
            builder.add_text(text)

        builder.build_index(index_path)

        searcher = LeannSearcher(index_path)
        try:
            results = searcher.search("document about topic 2", top_k=5)

            # Verify results
            assert len(results) > 0
            assert isinstance(results[0], SearchResult)
            assert "topic 2" in results[0].text or "document" in results[0].text
        finally:
            # Always clean up, even when the search or an assertion fails, to
            # avoid hanging background servers.
            searcher.cleanup()
|
|
|
|
|
|
@pytest.mark.skipif(
    os.environ.get("CI") == "true", reason="Skip model tests in CI to avoid MPS memory issues"
)
def test_large_index():
    """Build an HNSW index over 1000 generated documents and search it.

    Asserts that a ``top_k=10`` search returns exactly 10 results.
    The test is skipped when the ``CI`` environment variable equals "true".
    """
    from leann.api import LeannBuilder, LeannSearcher

    with tempfile.TemporaryDirectory() as temp_dir:
        index_path = str(Path(temp_dir) / "test_large.hnsw")

        # Every document shares the same 50 words; only the leading number differs.
        texts = [f"Document {i}: {' '.join([f'word{j}' for j in range(50)])}" for i in range(1000)]

        builder = LeannBuilder(
            backend_name="hnsw",
            embedding_model="facebook/contriever",
            embedding_mode="sentence-transformers",
        )

        for text in texts:
            builder.add_text(text)

        builder.build_index(index_path)

        searcher = LeannSearcher(index_path)
        try:
            results = searcher.search("word10 word20", top_k=10)
            assert len(results) == 10
        finally:
            # Run cleanup even when the search or the assertion fails, so the
            # searcher is always released.
            searcher.cleanup()
|