* Add ty type checker to CI and fix type errors - Add ty (Astral's fast Python type checker) to GitHub CI workflow - Fix type annotations across all RAG apps: - Update load_data return types from list[str] to list[dict[str, Any]] - Fix base_rag_example.py to properly handle dict format from create_text_chunks - Fix type errors in leann-core: - chunking_utils.py: Add explicit type annotations - cli.py: Fix return type annotations for PDF extraction functions - interactive_utils.py: Fix readline import type handling - Fix type errors in apps: - wechat_history.py: Fix return type annotations - document_rag.py, code_rag.py: Replace **kwargs with explicit arguments - Add ty configuration to pyproject.toml This resolves the bug introduced in PR #157 where create_text_chunks() changed to return list[dict] but callers were not updated. * Fix remaining ty type errors - Fix slack_mcp_reader.py channel parameter can be None - Fix embedding_compute.py ContextProp type issue - Fix searcher_base.py method override signatures - Fix chunking_utils.py chunk_text assignment - Fix slack_rag.py and twitter_rag.py return types - Fix email.py and image_rag.py method overrides * Fix multimodal benchmark scripts type errors - Fix undefined LeannRetriever -> LeannMultiVector - Add proper type casts for HuggingFace Dataset iteration - Cast task config values to correct types - Add type annotations for dataset row dicts * Enable ty check for multimodal scripts in CI All type errors in multimodal scripts have been fixed, so we can now include them in the CI type checking. 
* Fix all test type errors and enable ty check on tests - Fix test_basic.py: search() takes str not list - Fix test_cli_prompt_template.py: add type: ignore for Mock assignments - Fix test_prompt_template_persistence.py: match BaseSearcher.search signature - Fix test_prompt_template_e2e.py: add type narrowing asserts after skip - Fix test_readme_examples.py: use explicit kwargs instead of **model_args - Fix metadata_filter.py: allow Optional[MetadataFilters] - Update CI to run ty check on tests * Format code with ruff * Format searcher_base.py
185 lines
7.5 KiB
Python
185 lines
7.5 KiB
Python
"""
|
|
Test examples from README.md to ensure documentation is accurate.
|
|
"""
|
|
|
|
import os
|
|
import platform
|
|
import tempfile
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
|
|
|
|
@pytest.mark.parametrize("backend_name", ["hnsw", "diskann"])
def test_readme_basic_example(backend_name):
    """Test the basic example from README.md with both backends.

    Builds a two-document index inside a temporary directory, checks that
    index files were written next to the index path, runs one search and
    one chat query against a simulated LLM.

    Args:
        backend_name: Backend name forwarded to ``LeannBuilder``
            (parametrized as "hnsw" and "diskann").
    """
    is_ci = os.environ.get("CI") == "true"

    # Skip on macOS CI due to MPS environment issues with all-MiniLM-L6-v2
    if is_ci and platform.system() == "Darwin":
        pytest.skip("Skipping on macOS CI due to MPS environment issues with all-MiniLM-L6-v2")
    # Skip DiskANN on CI (Linux runners) due to C++ extension memory/hardware constraints
    if is_ci and backend_name == "diskann":
        pytest.skip("Skip DiskANN tests in CI due to resource constraints and instability")

    # This is the exact code from README (with smaller model for CI)
    from leann import LeannBuilder, LeannChat, LeannSearcher
    from leann.api import SearchResult

    with tempfile.TemporaryDirectory() as temp_dir:
        INDEX_PATH = str(Path(temp_dir) / f"demo_{backend_name}.leann")

        # Build an index
        # In CI, use a smaller model to avoid memory issues
        if is_ci:
            builder = LeannBuilder(
                backend_name=backend_name,
                embedding_model="sentence-transformers/all-MiniLM-L6-v2",  # Smaller model
                dimensions=384,  # Smaller dimensions
            )
        else:
            builder = LeannBuilder(backend_name=backend_name)
        builder.add_text("LEANN saves 97% storage compared to traditional vector databases.")
        builder.add_text("Tung Tung Tung Sahur called—they need their banana-crocodile hybrid back")
        builder.build_index(INDEX_PATH)

        # Verify index was created: the test looks for files that share the
        # index path's stem inside the directory holding INDEX_PATH.
        index_dir = Path(INDEX_PATH).parent
        assert index_dir.exists()
        index_files = list(index_dir.glob(f"{Path(INDEX_PATH).stem}.*"))
        assert len(index_files) > 0

        # Search
        searcher = LeannSearcher(INDEX_PATH)
        try:
            results = searcher.search("fantastical AI-generated creatures", top_k=1)

            # Verify search results
            assert len(results) > 0
            assert isinstance(results[0], SearchResult)
            assert results[0].score != float("-inf"), (
                f"should return valid scores, got {results[0].score}"
            )
            # The second text about banana-crocodile should be more relevant
            assert "banana" in results[0].text or "crocodile" in results[0].text
        finally:
            # Ensure we cleanup background embedding server, even when an
            # assertion above fails.
            searcher.cleanup()

        # Chat with your data (using simulated LLM to avoid external dependencies)
        chat = LeannChat(INDEX_PATH, llm_config={"type": "simulated"})
        try:
            response = chat.ask("How much storage does LEANN save?", top_k=1)

            # Verify chat works
            assert isinstance(response, str)
            assert len(response) > 0
        finally:
            # Cleanup chat resources regardless of the assertions' outcome
            chat.cleanup()
|
|
|
|
|
|
def test_readme_imports():
    """Check that the names imported in the README exist and are callable."""
    # Same import statement as the one shown in README
    from leann import LeannBuilder, LeannChat, LeannSearcher

    # Each exported name must be something that can be called
    for exported in (LeannBuilder, LeannSearcher, LeannChat):
        assert callable(exported)
|
|
|
|
|
|
def test_backend_options():
    """Build a one-document index with each documented backend.

    For both "hnsw" and "diskann", the test builds an index in a temporary
    directory and asserts that at least one file sharing the index stem was
    written.
    """
    # macOS CI runners hit MPS environment issues with all-MiniLM-L6-v2
    on_macos_ci = os.environ.get("CI") == "true" and platform.system() == "Darwin"
    if on_macos_ci:
        pytest.skip("Skipping on macOS CI due to MPS environment issues with all-MiniLM-L6-v2")

    from leann import LeannBuilder

    with tempfile.TemporaryDirectory() as temp_dir:
        # CI gets the smaller model to avoid memory issues
        if os.environ.get("CI") == "true":
            model_name, embed_dims = "sentence-transformers/all-MiniLM-L6-v2", 384
        else:
            model_name, embed_dims = "facebook/contriever", None

        workspace = Path(temp_dir)

        # HNSW backend (as shown in README)
        hnsw_index = workspace / "test_hnsw.leann"
        hnsw_builder = LeannBuilder(
            backend_name="hnsw",
            embedding_model=model_name,
            dimensions=embed_dims,
        )
        hnsw_builder.add_text("Test document for HNSW backend")
        hnsw_builder.build_index(str(hnsw_index))
        assert hnsw_index.parent.exists()
        hnsw_outputs = list(hnsw_index.parent.glob(f"{hnsw_index.stem}.*"))
        assert len(hnsw_outputs) > 0

        # DiskANN backend (mentioned as available option)
        diskann_index = workspace / "test_diskann.leann"
        diskann_builder = LeannBuilder(
            backend_name="diskann",
            embedding_model=model_name,
            dimensions=embed_dims,
        )
        diskann_builder.add_text("Test document for DiskANN backend")
        diskann_builder.build_index(str(diskann_index))
        assert diskann_index.parent.exists()
        diskann_outputs = list(diskann_index.parent.glob(f"{diskann_index.stem}.*"))
        assert len(diskann_outputs) > 0
|
|
|
|
|
|
@pytest.mark.parametrize("backend_name", ["hnsw", "diskann"])
def test_llm_config_simulated(backend_name):
    """Test simulated LLM configuration option with both backends.

    Builds a one-document index in a temporary directory and asks a single
    question through ``LeannChat`` configured with ``{"type": "simulated"}``.

    Args:
        backend_name: Backend name forwarded to ``LeannBuilder``
            (parametrized as "hnsw" and "diskann").
    """
    is_ci = os.environ.get("CI") == "true"

    # Skip on macOS CI due to MPS environment issues with all-MiniLM-L6-v2
    if is_ci and platform.system() == "Darwin":
        pytest.skip("Skipping on macOS CI due to MPS environment issues with all-MiniLM-L6-v2")

    # Skip DiskANN tests in CI due to hardware requirements
    if is_ci and backend_name == "diskann":
        pytest.skip("Skip DiskANN tests in CI - requires specific hardware and large memory")

    from leann import LeannBuilder, LeannChat

    with tempfile.TemporaryDirectory() as temp_dir:
        # Build a simple index
        index_path = str(Path(temp_dir) / f"test_{backend_name}.leann")
        # Use smaller model in CI to avoid memory issues
        if is_ci:
            builder = LeannBuilder(
                backend_name=backend_name,
                embedding_model="sentence-transformers/all-MiniLM-L6-v2",
                dimensions=384,
            )
        else:
            builder = LeannBuilder(backend_name=backend_name)
        builder.add_text("Test document for LLM testing")
        builder.build_index(index_path)

        # Test simulated LLM config
        llm_config = {"type": "simulated"}
        chat = LeannChat(index_path, llm_config=llm_config)
        try:
            response = chat.ask("What is this document about?", top_k=1)

            assert isinstance(response, str)
            assert len(response) > 0
        finally:
            # Release chat resources before the temporary directory is
            # removed, matching the cleanup done in test_readme_basic_example.
            chat.cleanup()
|
|
|
|
|
|
@pytest.mark.skip(reason="Requires HF model download and may timeout")
def test_llm_config_hf():
    """Test HuggingFace LLM configuration option.

    Builds a one-document HNSW index in a temporary directory and asks a
    single question through ``LeannChat`` configured with the "hf" LLM type.
    The test is skipped unconditionally by its decorator, and additionally
    skips itself when ``transformers`` cannot be imported.
    """
    from leann import LeannBuilder, LeannChat

    pytest.importorskip("transformers")  # Skip if transformers not installed

    with tempfile.TemporaryDirectory() as temp_dir:
        # Build a simple index
        index_path = str(Path(temp_dir) / "test.leann")
        builder = LeannBuilder(backend_name="hnsw")
        builder.add_text("Test document for LLM testing")
        builder.build_index(index_path)

        # Test HF LLM config
        llm_config = {"type": "hf", "model": "Qwen/Qwen3-0.6B"}
        chat = LeannChat(index_path, llm_config=llm_config)
        try:
            response = chat.ask("What is this document about?", top_k=1)

            assert isinstance(response, str)
            assert len(response) > 0
        finally:
            # Release chat resources even if an assertion fails, before the
            # temporary directory is removed.
            chat.cleanup()
|