* Add ty type checker to CI and fix type errors - Add ty (Astral's fast Python type checker) to GitHub CI workflow - Fix type annotations across all RAG apps: - Update load_data return types from list[str] to list[dict[str, Any]] - Fix base_rag_example.py to properly handle dict format from create_text_chunks - Fix type errors in leann-core: - chunking_utils.py: Add explicit type annotations - cli.py: Fix return type annotations for PDF extraction functions - interactive_utils.py: Fix readline import type handling - Fix type errors in apps: - wechat_history.py: Fix return type annotations - document_rag.py, code_rag.py: Replace **kwargs with explicit arguments - Add ty configuration to pyproject.toml This resolves the bug introduced in PR #157 where create_text_chunks() changed to return list[dict] but callers were not updated. * Fix remaining ty type errors - Fix slack_mcp_reader.py channel parameter can be None - Fix embedding_compute.py ContextProp type issue - Fix searcher_base.py method override signatures - Fix chunking_utils.py chunk_text assignment - Fix slack_rag.py and twitter_rag.py return types - Fix email.py and image_rag.py method overrides * Fix multimodal benchmark scripts type errors - Fix undefined LeannRetriever -> LeannMultiVector - Add proper type casts for HuggingFace Dataset iteration - Cast task config values to correct types - Add type annotations for dataset row dicts * Enable ty check for multimodal scripts in CI All type errors in multimodal scripts have been fixed, so we can now include them in the CI type checking. 
* Fix all test type errors and enable ty check on tests - Fix test_basic.py: search() takes str not list - Fix test_cli_prompt_template.py: add type: ignore for Mock assignments - Fix test_prompt_template_persistence.py: match BaseSearcher.search signature - Fix test_prompt_template_e2e.py: add type narrowing asserts after skip - Fix test_readme_examples.py: use explicit kwargs instead of **model_args - Fix metadata_filter.py: allow Optional[MetadataFilters] - Update CI to run ty check on tests * Format code with ruff * Format searcher_base.py
534 lines
20 KiB
Python
534 lines
20 KiB
Python
"""
|
|
Tests for CLI argument integration of --embedding-prompt-template.
|
|
|
|
These tests verify that:
|
|
1. The --embedding-prompt-template flag is properly registered on build and search commands
|
|
2. The template value flows from CLI args to embedding_options dict
|
|
3. The template is passed through to compute_embeddings() function
|
|
4. Default behavior (no flag) is handled correctly
|
|
"""
|
|
|
|
from unittest.mock import Mock, patch
|
|
|
|
from leann.cli import LeannCLI
|
|
|
|
|
|
class TestCLIPromptTemplateArgument:
    """Parser-level checks for --embedding-prompt-template on build and search.

    Only ``parse_args`` is exercised here; no index is built or queried.
    """

    def test_commands_accept_prompt_template_argument(self):
        """Both sub-commands accept the flag and keep the value that was passed."""
        template_value = "search_query: "
        parser = LeannCLI().create_parser()

        # build sub-command
        build_argv = [
            "build",
            "test-index",
            "--docs",
            "/tmp/test-docs",
            "--embedding-prompt-template",
            template_value,
        ]
        build_args = parser.parse_args(build_argv)
        assert build_args.command == "build"
        assert hasattr(build_args, "embedding_prompt_template"), (
            "build command should have embedding_prompt_template attribute"
        )
        assert build_args.embedding_prompt_template == template_value

        # search sub-command
        search_argv = ["search", "test-index", "my query"]
        search_argv += ["--embedding-prompt-template", template_value]
        search_args = parser.parse_args(search_argv)
        assert search_args.command == "search"
        assert hasattr(search_args, "embedding_prompt_template"), (
            "search command should have embedding_prompt_template attribute"
        )
        assert search_args.embedding_prompt_template == template_value

    def test_commands_default_to_none(self):
        """Omitting the flag leaves the attribute present but set to None."""
        parser = LeannCLI().create_parser()

        # build sub-command without the flag
        build_argv = ["build", "test-index", "--docs", "/tmp/test-docs"]
        build_args = parser.parse_args(build_argv)
        assert hasattr(build_args, "embedding_prompt_template"), (
            "build command should have embedding_prompt_template attribute"
        )
        assert build_args.embedding_prompt_template is None, (
            "Build default value should be None when flag not provided"
        )

        # search sub-command without the flag
        search_argv = ["search", "test-index", "my query"]
        search_args = parser.parse_args(search_argv)
        assert hasattr(search_args, "embedding_prompt_template"), (
            "search command should have embedding_prompt_template attribute"
        )
        assert search_args.embedding_prompt_template is None, (
            "Search default value should be None when flag not provided"
        )
|
|
|
|
|
|
class TestBuildCommandPromptTemplateArgumentExtras:
    """Build-only edge cases for the prompt template argument."""

    def test_build_command_prompt_template_with_multiword_value(self):
        """A template containing spaces must be parsed as one string.

        Realistic values look like "search_document: " or
        "Represent this sentence for searching: "; the parsed attribute
        has to equal the input exactly, trailing space included.
        """
        template = "Represent this sentence for searching: "
        argv = ["build", "test-index", "--docs", "/tmp/test-docs"]
        argv += ["--embedding-prompt-template", template]

        args = LeannCLI().create_parser().parse_args(argv)

        assert args.embedding_prompt_template == template
|
|
|
|
|
|
class TestPromptTemplateStoredInEmbeddingOptions:
    """Tests for template storage in the embedding_options dict.

    Every test patches ``leann.cli.LeannBuilder``, runs the ``build`` command
    through the real CLI, and inspects the keyword arguments the builder class
    was called with.
    """

    @staticmethod
    def _run_build(mock_builder_class, docs_dir, extra_args=()):
        """Run ``build`` with the patched builder and return its call kwargs.

        Args:
            mock_builder_class: the mock that replaces ``leann.cli.LeannBuilder``.
            docs_dir: directory passed to ``--docs`` (converted with ``str``).
            extra_args: CLI tokens inserted between ``--docs <dir>`` and ``--force``.

        Returns:
            The keyword arguments of the last ``LeannBuilder(...)`` call.
        """
        import asyncio

        mock_builder_class.return_value = Mock()

        cli = LeannCLI()
        # Return one document so the build does not bail out before creating the builder.
        cli.load_documents = Mock(return_value=[{"text": "test content", "metadata": {}}])  # type: ignore[assignment]

        argv = [
            "build",
            "test-index",
            "--docs",
            str(docs_dir),
            *extra_args,
            "--force",  # Force rebuild to ensure LeannBuilder is called
        ]
        args = cli.create_parser().parse_args(argv)
        asyncio.run(cli.build_index(args))

        # Without this guard, call_args is None when the builder was never
        # constructed and the failure surfaces as an opaque AttributeError.
        assert mock_builder_class.called, "LeannBuilder should have been called by build_index"
        return mock_builder_class.call_args.kwargs

    @patch("leann.cli.LeannBuilder")
    def test_prompt_template_stored_in_embedding_options_on_build(
        self, mock_builder_class, tmp_path
    ):
        """
        Verify that when --embedding-prompt-template is provided to build command,
        the value is stored in embedding_options dict passed to LeannBuilder.
        """
        template = "search_query: "
        call_kwargs = self._run_build(
            mock_builder_class, tmp_path, ["--embedding-prompt-template", template]
        )

        assert "embedding_options" in call_kwargs, "LeannBuilder should receive embedding_options"

        embedding_options = call_kwargs["embedding_options"]
        assert embedding_options is not None, (
            "embedding_options should not be None when template provided"
        )
        assert "prompt_template" in embedding_options, (
            "embedding_options should contain 'prompt_template' key"
        )
        assert embedding_options["prompt_template"] == template, (
            f"Template should be '{template}', got {embedding_options.get('prompt_template')}"
        )

    @patch("leann.cli.LeannBuilder")
    def test_prompt_template_not_in_options_when_not_provided(self, mock_builder_class, tmp_path):
        """
        Verify that when --embedding-prompt-template is NOT provided,
        embedding_options either doesn't have the key or it's None.

        This ensures we don't pass empty/None values unnecessarily.
        """
        call_kwargs = self._run_build(mock_builder_class, tmp_path)

        # embedding_options may be absent or empty; only inspect it when it has content.
        embedding_options = call_kwargs.get("embedding_options")
        if embedding_options:
            # Either the key shouldn't exist, or it should be None
            assert (
                "prompt_template" not in embedding_options
                or embedding_options["prompt_template"] is None
            ), "prompt_template should not be set when flag not provided"

    # R1 Tests: Build-time separate template storage
    @patch("leann.cli.LeannBuilder")
    def test_build_stores_separate_templates(self, mock_builder_class, tmp_path):
        """
        R1 Test 1: Verify that when both --embedding-prompt-template and
        --query-prompt-template are provided to build command, both values
        are stored separately in embedding_options dict as build_prompt_template
        and query_prompt_template, and the legacy prompt_template key is absent.

        Expected embedding_options:
            {"build_prompt_template": "doc: ", "query_prompt_template": "query: "}

        NOTE(review): the original requirement says this dict ends up in
        .meta.json; only the LeannBuilder call is checked here.
        """
        build_template = "doc: "
        query_template = "query: "
        call_kwargs = self._run_build(
            mock_builder_class,
            tmp_path,
            [
                "--embedding-prompt-template",
                build_template,
                "--query-prompt-template",
                query_template,
            ],
        )

        assert "embedding_options" in call_kwargs, "LeannBuilder should receive embedding_options"

        embedding_options = call_kwargs["embedding_options"]
        assert embedding_options is not None, (
            "embedding_options should not be None when templates provided"
        )

        assert "build_prompt_template" in embedding_options, (
            "embedding_options should contain 'build_prompt_template' key"
        )
        assert embedding_options["build_prompt_template"] == build_template, (
            f"build_prompt_template should be '{build_template}'"
        )

        assert "query_prompt_template" in embedding_options, (
            "embedding_options should contain 'query_prompt_template' key"
        )
        assert embedding_options["query_prompt_template"] == query_template, (
            f"query_prompt_template should be '{query_template}'"
        )

        # Old key should NOT be present when using new separate template format
        assert "prompt_template" not in embedding_options, (
            "Old 'prompt_template' key should not be present with separate templates"
        )

    @patch("leann.cli.LeannBuilder")
    def test_build_backward_compat_single_template(self, mock_builder_class, tmp_path):
        """
        R1 Test 2: Verify backward compatibility - when only
        --embedding-prompt-template is provided (old behavior), it should
        still be stored as 'prompt_template' in embedding_options.

        This guards existing workflows: the single-flag form must keep using
        the legacy key and must not introduce build_prompt_template or
        query_prompt_template.

        Expected embedding_options: {"prompt_template": "prompt: "}
        """
        template = "prompt: "
        call_kwargs = self._run_build(
            mock_builder_class, tmp_path, ["--embedding-prompt-template", template]
        )

        assert "embedding_options" in call_kwargs, "LeannBuilder should receive embedding_options"

        embedding_options = call_kwargs["embedding_options"]
        assert embedding_options is not None, (
            "embedding_options should not be None when template provided"
        )

        assert "prompt_template" in embedding_options, (
            "embedding_options should contain old 'prompt_template' key for backward compat"
        )
        assert embedding_options["prompt_template"] == template, (
            f"prompt_template should be '{template}'"
        )

        # New keys should NOT be present in backward compat mode
        assert "build_prompt_template" not in embedding_options, (
            "build_prompt_template should not be present with single template flag"
        )
        assert "query_prompt_template" not in embedding_options, (
            "query_prompt_template should not be present with single template flag"
        )

    @patch("leann.cli.LeannBuilder")
    def test_build_no_templates(self, mock_builder_class, tmp_path):
        """
        R1 Test 3: Verify that when no template flags are provided,
        embedding_options has no prompt template keys.

        This guards the clean default: none of prompt_template,
        build_prompt_template or query_prompt_template may appear
        (embedding_options being empty or None is also acceptable).
        """
        call_kwargs = self._run_build(mock_builder_class, tmp_path)

        # embedding_options may be absent or empty; only inspect it when it has content.
        embedding_options = call_kwargs.get("embedding_options")
        if embedding_options:
            # None of the template keys should be present
            assert "prompt_template" not in embedding_options, (
                "prompt_template should not be present when no flags provided"
            )
            assert "build_prompt_template" not in embedding_options, (
                "build_prompt_template should not be present when no flags provided"
            )
            assert "query_prompt_template" not in embedding_options, (
                "query_prompt_template should not be present when no flags provided"
            )
|
|
|
|
|
|
class TestPromptTemplateFlowsToComputeEmbeddings:
    """Integration check that the template reaches compute_embeddings."""

    @patch("leann.api.compute_embeddings")
    def test_prompt_template_flows_to_compute_embeddings_via_provider_options(
        self, mock_compute_embeddings, tmp_path
    ):
        """
        Run a real build (LeannBuilder is NOT mocked) with
        --embedding-prompt-template and check that the patched
        compute_embeddings() receives the template in its provider_options
        keyword argument.

        Flow under test:
        CLI → embedding_options → LeannBuilder → compute_embeddings(provider_options)

        NOTE(review): the previous docstring described this as a test that was
        expected to fail until the flow was implemented - confirm that wording
        is obsolete.
        """
        import asyncio

        import numpy as np

        template = "search_document: "

        # The patched function hands back one dummy embedding vector.
        mock_compute_embeddings.return_value = np.array([[0.1, 0.2, 0.3]], dtype=np.float32)

        cli = LeannCLI()
        # Supply a single in-memory document instead of reading from disk.
        cli.load_documents = Mock(return_value=[{"text": "test content", "metadata": {}}])  # type: ignore[assignment]

        argv = [
            "build",
            "test-index",
            "--docs",
            str(tmp_path),
            "--embedding-prompt-template",
            template,
            "--backend-name",
            "hnsw",  # Use hnsw backend
            "--force",  # Force rebuild to ensure index is created
        ]
        args = cli.create_parser().parse_args(argv)
        asyncio.run(cli.build_index(args))

        assert mock_compute_embeddings.called, "compute_embeddings should have been called"

        # Only keyword arguments of the last call are inspected.
        call_kwargs = mock_compute_embeddings.call_args.kwargs
        assert "provider_options" in call_kwargs, (
            "compute_embeddings should receive provider_options parameter"
        )

        provider_options = call_kwargs["provider_options"]
        assert provider_options is not None, "provider_options should not be None"
        assert "prompt_template" in provider_options, (
            "provider_options should contain prompt_template key"
        )
        assert provider_options["prompt_template"] == template, (
            f"Template should be '{template}', got {provider_options.get('prompt_template')}"
        )
|
|
|
|
|
|
class TestPromptTemplateArgumentHelp:
    """Tests for argument help text and documentation."""

    @staticmethod
    def _capture_help(command):
        """Return the text written to stdout by ``<command> --help``.

        Args:
            command: sub-command name, e.g. "build" or "search".

        Returns:
            Everything the parser printed to stdout while handling ``--help``.
        """
        import io
        from contextlib import redirect_stdout, suppress

        parser = LeannCLI().create_parser()
        buffer = io.StringIO()
        # --help causes sys.exit(0); suppress only SystemExit so any other
        # exception still fails the test.
        with redirect_stdout(buffer), suppress(SystemExit):
            parser.parse_args([command, "--help"])
        return buffer.getvalue()

    def test_build_command_prompt_template_has_help_text(self):
        """
        Verify that --embedding-prompt-template has descriptive help text.

        Good help text is crucial for CLI usability.
        """
        help_text = self._capture_help("build")
        assert "--embedding-prompt-template" in help_text, (
            "Help text should mention --embedding-prompt-template"
        )

        # Check for keywords that should be in the help.
        # NOTE(review): this is satisfied by the flag name itself (it contains
        # "prompt" and "template"), so it cannot fail once the assertion above
        # passes. Consider checking the description with the flag name removed.
        help_lower = help_text.lower()
        assert any(keyword in help_lower for keyword in ["template", "prompt", "prepend"]), (
            "Help text should explain what the prompt template does"
        )

    def test_search_command_prompt_template_has_help_text(self):
        """
        Verify that search command also has help text for --embedding-prompt-template.
        """
        help_text = self._capture_help("search")
        assert "--embedding-prompt-template" in help_text, (
            "Search help text should mention --embedding-prompt-template"
        )
|