* feat: enhance token limits with dynamic discovery + AST metadata

  Improves upon upstream PR #154 with two major enhancements:

  1. **Hybrid Token Limit Discovery**
     - Dynamic: Query Ollama /api/show for context limits
     - Fallback: Registry for LM Studio/OpenAI
     - Zero maintenance for Ollama users
     - Respects custom num_ctx settings

  2. **AST Metadata Preservation**
     - create_ast_chunks() returns dict format with metadata
     - Preserves file_path, file_name, timestamps
     - Includes astchunk metadata (line numbers, node counts)
     - Fixes content extraction bug (checks "content" key)
     - Enables --show-metadata flag

  3. **Better Token Limits**
     - nomic-embed-text: 2048 tokens (vs 512)
     - nomic-embed-text-v1.5: 2048 tokens
     - Added OpenAI models: 8192 tokens

  4. **Comprehensive Tests**
     - 11 tests for token truncation
     - 545 new lines in test_astchunk_integration.py
     - All metadata preservation tests passing

* fix: merge EMBEDDING_MODEL_LIMITS and remove redundant validation
  - Merged upstream's model list with our corrected token limits
  - Kept our corrected nomic-embed-text: 2048 (not 512)
  - Removed post-chunking validation (redundant with embedding-time truncation)
  - All tests passing except 2 pre-existing integration test failures

* style: apply ruff formatting and restore PR #154 version handling
  - Remove duplicate truncate_to_token_limit and get_model_token_limit functions
  - Restore version handling logic (model:latest -> model) from PR #154
  - Restore partial matching fallback for model name variations
  - Apply ruff formatting to all modified files
  - All 11 token truncation tests passing

* style: sort imports alphabetically (pre-commit auto-fix)

* fix: show AST token limit warning only once per session
  - Add module-level flag to track if warning shown
  - Prevents spam when processing multiple files
  - Add clarifying note that auto-truncation happens at embedding time
  - Addresses issue where warning appeared for every code file

* enhance: add detailed logging for token truncation
  - Track and report truncation statistics (count, tokens removed, max length)
  - Show first 3 individual truncations with exact token counts
  - Provide comprehensive summary when truncation occurs
  - Use WARNING level for data loss visibility
  - Silent (DEBUG level only) when no truncation needed

  Replaces misleading "truncated where necessary" message that appeared even when nothing was truncated.
965 lines
36 KiB
Python
965 lines
36 KiB
Python
"""
|
|
Test suite for astchunk integration with LEANN.
|
|
Tests AST-aware chunking functionality, language detection, and fallback mechanisms.
|
|
"""
|
|
|
|
import os
|
|
import subprocess
|
|
import sys
|
|
import tempfile
|
|
from pathlib import Path
|
|
from unittest.mock import Mock, patch
|
|
|
|
import pytest
|
|
|
|
# Add apps directory to path for imports
|
|
sys.path.insert(0, str(Path(__file__).parent.parent / "apps"))
|
|
|
|
from typing import Optional
|
|
|
|
from chunking import (
|
|
create_ast_chunks,
|
|
create_text_chunks,
|
|
create_traditional_chunks,
|
|
detect_code_files,
|
|
get_language_from_extension,
|
|
)
|
|
|
|
|
|
class MockDocument:
    """Mock LlamaIndex Document for testing.

    Exposes only what the tests in this module use: a ``content`` string, a
    ``metadata`` dict, and ``get_content()``.

    Args:
        content: Text returned by :meth:`get_content`.
        file_path: When non-empty, stored under ``metadata["file_path"]``
            (overriding any ``file_path`` key present in ``metadata``).
        metadata: Optional initial metadata. The mapping is shallow-copied, so
            the caller's dict is never modified by this object.
    """

    def __init__(self, content: str, file_path: str = "", metadata: Optional[dict] = None):
        self.content = content
        # Copy instead of aliasing: writing "file_path" below (or any later
        # write to doc.metadata) must not leak into the dict the caller passed.
        self.metadata = dict(metadata) if metadata else {}
        if file_path:
            self.metadata["file_path"] = file_path

    def get_content(self) -> str:
        """Return the document text unchanged."""
        return self.content
|
|
|
|
|
|
class TestCodeFileDetection:
    """Checks for ``detect_code_files`` and ``get_language_from_extension``."""

    def test_detect_code_files_python(self):
        """A ``.py`` document is reported as code, a ``.txt`` document as text."""
        py_doc = MockDocument("print('hello')", "/path/to/file.py")
        txt_doc = MockDocument("This is text", "/path/to/file.txt")

        code_docs, text_docs = detect_code_files([py_doc, txt_doc])

        assert len(code_docs) == 1
        assert len(text_docs) == 1

        code_meta = code_docs[0].metadata
        assert code_meta["language"] == "python"
        assert code_meta["is_code"] is True
        assert text_docs[0].metadata["is_code"] is False

    def test_detect_code_files_multiple_languages(self):
        """Python, Java, TypeScript and C# files are all detected as code."""
        samples = (
            ("def func():", "/path/to/script.py"),
            ("public class Test {}", "/path/to/Test.java"),
            ("interface ITest {}", "/path/to/test.ts"),
            ("using System;", "/path/to/Program.cs"),
            ("Regular text content", "/path/to/document.txt"),
        )
        docs = [MockDocument(body, path) for body, path in samples]

        code_docs, text_docs = detect_code_files(docs)

        assert len(code_docs) == 4
        assert len(text_docs) == 1

        languages = [doc.metadata["language"] for doc in code_docs]
        for expected in ("python", "java", "typescript", "csharp"):
            assert expected in languages

    def test_detect_code_files_no_file_path(self):
        """Documents lacking a file path are all classified as text."""
        without_path = MockDocument("some content")
        unrelated_metadata = MockDocument("other content", metadata={"some_key": "value"})

        code_docs, text_docs = detect_code_files([without_path, unrelated_metadata])

        assert len(code_docs) == 0
        assert len(text_docs) == 2
        assert all(doc.metadata["is_code"] is False for doc in text_docs)

    def test_get_language_from_extension(self):
        """Known extensions map to a language name; unknown or empty names map to None."""
        known = {
            "test.py": "python",
            "Test.java": "java",
            "component.tsx": "typescript",
            "Program.cs": "csharp",
        }
        for filename, language in known.items():
            assert get_language_from_extension(filename) == language

        for filename in ("document.txt", ""):
            assert get_language_from_extension(filename) is None
|
|
|
|
|
|
class TestChunkingFunctions:
    """Test the chunking entry points: traditional, AST, and ``create_text_chunks``."""

    def test_create_traditional_chunks(self):
        """Traditional chunking yields non-empty ``{"text", "metadata"}`` dicts."""
        docs = [
            MockDocument(
                "This is a test document. It has multiple sentences. We want to test chunking."
            )
        ]

        chunks = create_traditional_chunks(docs, chunk_size=50, chunk_overlap=10)

        assert len(chunks) > 0
        # Traditional chunks return dict format for consistency with AST chunks
        assert all(isinstance(chunk, dict) for chunk in chunks)
        assert all("text" in chunk and "metadata" in chunk for chunk in chunks)
        assert all(len(chunk["text"].strip()) > 0 for chunk in chunks)

    def test_create_traditional_chunks_empty_docs(self):
        """An empty document list produces an empty chunk list."""
        chunks = create_traditional_chunks([], chunk_size=50, chunk_overlap=10)
        assert chunks == []

    @pytest.mark.skipif(
        os.environ.get("CI") == "true",
        reason="Skip astchunk tests in CI - dependency may not be available",
    )
    def test_create_ast_chunks_with_astchunk_available(self):
        """Test AST chunking of a small Python module.

        The previous ``except ImportError`` branch was removed: it only re-ran the
        identical ``create_ast_chunks`` call, so it could never succeed where the
        first call had raised.
        """
        python_code = '''
def hello_world():
    """Print hello world message."""
    print("Hello, World!")

def add_numbers(a, b):
    """Add two numbers and return the result."""
    return a + b

class Calculator:
    """A simple calculator class."""

    def __init__(self):
        self.history = []

    def add(self, a, b):
        result = a + b
        self.history.append(f"{a} + {b} = {result}")
        return result
'''

        docs = [MockDocument(python_code, "/test/calculator.py", {"language": "python"})]

        chunks = create_ast_chunks(docs, max_chunk_size=200, chunk_overlap=50)

        # Should have at least one chunk covering the functions/classes
        assert len(chunks) > 0
        # Expect dict format with "text" and "metadata" keys
        assert all(isinstance(chunk, dict) for chunk in chunks), "All chunks should be dicts"
        assert all("text" in chunk and "metadata" in chunk for chunk in chunks), (
            "Each chunk should have 'text' and 'metadata' keys"
        )
        assert all(len(chunk["text"].strip()) > 0 for chunk in chunks), (
            "Each chunk text should be non-empty"
        )

        # Check metadata is present
        assert all("file_path" in chunk["metadata"] for chunk in chunks), (
            "Each chunk should have file_path metadata"
        )

        # Check that code structure is somewhat preserved
        combined_content = " ".join(c["text"] for c in chunks)
        assert "def hello_world" in combined_content
        assert "class Calculator" in combined_content

    def test_create_ast_chunks_fallback_to_traditional(self):
        """Test AST chunking falls back to traditional when astchunk is not available."""
        docs = [MockDocument("def test(): pass", "/test/script.py", {"language": "python"})]

        # A None entry in sys.modules makes every `import astchunk` raise
        # ImportError, simulating an environment without the package.
        # (Patching "chunking.create_ast_chunks" has no effect here: this module
        # already holds its own reference to the real function.)
        with patch.dict("sys.modules", {"astchunk": None}):
            chunks = create_ast_chunks(docs)

        assert isinstance(chunks, list)
        assert len(chunks) > 0  # Should still get chunks from fallback

    def test_create_text_chunks_traditional_mode(self):
        """Test text chunking in traditional mode."""
        docs = [
            MockDocument("def test(): pass", "/test/script.py"),
            MockDocument("This is regular text.", "/test/doc.txt"),
        ]

        chunks = create_text_chunks(docs, use_ast_chunking=False, chunk_size=50, chunk_overlap=10)

        assert len(chunks) > 0
        # Traditional chunking should also return dict format for consistency
        assert all(isinstance(chunk, dict) for chunk in chunks), "All chunks should be dicts"
        assert all("text" in chunk and "metadata" in chunk for chunk in chunks), (
            "Each chunk should have 'text' and 'metadata' keys"
        )

    def test_create_text_chunks_ast_mode(self):
        """Test text chunking in AST mode."""
        docs = [
            MockDocument("def test(): pass", "/test/script.py"),
            MockDocument("This is regular text.", "/test/doc.txt"),
        ]

        chunks = create_text_chunks(
            docs,
            use_ast_chunking=True,
            ast_chunk_size=100,
            ast_chunk_overlap=20,
            chunk_size=50,
            chunk_overlap=10,
        )

        assert len(chunks) > 0
        # AST mode should also return dict format
        assert all(isinstance(chunk, dict) for chunk in chunks), "All chunks should be dicts"
        assert all("text" in chunk and "metadata" in chunk for chunk in chunks), (
            "Each chunk should have 'text' and 'metadata' keys"
        )

    def test_create_text_chunks_custom_extensions(self):
        """Test text chunking with custom code file extensions."""
        docs = [
            MockDocument("function test() {}", "/test/script.js"),  # Not in default extensions
            MockDocument("Regular text", "/test/doc.txt"),
        ]

        # First without custom extensions - should treat .js as text
        chunks_without = create_text_chunks(docs, use_ast_chunking=True, code_file_extensions=None)

        # Then with custom extensions - should treat .js as code
        chunks_with = create_text_chunks(
            docs, use_ast_chunking=True, code_file_extensions=[".js", ".jsx"]
        )

        # Both should return chunks
        assert len(chunks_without) > 0
        assert len(chunks_with) > 0
|
|
|
|
|
|
class TestIntegrationWithDocumentRAG:
    """Integration tests with the document RAG system.

    Each test launches one of the ``apps/*.py`` scripts in a subprocess and
    inspects its exit code and combined output.
    """

    # Repository root (this file's grandparent directory). The app scripts are
    # addressed relative to it, so the tests no longer depend on the directory
    # pytest happens to be invoked from.
    _REPO_ROOT = Path(__file__).resolve().parent.parent

    def _run_app(self, cmd):
        """Run ``cmd`` from the repository root and return the CompletedProcess.

        Args:
            cmd: Argument list for ``subprocess.run``; script paths in it are
                interpreted relative to the repository root.

        Returns:
            The ``subprocess.CompletedProcess`` with captured text output.

        The calling test is skipped (via ``pytest.skip``) if the command does
        not finish within 5 minutes.
        """
        env = os.environ.copy()
        env["HF_HUB_DISABLE_SYMLINKS"] = "1"
        env["TOKENIZERS_PARALLELISM"] = "false"

        try:
            return subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                timeout=300,  # 5 minutes
                env=env,
                cwd=self._REPO_ROOT,
            )
        except subprocess.TimeoutExpired:
            pytest.skip("Test timed out - likely due to model download in CI")

    @pytest.fixture
    def temp_code_dir(self):
        """Create a temporary directory with sample code files."""
        with tempfile.TemporaryDirectory() as temp_dir:
            temp_path = Path(temp_dir)

            # Create sample Python file
            python_file = temp_path / "example.py"
            python_file.write_text('''
def fibonacci(n):
    """Calculate fibonacci number."""
    if n <= 1:
        return n
    return fibonacci(n-1) + fibonacci(n-2)

class MathUtils:
    @staticmethod
    def factorial(n):
        if n <= 1:
            return 1
        return n * MathUtils.factorial(n-1)
''')

            # Create sample text file
            text_file = temp_path / "readme.txt"
            text_file.write_text("This is a sample text file for testing purposes.")

            yield temp_path

    @pytest.mark.skipif(
        os.environ.get("CI") == "true",
        reason="Skip integration tests in CI to avoid dependency issues",
    )
    def test_document_rag_with_ast_chunking(self, temp_code_dir):
        """Test document RAG with AST chunking enabled."""
        with tempfile.TemporaryDirectory() as index_dir:
            cmd = [
                sys.executable,
                "apps/document_rag.py",
                "--llm",
                "simulated",
                "--embedding-model",
                "facebook/contriever",
                "--embedding-mode",
                "sentence-transformers",
                "--index-dir",
                index_dir,
                "--data-dir",
                str(temp_code_dir),
                "--enable-code-chunking",
                "--query",
                "How does the fibonacci function work?",
            ]

            result = self._run_app(cmd)

            # Should succeed even if astchunk is not available (fallback)
            assert result.returncode == 0, f"Command failed: {result.stderr}"

            output = result.stdout + result.stderr
            assert "Index saved to" in output or "Using existing index" in output

    @pytest.mark.skipif(
        os.environ.get("CI") == "true",
        reason="Skip integration tests in CI to avoid dependency issues",
    )
    def test_code_rag_application(self, temp_code_dir):
        """Test the specialized code RAG application."""
        with tempfile.TemporaryDirectory() as index_dir:
            cmd = [
                sys.executable,
                "apps/code_rag.py",
                "--llm",
                "simulated",
                "--embedding-model",
                "facebook/contriever",
                "--index-dir",
                index_dir,
                "--repo-dir",
                str(temp_code_dir),
                "--query",
                "What classes are defined in this code?",
            ]

            result = self._run_app(cmd)

            # Should succeed
            assert result.returncode == 0, f"Command failed: {result.stderr}"

            output = result.stdout + result.stderr
            assert "Using AST-aware chunking" in output or "traditional chunking" in output
|
|
|
|
|
|
class TestASTContentExtraction:
    """Regression tests for extracting chunk text from astchunk output.

    astchunk is replaced by a mock whose ``chunkify`` returns canned chunks in
    several shapes (dict with ``"content"``, dict with ``"text"``, plain string).
    In every case ``create_ast_chunks`` must put the code itself, never a
    stringified dict, into the resulting chunk's ``"text"`` field.
    """

    @staticmethod
    def _chunk_with_fake_astchunk(raw_chunks, doc):
        """Run ``create_ast_chunks([doc])`` with astchunk mocked to yield ``raw_chunks``."""
        builder = Mock()
        builder.chunkify.return_value = raw_chunks

        fake_astchunk = Mock()
        fake_astchunk.ASTChunkBuilder = Mock(return_value=builder)

        # Inject the fake into sys.modules so the import performed while
        # chunking resolves to it.
        with patch.dict("sys.modules", {"astchunk": fake_astchunk}):
            return create_ast_chunks([doc])

    def test_extract_content_from_astchunk_dict(self):
        """A dict chunk keyed by ``"content"`` has its content extracted as text.

        Guards against the earlier bug where only ``chunk["text"]`` was checked
        and a ``"content"``-keyed dict fell through to ``str(chunk)``.
        """
        source = "def hello():\n print('world')"
        raw_chunk = {
            "content": source,
            "metadata": {
                "filepath": "test.py",
                "line_count": 2,
                "start_line_no": 0,
                "end_line_no": 1,
                "node_count": 1,
            },
        }
        doc = MockDocument(source, "/test/test.py", {"language": "python"})

        chunks = self._chunk_with_fake_astchunk([raw_chunk], doc)

        assert len(chunks) > 0, "Should return at least one chunk"

        first = chunks[0]
        assert isinstance(first, dict), "Chunk should be a dict"
        assert "text" in first, "Chunk should have 'text' key"
        assert "metadata" in first, "Chunk should have 'metadata' key"

        chunk_text = first["text"]
        preview = chunk_text[:100]

        # The text field must not carry stringified-dict markers
        assert "'content':" not in chunk_text, (
            f"Chunk text contains stringified dict - extraction failed! Got: {preview}..."
        )
        assert "'metadata':" not in chunk_text, (
            f"Chunk text contains stringified metadata - extraction failed! Got: {preview}..."
        )
        assert "{" not in chunk_text or "def hello" in chunk_text.split("{")[0], (
            "Chunk text appears to be a stringified dict"
        )

        # The actual code must be present
        assert "def hello()" in chunk_text, "Should extract actual code content"
        assert "print('world')" in chunk_text, "Should extract complete code content"

        # Some form of the file path must survive in the metadata
        chunk_meta = first["metadata"]
        assert "filepath" in chunk_meta or "file_path" in chunk_meta, (
            "Should preserve file path metadata"
        )

    def test_extract_text_key_fallback(self):
        """A dict chunk keyed by ``"text"`` (older shape) is still extracted correctly."""
        source = "def legacy_function():\n return True"
        doc = MockDocument(source, "/test/legacy.py", {"language": "python"})

        chunks = self._chunk_with_fake_astchunk([{"text": source}], doc)

        assert len(chunks) > 0
        first = chunks[0]
        assert isinstance(first, dict), "Chunk should be a dict"
        assert "text" in first, "Chunk should have 'text' key"

        chunk_text = first["text"]

        # Must not be a stringified dict
        assert "'text':" not in chunk_text, "Should not stringify dict with 'text' key"

        # The actual code must be present
        assert "def legacy_function()" in chunk_text
        assert "return True" in chunk_text

    def test_handles_string_chunks(self):
        """A plain-string chunk is wrapped in a dict with its content preserved."""
        plain_string_chunk = "def simple_function():\n pass"
        doc = MockDocument(plain_string_chunk, "/test/simple.py", {"language": "python"})

        chunks = self._chunk_with_fake_astchunk([plain_string_chunk], doc)

        assert len(chunks) > 0
        first = chunks[0]
        assert isinstance(first, dict), "Even string chunks should be wrapped in dict"
        assert "text" in first, "Chunk should have 'text' key"

        chunk_text = first["text"]

        assert chunk_text == plain_string_chunk.strip(), (
            "Should preserve plain string chunk content"
        )
        assert "def simple_function()" in chunk_text
        assert "pass" in chunk_text

    def test_multiple_chunks_with_mixed_formats(self):
        """A mix of content-dict, text-dict and string chunks is all extracted cleanly."""
        mixed_chunks = [
            {"content": "def first():\n return 1", "metadata": {"line_count": 2}},
            "def second():\n return 2",  # Plain string
            {"text": "def third():\n return 3"},  # Old format
            {"content": "class MyClass:\n pass", "metadata": {"node_count": 1}},
        ]
        code = "def first():\n return 1\n\ndef second():\n return 2\n\ndef third():\n return 3\n\nclass MyClass:\n pass"
        doc = MockDocument(code, "/test/mixed.py", {"language": "python"})

        chunks = self._chunk_with_fake_astchunk(mixed_chunks, doc)

        assert len(chunks) == 4, "Should extract all 4 chunks"

        for idx, chunk in enumerate(chunks):
            assert isinstance(chunk, dict), f"Chunk {idx} should be a dict"
            assert "text" in chunk, f"Chunk {idx} should have 'text' key"
            assert "metadata" in chunk, f"Chunk {idx} should have 'metadata' key"

            chunk_text = chunk["text"]
            # None of the dict-repr markers may appear in the text
            for marker in ("'content':", "'metadata':", "'text':"):
                assert marker not in chunk_text, f"Chunk {idx} text is stringified (has {marker})"

        # Every definition must be present somewhere in the output
        combined = "\n".join(c["text"] for c in chunks)
        for snippet in ("def first()", "def second()", "def third()", "class MyClass:"):
            assert snippet in combined

    def test_empty_content_value_handling(self):
        """Chunks whose ``"content"`` is empty or whitespace are dropped, not stringified."""
        chunks_with_empty = [
            {"content": "", "metadata": {"line_count": 0}},  # Empty content
            {"content": " ", "metadata": {"line_count": 1}},  # Whitespace only
            {"content": "def valid():\n return True", "metadata": {"line_count": 2}},  # Valid
        ]
        doc = MockDocument(
            "def valid():\n return True", "/test/empty.py", {"language": "python"}
        )

        chunks = self._chunk_with_fake_astchunk(chunks_with_empty, doc)

        # Only the valid chunk should survive
        assert len(chunks) == 1, "Should filter out empty content chunks"

        survivor = chunks[0]
        assert isinstance(survivor, dict), "Chunk should be a dict"
        assert "text" in survivor, "Chunk should have 'text' key"
        assert "def valid()" in survivor["text"]

        # The empty dict must not have been stringified into the text
        assert "'content': ''" not in survivor["text"]
|
|
|
|
|
|
class TestASTMetadataPreservation:
    """Contract tests for metadata carried by chunk dictionaries.

    Each chunk is expected to be a dict with:
    - "text": str - the actual chunk content
    - "metadata": dict - metadata from the source document AND from astchunk
    """

    @staticmethod
    def _chunk_with_fake_astchunk(raw_chunks, doc):
        """Run ``create_ast_chunks([doc])`` with astchunk mocked to yield ``raw_chunks``."""
        builder = Mock()
        builder.chunkify.return_value = raw_chunks

        fake_astchunk = Mock()
        fake_astchunk.ASTChunkBuilder = Mock(return_value=builder)

        with patch.dict("sys.modules", {"astchunk": fake_astchunk}):
            return create_ast_chunks([doc])

    def test_ast_chunks_preserve_file_metadata(self):
        """Document-level metadata is copied into every chunk's metadata.

        Checks file_path, file_name, creation_date and last_modified_date on
        each chunk produced from a single document.
        """
        python_code = '''
def calculate_sum(numbers):
    """Calculate sum of numbers."""
    return sum(numbers)

class DataProcessor:
    """Process data records."""

    def process(self, data):
        return [x * 2 for x in data]
'''
        doc_metadata = {
            "language": "python",
            "file_path": "/project/src/utils.py",
            "file_name": "utils.py",
            "creation_date": "2024-01-15T10:30:00",
            "last_modified_date": "2024-10-31T15:45:00",
        }
        doc = MockDocument(
            python_code, file_path="/project/src/utils.py", metadata=doc_metadata
        )

        # Canned astchunk output carrying its own metadata
        astchunk_chunks = [
            {
                "content": "def calculate_sum(numbers):\n return sum(numbers)",
                "metadata": {
                    "filepath": "/project/src/utils.py",
                    "line_count": 2,
                    "start_line_no": 1,
                    "end_line_no": 2,
                    "node_count": 1,
                },
            },
            {
                "content": "class DataProcessor:\n def process(self, data):\n return [x * 2 for x in data]",
                "metadata": {
                    "filepath": "/project/src/utils.py",
                    "line_count": 3,
                    "start_line_no": 5,
                    "end_line_no": 7,
                    "node_count": 2,
                },
            },
        ]

        chunks = self._chunk_with_fake_astchunk(astchunk_chunks, doc)

        assert len(chunks) == 2, "Should return 2 chunks"

        # Document fields that every chunk must carry, in checking order
        expected_fields = (
            ("file_path", "/project/src/utils.py"),
            ("file_name", "utils.py"),
            ("creation_date", "2024-01-15T10:30:00"),
            ("last_modified_date", "2024-10-31T15:45:00"),
        )

        for idx, chunk in enumerate(chunks):
            # Structure
            assert isinstance(chunk, dict), f"Chunk {idx} should be dict, got {type(chunk)}"
            assert "text" in chunk, f"Chunk {idx} must have 'text' key"
            assert "metadata" in chunk, f"Chunk {idx} must have 'metadata' key"
            assert isinstance(chunk["metadata"], dict), f"Chunk {idx} metadata should be dict"

            # Document metadata preservation
            metadata = chunk["metadata"]
            for key, value in expected_fields:
                assert key in metadata, f"Chunk {idx} should preserve {key}"
                assert metadata[key] == value, f"Chunk {idx} {key} incorrect"

        # Chunks from the same document agree on file_path
        first, second = chunks
        assert first["metadata"]["file_path"] == second["metadata"]["file_path"], (
            "All chunks from same document should have same file_path"
        )

        # Text content is present and not stringified
        assert "def calculate_sum" in first["text"]
        assert "class DataProcessor" in second["text"]

    def test_ast_chunks_include_astchunk_metadata(self):
        """astchunk's own metadata is merged with, not substituted for, document metadata.

        Checks line_count, start_line_no, end_line_no and node_count per chunk,
        alongside the document's file_path and file_name.
        """
        python_code = '''
def function_one():
    """First function."""
    x = 1
    y = 2
    return x + y

def function_two():
    """Second function."""
    return 42
'''
        doc = MockDocument(
            python_code,
            file_path="/test/code.py",
            metadata={
                "language": "python",
                "file_path": "/test/code.py",
                "file_name": "code.py",
            },
        )

        # Canned astchunk output with detailed metadata
        astchunk_chunks = [
            {
                "content": "def function_one():\n x = 1\n y = 2\n return x + y",
                "metadata": {
                    "filepath": "/test/code.py",
                    "line_count": 4,
                    "start_line_no": 1,
                    "end_line_no": 4,
                    "node_count": 5,  # function, assignments, return
                },
            },
            {
                "content": "def function_two():\n return 42",
                "metadata": {
                    "filepath": "/test/code.py",
                    "line_count": 2,
                    "start_line_no": 7,
                    "end_line_no": 8,
                    "node_count": 2,  # function, return
                },
            },
        ]

        chunks = self._chunk_with_fake_astchunk(astchunk_chunks, doc)

        assert len(chunks) == 2

        # First chunk - function_one
        chunk1 = chunks[0]
        assert isinstance(chunk1, dict), "Chunk should be dict"
        assert "metadata" in chunk1

        metadata1 = chunk1["metadata"]
        expected_first = (
            ("line_count", 4),
            ("start_line_no", 1),
            ("end_line_no", 4),
            ("node_count", 5),
        )
        for key, value in expected_first:
            assert key in metadata1, f"Should include astchunk {key}"
            assert metadata1[key] == value, f"{key} should be {value}"

        # Second chunk - function_two
        chunk2 = chunks[1]
        metadata2 = chunk2["metadata"]
        expected_second = (
            ("line_count", 2),
            ("start_line_no", 7),
            ("end_line_no", 8),
            ("node_count", 2),
        )
        for key, value in expected_second:
            assert metadata2[key] == value, f"{key} should be {value}"

        # Document metadata is ALSO present (merged, not replaced)
        for metadata in (metadata1, metadata2):
            assert metadata["file_path"] == "/test/code.py"
            assert metadata["file_name"] == "code.py"

        # Text content is correct
        assert "def function_one" in chunk1["text"]
        assert "def function_two" in chunk2["text"]

    def test_traditional_chunks_as_dicts_helper(self):
        """``create_traditional_chunks`` returns dict chunks tagged with their source metadata.

        Two documents with different metadata are chunked together; every chunk
        must carry file_path/file_name, and the per-document date fields must
        appear on the chunks derived from that document.
        """
        docs = [
            MockDocument(
                "This is the first paragraph of text. It contains multiple sentences. "
                "This should be split into chunks based on size.",
                file_path="/docs/readme.txt",
                metadata={
                    "file_path": "/docs/readme.txt",
                    "file_name": "readme.txt",
                    "creation_date": "2024-01-01",
                },
            ),
            MockDocument(
                "Second document with different metadata. It also has content that needs chunking.",
                file_path="/docs/guide.md",
                metadata={
                    "file_path": "/docs/guide.md",
                    "file_name": "guide.md",
                    "last_modified_date": "2024-10-31",
                },
            ),
        ]

        chunks = create_traditional_chunks(docs, chunk_size=50, chunk_overlap=10)

        assert len(chunks) > 0, "Should return chunks"

        for idx, chunk in enumerate(chunks):
            # Structure
            assert isinstance(chunk, dict), f"Chunk {idx} should be dict, got {type(chunk)}"
            assert "text" in chunk, f"Chunk {idx} must have 'text' key"
            assert "metadata" in chunk, f"Chunk {idx} must have 'metadata' key"

            # Text should be non-empty
            assert len(chunk["text"].strip()) > 0, f"Chunk {idx} text should be non-empty"

            # Metadata should include document info
            metadata = chunk["metadata"]
            for key in ("file_path", "file_name"):
                assert key in metadata, f"Chunk {idx} should have {key} in metadata"

        # At least one chunk should come from each source document
        readme_chunks = [c for c in chunks if "readme.txt" in c["metadata"]["file_name"]]
        assert len(readme_chunks) > 0, "Should have chunks from readme.txt"

        guide_chunks = [c for c in chunks if "guide.md" in c["metadata"]["file_name"]]
        assert len(guide_chunks) > 0, "Should have chunks from guide.md"

        # creation_date is preserved for readme chunks
        for chunk in readme_chunks:
            assert chunk["metadata"].get("creation_date") == "2024-01-01", (
                "readme.txt chunks should preserve creation_date"
            )

        # last_modified_date is preserved for guide chunks
        for chunk in guide_chunks:
            assert chunk["metadata"].get("last_modified_date") == "2024-10-31", (
                "guide.md chunks should preserve last_modified_date"
            )

        # Text content from both documents is present
        all_text = " ".join(c["text"] for c in chunks)
        assert "first paragraph" in all_text
        assert "Second document" in all_text
|
|
|
|
|
|
class TestErrorHandling:
    """Test error handling and edge cases.

    These tests only require that the chunking functions return a list (rather
    than raising) for empty, degenerate, or incomplete input.
    """

    def test_text_chunking_empty_documents(self):
        """Test text chunking with empty document list."""
        chunks = create_text_chunks([])
        assert chunks == []

    def test_text_chunking_invalid_parameters(self):
        """Test text chunking with invalid (zero) size and overlap parameters."""
        docs = [MockDocument("test content")]

        # Zero is the degenerate value exercised here for every size/overlap knob
        chunks = create_text_chunks(
            docs, chunk_size=0, chunk_overlap=0, ast_chunk_size=0, ast_chunk_overlap=0
        )

        # Should still return some result
        assert isinstance(chunks, list)

    def test_create_ast_chunks_no_language(self):
        """Test AST chunking with documents missing language metadata."""
        docs = [MockDocument("def test(): pass", "/test/script.py")]  # No language set

        chunks = create_ast_chunks(docs)

        # Only the return type is checked: the list may be empty if the fallback
        # produces nothing. (A former `len(chunks) >= 0` assertion was dropped
        # because it can never fail.)
        assert isinstance(chunks, list)

    def test_create_ast_chunks_empty_content(self):
        """Test AST chunking with empty content."""
        docs = [MockDocument("", "/test/script.py", {"language": "python"})]

        chunks = create_ast_chunks(docs)

        # Should handle empty content gracefully
        assert isinstance(chunks, list)
|
|
|
|
|
|
if __name__ == "__main__":
    # Allow running this file directly. pytest.main() returns the run's exit
    # code; pass it to sys.exit so failures yield a non-zero process status
    # instead of always exiting 0.
    sys.exit(pytest.main([__file__, "-v"]))
|