""" Test suite for astchunk integration with LEANN. Tests AST-aware chunking functionality, language detection, and fallback mechanisms. """ import os import subprocess import sys import tempfile from pathlib import Path from unittest.mock import Mock, patch import pytest # Add apps directory to path for imports sys.path.insert(0, str(Path(__file__).parent.parent / "apps")) from typing import Optional from chunking import ( create_ast_chunks, create_text_chunks, create_traditional_chunks, detect_code_files, get_language_from_extension, ) class MockDocument: """Mock LlamaIndex Document for testing.""" def __init__(self, content: str, file_path: str = "", metadata: Optional[dict] = None): self.content = content self.metadata = metadata or {} if file_path: self.metadata["file_path"] = file_path def get_content(self) -> str: return self.content class TestCodeFileDetection: """Test code file detection and language mapping.""" def test_detect_code_files_python(self): """Test detection of Python files.""" docs = [ MockDocument("print('hello')", "/path/to/file.py"), MockDocument("This is text", "/path/to/file.txt"), ] code_docs, text_docs = detect_code_files(docs) assert len(code_docs) == 1 assert len(text_docs) == 1 assert code_docs[0].metadata["language"] == "python" assert code_docs[0].metadata["is_code"] is True assert text_docs[0].metadata["is_code"] is False def test_detect_code_files_multiple_languages(self): """Test detection of multiple programming languages.""" docs = [ MockDocument("def func():", "/path/to/script.py"), MockDocument("public class Test {}", "/path/to/Test.java"), MockDocument("interface ITest {}", "/path/to/test.ts"), MockDocument("using System;", "/path/to/Program.cs"), MockDocument("Regular text content", "/path/to/document.txt"), ] code_docs, text_docs = detect_code_files(docs) assert len(code_docs) == 4 assert len(text_docs) == 1 languages = [doc.metadata["language"] for doc in code_docs] assert "python" in languages assert "java" in languages assert "typescript" in languages assert "csharp" in languages def test_detect_code_files_no_file_path(self): """Test handling of documents without file paths.""" docs = [ MockDocument("some content"), MockDocument("other content", metadata={"some_key": "value"}), ] code_docs, text_docs = detect_code_files(docs) assert len(code_docs) == 0 assert len(text_docs) == 2 for doc in text_docs: assert doc.metadata["is_code"] is False def test_get_language_from_extension(self): """Test language detection from file extensions.""" assert get_language_from_extension("test.py") == "python" assert get_language_from_extension("Test.java") == "java" assert get_language_from_extension("component.tsx") == "typescript" assert get_language_from_extension("Program.cs") == "csharp" assert get_language_from_extension("document.txt") is None assert get_language_from_extension("") is None class TestChunkingFunctions: """Test various chunking functionality.""" def test_create_traditional_chunks(self): """Test traditional text chunking.""" docs = [ MockDocument( "This is a test document. It has multiple sentences. We want to test chunking." ) ] chunks = create_traditional_chunks(docs, chunk_size=50, chunk_overlap=10) assert len(chunks) > 0 # Traditional chunks now return dict format for consistency assert all(isinstance(chunk, dict) for chunk in chunks) assert all("text" in chunk and "metadata" in chunk for chunk in chunks) assert all(len(chunk["text"].strip()) > 0 for chunk in chunks) def test_create_traditional_chunks_empty_docs(self): """Test traditional chunking with empty documents.""" chunks = create_traditional_chunks([], chunk_size=50, chunk_overlap=10) assert chunks == [] @pytest.mark.skipif( os.environ.get("CI") == "true", reason="Skip astchunk tests in CI - dependency may not be available", ) def test_create_ast_chunks_with_astchunk_available(self): """Test AST chunking when astchunk is available.""" python_code = ''' def hello_world(): """Print hello world message.""" print("Hello, World!") def add_numbers(a, b): """Add two numbers and return the result.""" return a + b class Calculator: """A simple calculator class.""" def __init__(self): self.history = [] def add(self, a, b): result = a + b self.history.append(f"{a} + {b} = {result}") return result ''' docs = [MockDocument(python_code, "/test/calculator.py", {"language": "python"})] try: chunks = create_ast_chunks(docs, max_chunk_size=200, chunk_overlap=50) # Should have multiple chunks due to different functions/classes assert len(chunks) > 0 # R3: Expect dict format with "text" and "metadata" keys assert all(isinstance(chunk, dict) for chunk in chunks), "All chunks should be dicts" assert all("text" in chunk and "metadata" in chunk for chunk in chunks), ( "Each chunk should have 'text' and 'metadata' keys" ) assert all(len(chunk["text"].strip()) > 0 for chunk in chunks), ( "Each chunk text should be non-empty" ) # Check metadata is present assert all("file_path" in chunk["metadata"] for chunk in chunks), ( "Each chunk should have file_path metadata" ) # Check that code structure is somewhat preserved combined_content = " ".join([c["text"] for c in chunks]) assert "def hello_world" in combined_content assert "class Calculator" in combined_content except ImportError: # astchunk not available, should fall back to traditional chunking chunks = create_ast_chunks(docs, max_chunk_size=200, chunk_overlap=50) assert len(chunks) > 0 # Should still get chunks from fallback def test_create_ast_chunks_fallback_to_traditional(self): """Test AST chunking falls back to traditional when astchunk is not available.""" docs = [MockDocument("def test(): pass", "/test/script.py", {"language": "python"})] # Mock astchunk import to fail with patch("chunking.create_ast_chunks"): # First call (actual test) should import astchunk and potentially fail # Let's call the actual function to test the import error handling chunks = create_ast_chunks(docs) # Should return some chunks (either from astchunk or fallback) assert isinstance(chunks, list) def test_create_text_chunks_traditional_mode(self): """Test text chunking in traditional mode.""" docs = [ MockDocument("def test(): pass", "/test/script.py"), MockDocument("This is regular text.", "/test/doc.txt"), ] chunks = create_text_chunks(docs, use_ast_chunking=False, chunk_size=50, chunk_overlap=10) assert len(chunks) > 0 # R3: Traditional chunking should also return dict format for consistency assert all(isinstance(chunk, dict) for chunk in chunks), "All chunks should be dicts" assert all("text" in chunk and "metadata" in chunk for chunk in chunks), ( "Each chunk should have 'text' and 'metadata' keys" ) def test_create_text_chunks_ast_mode(self): """Test text chunking in AST mode.""" docs = [ MockDocument("def test(): pass", "/test/script.py"), MockDocument("This is regular text.", "/test/doc.txt"), ] chunks = create_text_chunks( docs, use_ast_chunking=True, ast_chunk_size=100, ast_chunk_overlap=20, chunk_size=50, chunk_overlap=10, ) assert len(chunks) > 0 # R3: AST mode should also return dict format assert all(isinstance(chunk, dict) for chunk in chunks), "All chunks should be dicts" assert all("text" in chunk and "metadata" in chunk for chunk in chunks), ( "Each chunk should have 'text' and 'metadata' keys" ) def test_create_text_chunks_custom_extensions(self): """Test text chunking with custom code file extensions.""" docs = [ MockDocument("function test() {}", "/test/script.js"), # Not in default extensions MockDocument("Regular text", "/test/doc.txt"), ] # First without custom extensions - should treat .js as text chunks_without = create_text_chunks(docs, use_ast_chunking=True, code_file_extensions=None) # Then with custom extensions - should treat .js as code chunks_with = create_text_chunks( docs, use_ast_chunking=True, code_file_extensions=[".js", ".jsx"] ) # Both should return chunks assert len(chunks_without) > 0 assert len(chunks_with) > 0 class TestIntegrationWithDocumentRAG: """Integration tests with the document RAG system.""" @pytest.fixture def temp_code_dir(self): """Create a temporary directory with sample code files.""" with tempfile.TemporaryDirectory() as temp_dir: temp_path = Path(temp_dir) # Create sample Python file python_file = temp_path / "example.py" python_file.write_text(''' def fibonacci(n): """Calculate fibonacci number.""" if n <= 1: return n return fibonacci(n-1) + fibonacci(n-2) class MathUtils: @staticmethod def factorial(n): if n <= 1: return 1 return n * MathUtils.factorial(n-1) ''') # Create sample text file text_file = temp_path / "readme.txt" text_file.write_text("This is a sample text file for testing purposes.") yield temp_path @pytest.mark.skipif( os.environ.get("CI") == "true", reason="Skip integration tests in CI to avoid dependency issues", ) def test_document_rag_with_ast_chunking(self, temp_code_dir): """Test document RAG with AST chunking enabled.""" with tempfile.TemporaryDirectory() as index_dir: cmd = [ sys.executable, "apps/document_rag.py", "--llm", "simulated", "--embedding-model", "facebook/contriever", "--embedding-mode", "sentence-transformers", "--index-dir", index_dir, "--data-dir", str(temp_code_dir), "--enable-code-chunking", "--query", "How does the fibonacci function work?", ] env = os.environ.copy() env["HF_HUB_DISABLE_SYMLINKS"] = "1" env["TOKENIZERS_PARALLELISM"] = "false" try: result = subprocess.run( cmd, capture_output=True, text=True, timeout=300, # 5 minutes env=env, ) # Should succeed even if astchunk is not available (fallback) assert result.returncode == 0, f"Command failed: {result.stderr}" output = result.stdout + result.stderr assert "Index saved to" in output or "Using existing index" in output except subprocess.TimeoutExpired: pytest.skip("Test timed out - likely due to model download in CI") @pytest.mark.skipif( os.environ.get("CI") == "true", reason="Skip integration tests in CI to avoid dependency issues", ) def test_code_rag_application(self, temp_code_dir): """Test the specialized code RAG application.""" with tempfile.TemporaryDirectory() as index_dir: cmd = [ sys.executable, "apps/code_rag.py", "--llm", "simulated", "--embedding-model", "facebook/contriever", "--index-dir", index_dir, "--repo-dir", str(temp_code_dir), "--query", "What classes are defined in this code?", ] env = os.environ.copy() env["HF_HUB_DISABLE_SYMLINKS"] = "1" env["TOKENIZERS_PARALLELISM"] = "false" try: result = subprocess.run(cmd, capture_output=True, text=True, timeout=300, env=env) # Should succeed assert result.returncode == 0, f"Command failed: {result.stderr}" output = result.stdout + result.stderr assert "Using AST-aware chunking" in output or "traditional chunking" in output except subprocess.TimeoutExpired: pytest.skip("Test timed out - likely due to model download in CI") class TestASTContentExtraction: """Test AST content extraction bug fix. These tests verify that astchunk's dict format with 'content' key is handled correctly, and that the extraction logic doesn't fall through to stringifying entire dicts. """ def test_extract_content_from_astchunk_dict(self): """Test that astchunk dict format with 'content' key is handled correctly. Bug: Current code checks for chunk["text"] but astchunk returns chunk["content"]. This causes fallthrough to str(chunk), stringifying the entire dict. This test will FAIL until the bug is fixed because: - Current code will stringify the dict: "{'content': '...', 'metadata': {...}}" - Fixed code should extract just the content value """ # Mock the ASTChunkBuilder class mock_builder = Mock() # Astchunk returns this format astchunk_format_chunk = { "content": "def hello():\n print('world')", "metadata": { "filepath": "test.py", "line_count": 2, "start_line_no": 0, "end_line_no": 1, "node_count": 1, }, } mock_builder.chunkify.return_value = [astchunk_format_chunk] # Create mock document doc = MockDocument( "def hello():\n print('world')", "/test/test.py", {"language": "python"} ) # Mock the astchunk module and its ASTChunkBuilder class mock_astchunk = Mock() mock_astchunk.ASTChunkBuilder = Mock(return_value=mock_builder) # Patch sys.modules to inject our mock before the import with patch.dict("sys.modules", {"astchunk": mock_astchunk}): # Call create_ast_chunks chunks = create_ast_chunks([doc]) # R3: Should return dict format with proper metadata assert len(chunks) > 0, "Should return at least one chunk" # R3: Each chunk should be a dict chunk = chunks[0] assert isinstance(chunk, dict), "Chunk should be a dict" assert "text" in chunk, "Chunk should have 'text' key" assert "metadata" in chunk, "Chunk should have 'metadata' key" chunk_text = chunk["text"] # CRITICAL: Should NOT contain stringified dict markers in the text field # These assertions will FAIL with current buggy code assert "'content':" not in chunk_text, ( f"Chunk text contains stringified dict - extraction failed! Got: {chunk_text[:100]}..." ) assert "'metadata':" not in chunk_text, ( "Chunk text contains stringified metadata - extraction failed! " f"Got: {chunk_text[:100]}..." ) assert "{" not in chunk_text or "def hello" in chunk_text.split("{")[0], ( "Chunk text appears to be a stringified dict" ) # Should contain actual content assert "def hello()" in chunk_text, "Should extract actual code content" assert "print('world')" in chunk_text, "Should extract complete code content" # R3: Should preserve astchunk metadata assert "filepath" in chunk["metadata"] or "file_path" in chunk["metadata"], ( "Should preserve file path metadata" ) def test_extract_text_key_fallback(self): """Test that 'text' key still works for backward compatibility. Some chunks might use 'text' instead of 'content' - ensure backward compatibility. This test should PASS even with current code. """ mock_builder = Mock() # Some chunks might use "text" key text_key_chunk = {"text": "def legacy_function():\n return True"} mock_builder.chunkify.return_value = [text_key_chunk] # Create mock document doc = MockDocument( "def legacy_function():\n return True", "/test/legacy.py", {"language": "python"} ) # Mock the astchunk module mock_astchunk = Mock() mock_astchunk.ASTChunkBuilder = Mock(return_value=mock_builder) with patch.dict("sys.modules", {"astchunk": mock_astchunk}): # Call create_ast_chunks chunks = create_ast_chunks([doc]) # R3: Should extract text correctly as dict format assert len(chunks) > 0 chunk = chunks[0] assert isinstance(chunk, dict), "Chunk should be a dict" assert "text" in chunk, "Chunk should have 'text' key" chunk_text = chunk["text"] # Should NOT be stringified assert "'text':" not in chunk_text, "Should not stringify dict with 'text' key" # Should contain actual content assert "def legacy_function()" in chunk_text assert "return True" in chunk_text def test_handles_string_chunks(self): """Test that plain string chunks still work. Some chunkers might return plain strings - verify these are preserved. This test should PASS with current code. """ mock_builder = Mock() # Plain string chunk plain_string_chunk = "def simple_function():\n pass" mock_builder.chunkify.return_value = [plain_string_chunk] # Create mock document doc = MockDocument( "def simple_function():\n pass", "/test/simple.py", {"language": "python"} ) # Mock the astchunk module mock_astchunk = Mock() mock_astchunk.ASTChunkBuilder = Mock(return_value=mock_builder) with patch.dict("sys.modules", {"astchunk": mock_astchunk}): # Call create_ast_chunks chunks = create_ast_chunks([doc]) # R3: Should wrap string in dict format assert len(chunks) > 0 chunk = chunks[0] assert isinstance(chunk, dict), "Even string chunks should be wrapped in dict" assert "text" in chunk, "Chunk should have 'text' key" chunk_text = chunk["text"] assert chunk_text == plain_string_chunk.strip(), ( "Should preserve plain string chunk content" ) assert "def simple_function()" in chunk_text assert "pass" in chunk_text def test_multiple_chunks_with_mixed_formats(self): """Test handling of multiple chunks with different formats. Real-world scenario: astchunk might return a mix of formats. This test will FAIL if any chunk with 'content' key gets stringified. """ mock_builder = Mock() # Mix of formats mixed_chunks = [ {"content": "def first():\n return 1", "metadata": {"line_count": 2}}, "def second():\n return 2", # Plain string {"text": "def third():\n return 3"}, # Old format {"content": "class MyClass:\n pass", "metadata": {"node_count": 1}}, ] mock_builder.chunkify.return_value = mixed_chunks # Create mock document code = "def first():\n return 1\n\ndef second():\n return 2\n\ndef third():\n return 3\n\nclass MyClass:\n pass" doc = MockDocument(code, "/test/mixed.py", {"language": "python"}) # Mock the astchunk module mock_astchunk = Mock() mock_astchunk.ASTChunkBuilder = Mock(return_value=mock_builder) with patch.dict("sys.modules", {"astchunk": mock_astchunk}): # Call create_ast_chunks chunks = create_ast_chunks([doc]) # R3: Should extract all chunks correctly as dicts assert len(chunks) == 4, "Should extract all 4 chunks" # Check each chunk for i, chunk in enumerate(chunks): assert isinstance(chunk, dict), f"Chunk {i} should be a dict" assert "text" in chunk, f"Chunk {i} should have 'text' key" assert "metadata" in chunk, f"Chunk {i} should have 'metadata' key" chunk_text = chunk["text"] # None should be stringified dicts assert "'content':" not in chunk_text, f"Chunk {i} text is stringified (has 'content':)" assert "'metadata':" not in chunk_text, ( f"Chunk {i} text is stringified (has 'metadata':)" ) assert "'text':" not in chunk_text, f"Chunk {i} text is stringified (has 'text':)" # Verify actual content is present combined = "\n".join([c["text"] for c in chunks]) assert "def first()" in combined assert "def second()" in combined assert "def third()" in combined assert "class MyClass:" in combined def test_empty_content_value_handling(self): """Test handling of chunks with empty content values. Edge case: chunk has 'content' key but value is empty. Should skip these chunks, not stringify them. """ mock_builder = Mock() chunks_with_empty = [ {"content": "", "metadata": {"line_count": 0}}, # Empty content {"content": " ", "metadata": {"line_count": 1}}, # Whitespace only {"content": "def valid():\n return True", "metadata": {"line_count": 2}}, # Valid ] mock_builder.chunkify.return_value = chunks_with_empty doc = MockDocument( "def valid():\n return True", "/test/empty.py", {"language": "python"} ) # Mock the astchunk module mock_astchunk = Mock() mock_astchunk.ASTChunkBuilder = Mock(return_value=mock_builder) with patch.dict("sys.modules", {"astchunk": mock_astchunk}): chunks = create_ast_chunks([doc]) # R3: Should only have the valid chunk (empty ones filtered out) assert len(chunks) == 1, "Should filter out empty content chunks" chunk = chunks[0] assert isinstance(chunk, dict), "Chunk should be a dict" assert "text" in chunk, "Chunk should have 'text' key" assert "def valid()" in chunk["text"] # Should not have stringified the empty dict assert "'content': ''" not in chunk["text"] class TestASTMetadataPreservation: """Test metadata preservation in AST chunk dictionaries. R3: These tests define the contract for metadata preservation when returning chunk dictionaries instead of plain strings. Each chunk dict should have: - "text": str - the actual chunk content - "metadata": dict - all metadata from document AND astchunk These tests will FAIL until G3 implementation changes return type to list[dict]. """ def test_ast_chunks_preserve_file_metadata(self): """Test that document metadata is preserved in chunk metadata. This test verifies that all document-level metadata (file_path, file_name, creation_date, last_modified_date) is included in each chunk's metadata dict. This will FAIL because current code returns list[str], not list[dict]. """ # Create mock document with rich metadata python_code = ''' def calculate_sum(numbers): """Calculate sum of numbers.""" return sum(numbers) class DataProcessor: """Process data records.""" def process(self, data): return [x * 2 for x in data] ''' doc = MockDocument( python_code, file_path="/project/src/utils.py", metadata={ "language": "python", "file_path": "/project/src/utils.py", "file_name": "utils.py", "creation_date": "2024-01-15T10:30:00", "last_modified_date": "2024-10-31T15:45:00", }, ) # Mock astchunk to return chunks with metadata mock_builder = Mock() astchunk_chunks = [ { "content": "def calculate_sum(numbers):\n return sum(numbers)", "metadata": { "filepath": "/project/src/utils.py", "line_count": 2, "start_line_no": 1, "end_line_no": 2, "node_count": 1, }, }, { "content": "class DataProcessor:\n def process(self, data):\n return [x * 2 for x in data]", "metadata": { "filepath": "/project/src/utils.py", "line_count": 3, "start_line_no": 5, "end_line_no": 7, "node_count": 2, }, }, ] mock_builder.chunkify.return_value = astchunk_chunks mock_astchunk = Mock() mock_astchunk.ASTChunkBuilder = Mock(return_value=mock_builder) with patch.dict("sys.modules", {"astchunk": mock_astchunk}): chunks = create_ast_chunks([doc]) # CRITICAL: These assertions will FAIL with current list[str] return type assert len(chunks) == 2, "Should return 2 chunks" for i, chunk in enumerate(chunks): # Structure assertions - WILL FAIL: current code returns strings assert isinstance(chunk, dict), f"Chunk {i} should be dict, got {type(chunk)}" assert "text" in chunk, f"Chunk {i} must have 'text' key" assert "metadata" in chunk, f"Chunk {i} must have 'metadata' key" assert isinstance(chunk["metadata"], dict), f"Chunk {i} metadata should be dict" # Document metadata preservation - WILL FAIL metadata = chunk["metadata"] assert "file_path" in metadata, f"Chunk {i} should preserve file_path" assert metadata["file_path"] == "/project/src/utils.py", ( f"Chunk {i} file_path incorrect" ) assert "file_name" in metadata, f"Chunk {i} should preserve file_name" assert metadata["file_name"] == "utils.py", f"Chunk {i} file_name incorrect" assert "creation_date" in metadata, f"Chunk {i} should preserve creation_date" assert metadata["creation_date"] == "2024-01-15T10:30:00", ( f"Chunk {i} creation_date incorrect" ) assert "last_modified_date" in metadata, f"Chunk {i} should preserve last_modified_date" assert metadata["last_modified_date"] == "2024-10-31T15:45:00", ( f"Chunk {i} last_modified_date incorrect" ) # Verify metadata is consistent across chunks from same document assert chunks[0]["metadata"]["file_path"] == chunks[1]["metadata"]["file_path"], ( "All chunks from same document should have same file_path" ) # Verify text content is present and not stringified assert "def calculate_sum" in chunks[0]["text"] assert "class DataProcessor" in chunks[1]["text"] def test_ast_chunks_include_astchunk_metadata(self): """Test that astchunk-specific metadata is merged into chunk metadata. This test verifies that astchunk's metadata (line_count, start_line_no, end_line_no, node_count) is merged with document metadata. This will FAIL because current code returns list[str], not list[dict]. """ python_code = ''' def function_one(): """First function.""" x = 1 y = 2 return x + y def function_two(): """Second function.""" return 42 ''' doc = MockDocument( python_code, file_path="/test/code.py", metadata={ "language": "python", "file_path": "/test/code.py", "file_name": "code.py", }, ) # Mock astchunk with detailed metadata mock_builder = Mock() astchunk_chunks = [ { "content": "def function_one():\n x = 1\n y = 2\n return x + y", "metadata": { "filepath": "/test/code.py", "line_count": 4, "start_line_no": 1, "end_line_no": 4, "node_count": 5, # function, assignments, return }, }, { "content": "def function_two():\n return 42", "metadata": { "filepath": "/test/code.py", "line_count": 2, "start_line_no": 7, "end_line_no": 8, "node_count": 2, # function, return }, }, ] mock_builder.chunkify.return_value = astchunk_chunks mock_astchunk = Mock() mock_astchunk.ASTChunkBuilder = Mock(return_value=mock_builder) with patch.dict("sys.modules", {"astchunk": mock_astchunk}): chunks = create_ast_chunks([doc]) # CRITICAL: These will FAIL with current list[str] return assert len(chunks) == 2 # First chunk - function_one chunk1 = chunks[0] assert isinstance(chunk1, dict), "Chunk should be dict" assert "metadata" in chunk1 metadata1 = chunk1["metadata"] # Check astchunk metadata is present assert "line_count" in metadata1, "Should include astchunk line_count" assert metadata1["line_count"] == 4, "line_count should be 4" assert "start_line_no" in metadata1, "Should include astchunk start_line_no" assert metadata1["start_line_no"] == 1, "start_line_no should be 1" assert "end_line_no" in metadata1, "Should include astchunk end_line_no" assert metadata1["end_line_no"] == 4, "end_line_no should be 4" assert "node_count" in metadata1, "Should include astchunk node_count" assert metadata1["node_count"] == 5, "node_count should be 5" # Second chunk - function_two chunk2 = chunks[1] metadata2 = chunk2["metadata"] assert metadata2["line_count"] == 2, "line_count should be 2" assert metadata2["start_line_no"] == 7, "start_line_no should be 7" assert metadata2["end_line_no"] == 8, "end_line_no should be 8" assert metadata2["node_count"] == 2, "node_count should be 2" # Verify document metadata is ALSO present (merged, not replaced) assert metadata1["file_path"] == "/test/code.py" assert metadata1["file_name"] == "code.py" assert metadata2["file_path"] == "/test/code.py" assert metadata2["file_name"] == "code.py" # Verify text content is correct assert "def function_one" in chunk1["text"] assert "def function_two" in chunk2["text"] def test_traditional_chunks_as_dicts_helper(self): """Test the helper function that wraps traditional chunks as dicts. This test verifies that when create_traditional_chunks is called, its plain string chunks are wrapped into dict format with metadata. This will FAIL because the helper function _traditional_chunks_as_dicts() doesn't exist yet, and create_traditional_chunks returns list[str]. """ # Create documents with various metadata docs = [ MockDocument( "This is the first paragraph of text. It contains multiple sentences. " "This should be split into chunks based on size.", file_path="/docs/readme.txt", metadata={ "file_path": "/docs/readme.txt", "file_name": "readme.txt", "creation_date": "2024-01-01", }, ), MockDocument( "Second document with different metadata. It also has content that needs chunking.", file_path="/docs/guide.md", metadata={ "file_path": "/docs/guide.md", "file_name": "guide.md", "last_modified_date": "2024-10-31", }, ), ] # Call create_traditional_chunks (which should now return list[dict]) chunks = create_traditional_chunks(docs, chunk_size=50, chunk_overlap=10) # CRITICAL: Will FAIL - current code returns list[str] assert len(chunks) > 0, "Should return chunks" for i, chunk in enumerate(chunks): # Structure assertions - WILL FAIL assert isinstance(chunk, dict), f"Chunk {i} should be dict, got {type(chunk)}" assert "text" in chunk, f"Chunk {i} must have 'text' key" assert "metadata" in chunk, f"Chunk {i} must have 'metadata' key" # Text should be non-empty assert len(chunk["text"].strip()) > 0, f"Chunk {i} text should be non-empty" # Metadata should include document info metadata = chunk["metadata"] assert "file_path" in metadata, f"Chunk {i} should have file_path in metadata" assert "file_name" in metadata, f"Chunk {i} should have file_name in metadata" # Verify metadata tracking works correctly # At least one chunk should be from readme.txt readme_chunks = [c for c in chunks if "readme.txt" in c["metadata"]["file_name"]] assert len(readme_chunks) > 0, "Should have chunks from readme.txt" # At least one chunk should be from guide.md guide_chunks = [c for c in chunks if "guide.md" in c["metadata"]["file_name"]] assert len(guide_chunks) > 0, "Should have chunks from guide.md" # Verify creation_date is preserved for readme chunks for chunk in readme_chunks: assert chunk["metadata"].get("creation_date") == "2024-01-01", ( "readme.txt chunks should preserve creation_date" ) # Verify last_modified_date is preserved for guide chunks for chunk in guide_chunks: assert chunk["metadata"].get("last_modified_date") == "2024-10-31", ( "guide.md chunks should preserve last_modified_date" ) # Verify text content is present all_text = " ".join([c["text"] for c in chunks]) assert "first paragraph" in all_text assert "Second document" in all_text class TestErrorHandling: """Test error handling and edge cases.""" def test_text_chunking_empty_documents(self): """Test text chunking with empty document list.""" chunks = create_text_chunks([]) assert chunks == [] def test_text_chunking_invalid_parameters(self): """Test text chunking with invalid parameters.""" docs = [MockDocument("test content")] # Should handle negative chunk sizes gracefully chunks = create_text_chunks( docs, chunk_size=0, chunk_overlap=0, ast_chunk_size=0, ast_chunk_overlap=0 ) # Should still return some result assert isinstance(chunks, list) def test_create_ast_chunks_no_language(self): """Test AST chunking with documents missing language metadata.""" docs = [MockDocument("def test(): pass", "/test/script.py")] # No language set chunks = create_ast_chunks(docs) # Should fall back to traditional chunking assert isinstance(chunks, list) assert len(chunks) >= 0 # May be empty if fallback also fails def test_create_ast_chunks_empty_content(self): """Test AST chunking with empty content.""" docs = [MockDocument("", "/test/script.py", {"language": "python"})] chunks = create_ast_chunks(docs) # Should handle empty content gracefully assert isinstance(chunks, list) if __name__ == "__main__": pytest.main([__file__, "-v"])