* feat(core): Add AST-aware code chunking with astchunk integration

  This PR introduces intelligent code chunking that preserves semantic
  boundaries (functions, classes, methods) for better code understanding
  in RAG applications.

  Key Features:
  - AST-aware chunking for Python, Java, C#, TypeScript files
  - Graceful fallback to traditional chunking for unsupported languages
  - New specialized code RAG application for repositories
  - Enhanced CLI with --use-ast-chunking flag
  - Comprehensive test suite with integration tests

  Technical Implementation:
  - New chunking_utils.py module with enhanced chunking logic
  - Extended base RAG framework with AST chunking arguments
  - Updated document RAG with --enable-code-chunking flag
  - CLI integration with proper error handling and fallback

  Benefits:
  - Better semantic understanding of code structure
  - Improved search quality for code-related queries
  - Maintains backward compatibility with existing workflows
  - Supports mixed content (code + documentation) seamlessly

  Dependencies:
  - Added astchunk and tree-sitter parsers to pyproject.toml
  - All dependencies are optional - fallback works without them

  Testing:
  - Comprehensive test suite in test_astchunk_integration.py
  - Integration tests with document RAG
  - Error handling and edge case coverage

  Documentation:
  - Updated README.md with AST chunking highlights
  - Added ASTCHUNK_INTEGRATION.md with complete guide
  - Updated features.md with new capabilities

* Refactored chunk utils
* Remove useless import
* Update README.md
* Update apps/chunking/utils.py
  Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
* Update apps/code_rag.py
  Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
* Fix issue
* apply suggestion from @Copilot
  Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
* Fixes after pr review
* Fix tests not passing
* Fix linter error for documentation files
* Update .gitignore with unwanted files

---------

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Andy Lee <andylizf@outlook.com>
398 lines
14 KiB
Python
398 lines
14 KiB
Python
"""
|
|
Test suite for astchunk integration with LEANN.
|
|
Tests AST-aware chunking functionality, language detection, and fallback mechanisms.
|
|
"""
|
|
|
|
import os
|
|
import subprocess
|
|
import sys
|
|
import tempfile
|
|
from pathlib import Path
|
|
from unittest.mock import patch
|
|
|
|
import pytest
|
|
|
|
# Add apps directory to path for imports
|
|
sys.path.insert(0, str(Path(__file__).parent.parent / "apps"))
|
|
|
|
from typing import Optional
|
|
|
|
from chunking import (
|
|
create_ast_chunks,
|
|
create_text_chunks,
|
|
create_traditional_chunks,
|
|
detect_code_files,
|
|
get_language_from_extension,
|
|
)
|
|
|
|
|
|
class MockDocument:
    """Minimal stand-in for a LlamaIndex ``Document`` used by these tests.

    Only the surface the tests in this file touch is provided: a ``content``
    string, a ``metadata`` dict and ``get_content()``.

    Args:
        content: Text returned by ``get_content()``.
        file_path: When non-empty, stored under ``metadata["file_path"]``.
        metadata: Optional initial metadata. The mapping is copied, so the
            dict passed by the caller is never modified by this object.
    """

    def __init__(self, content: str, file_path: str = "", metadata: Optional[dict] = None):
        self.content = content
        # Copy instead of aliasing: ``file_path`` is written into this dict
        # below, and that write must not leak into the caller's dict.
        self.metadata = dict(metadata) if metadata else {}
        if file_path:
            self.metadata["file_path"] = file_path

    def get_content(self) -> str:
        """Return the document text."""
        return self.content
|
|
|
|
|
|
class TestCodeFileDetection:
    """Checks for code/text document separation and extension-to-language mapping."""

    def test_detect_code_files_python(self):
        """A ``.py`` document is reported as Python code, a ``.txt`` one as text."""
        python_doc = MockDocument("print('hello')", "/path/to/file.py")
        plain_doc = MockDocument("This is text", "/path/to/file.txt")

        code_docs, text_docs = detect_code_files([python_doc, plain_doc])

        assert len(code_docs) == 1
        assert len(text_docs) == 1

        code_meta = code_docs[0].metadata
        text_meta = text_docs[0].metadata
        assert code_meta["language"] == "python"
        assert code_meta["is_code"] is True
        assert text_meta["is_code"] is False

    def test_detect_code_files_multiple_languages(self):
        """Python, Java, TypeScript and C# files are all detected as code."""
        # (content, path) pairs, in the order they are handed to the detector.
        samples = (
            ("def func():", "/path/to/script.py"),
            ("public class Test {}", "/path/to/Test.java"),
            ("interface ITest {}", "/path/to/test.ts"),
            ("using System;", "/path/to/Program.cs"),
            ("Regular text content", "/path/to/document.txt"),
        )
        docs = [MockDocument(content, path) for content, path in samples]

        code_docs, text_docs = detect_code_files(docs)

        assert len(code_docs) == 4
        assert len(text_docs) == 1

        detected = [doc.metadata["language"] for doc in code_docs]
        for expected in ("python", "java", "typescript", "csharp"):
            assert expected in detected

    def test_detect_code_files_no_file_path(self):
        """Documents that carry no file path are all treated as text."""
        without_metadata = MockDocument("some content")
        with_unrelated_metadata = MockDocument("other content", metadata={"some_key": "value"})

        code_docs, text_docs = detect_code_files([without_metadata, with_unrelated_metadata])

        assert len(code_docs) == 0
        assert len(text_docs) == 2
        assert all(doc.metadata["is_code"] is False for doc in text_docs)

    def test_get_language_from_extension(self):
        """Known extensions map to a language name; anything else maps to ``None``."""
        known = (
            ("test.py", "python"),
            ("Test.java", "java"),
            ("component.tsx", "typescript"),
            ("Program.cs", "csharp"),
        )
        for filename, language in known:
            assert get_language_from_extension(filename) == language

        for filename in ("document.txt", ""):
            assert get_language_from_extension(filename) is None
|
|
|
|
|
|
class TestChunkingFunctions:
    """Tests for the traditional, AST-aware and combined chunking entry points."""

    def test_create_traditional_chunks(self):
        """Traditional chunking returns non-empty string chunks."""
        docs = [
            MockDocument(
                "This is a test document. It has multiple sentences. We want to test chunking."
            )
        ]

        chunks = create_traditional_chunks(docs, chunk_size=50, chunk_overlap=10)

        assert len(chunks) > 0
        assert all(isinstance(chunk, str) for chunk in chunks)
        assert all(len(chunk.strip()) > 0 for chunk in chunks)

    def test_create_traditional_chunks_empty_docs(self):
        """Traditional chunking of an empty document list returns an empty list."""
        chunks = create_traditional_chunks([], chunk_size=50, chunk_overlap=10)
        assert chunks == []

    @pytest.mark.skipif(
        os.environ.get("CI") == "true",
        reason="Skip astchunk tests in CI - dependency may not be available",
    )
    def test_create_ast_chunks_with_astchunk_available(self):
        """AST chunking of a small Python module keeps its definitions in the output."""
        # Skip (rather than silently pass) when the optional dependency is
        # missing; the missing-dependency path has its own test below.
        pytest.importorskip("astchunk")

        python_code = '''
def hello_world():
    """Print hello world message."""
    print("Hello, World!")

def add_numbers(a, b):
    """Add two numbers and return the result."""
    return a + b

class Calculator:
    """A simple calculator class."""

    def __init__(self):
        self.history = []

    def add(self, a, b):
        result = a + b
        self.history.append(f"{a} + {b} = {result}")
        return result
'''

        docs = [MockDocument(python_code, "/test/calculator.py", {"language": "python"})]

        chunks = create_ast_chunks(docs, max_chunk_size=200, chunk_overlap=50)

        assert len(chunks) > 0
        assert all(isinstance(chunk, str) for chunk in chunks)
        assert all(len(chunk.strip()) > 0 for chunk in chunks)

        # The definitions must survive chunking, whichever chunk they land in.
        combined_content = " ".join(chunks)
        assert "def hello_world" in combined_content
        assert "class Calculator" in combined_content

    def test_create_ast_chunks_fallback_to_traditional(self):
        """AST chunking still returns chunks when astchunk cannot be imported."""
        docs = [MockDocument("def test(): pass", "/test/script.py", {"language": "python"})]

        # A ``None`` entry in ``sys.modules`` makes every ``import astchunk``
        # executed inside this block raise ImportError, which simulates the
        # dependency being absent. Patching ``chunking.create_ast_chunks``
        # would not do that: this module holds its own reference to the real
        # function, so such a patch is never seen by the call below.
        # NOTE(review): this only exercises the fallback if create_ast_chunks
        # imports astchunk at call time - confirm against apps/chunking.
        with patch.dict(sys.modules, {"astchunk": None}):
            chunks = create_ast_chunks(docs)

        # Chunks must come back from either astchunk or the fallback.
        assert isinstance(chunks, list)
        assert len(chunks) > 0
        assert all(isinstance(chunk, str) for chunk in chunks)

    def test_create_text_chunks_traditional_mode(self):
        """With AST chunking disabled, mixed code/text input yields string chunks."""
        docs = [
            MockDocument("def test(): pass", "/test/script.py"),
            MockDocument("This is regular text.", "/test/doc.txt"),
        ]

        chunks = create_text_chunks(docs, use_ast_chunking=False, chunk_size=50, chunk_overlap=10)

        assert len(chunks) > 0
        assert all(isinstance(chunk, str) for chunk in chunks)

    def test_create_text_chunks_ast_mode(self):
        """With AST chunking enabled, mixed code/text input yields string chunks."""
        docs = [
            MockDocument("def test(): pass", "/test/script.py"),
            MockDocument("This is regular text.", "/test/doc.txt"),
        ]

        chunks = create_text_chunks(
            docs,
            use_ast_chunking=True,
            ast_chunk_size=100,
            ast_chunk_overlap=20,
            chunk_size=50,
            chunk_overlap=10,
        )

        assert len(chunks) > 0
        assert all(isinstance(chunk, str) for chunk in chunks)

    def test_create_text_chunks_custom_extensions(self):
        """Chunks are produced with and without custom code file extensions."""
        docs = [
            MockDocument("function test() {}", "/test/script.js"),  # Not in default extensions
            MockDocument("Regular text", "/test/doc.txt"),
        ]

        # First without custom extensions - should treat .js as text
        chunks_without = create_text_chunks(docs, use_ast_chunking=True, code_file_extensions=None)

        # Then with custom extensions - should treat .js as code
        chunks_with = create_text_chunks(
            docs, use_ast_chunking=True, code_file_extensions=[".js", ".jsx"]
        )

        # Both should return chunks
        assert len(chunks_without) > 0
        assert len(chunks_with) > 0
|
|
|
|
|
|
class TestIntegrationWithDocumentRAG:
    """End-to-end tests that run the RAG command-line apps in a subprocess."""

    # Repository root, derived from this file's location so the tests do not
    # depend on the directory pytest happens to be started from.
    _REPO_ROOT = Path(__file__).resolve().parent.parent

    # Upper bound for one app run, in seconds (5 minutes).
    _TIMEOUT_SECONDS = 300

    @classmethod
    def _run_app(cls, script_name, app_args):
        """Run ``apps/<script_name>`` with ``app_args`` and return the finished process.

        Args:
            script_name: File name of the script inside the ``apps`` directory.
            app_args: Command-line arguments passed to the script.

        Returns:
            The ``subprocess.CompletedProcess`` with captured text output.

        The calling test is skipped if the run exceeds ``_TIMEOUT_SECONDS``.
        """
        env = os.environ.copy()
        env["HF_HUB_DISABLE_SYMLINKS"] = "1"
        env["TOKENIZERS_PARALLELISM"] = "false"

        cmd = [sys.executable, str(cls._REPO_ROOT / "apps" / script_name), *app_args]

        # Only the subprocess call sits inside ``try`` so that assertion
        # failures in the tests are never confused with a timeout.
        try:
            return subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                timeout=cls._TIMEOUT_SECONDS,
                env=env,
                cwd=str(cls._REPO_ROOT),
            )
        except subprocess.TimeoutExpired:
            pytest.skip("Test timed out - likely due to model download in CI")

    @pytest.fixture
    def temp_code_dir(self):
        """Yield a temporary directory holding one Python file and one text file."""
        with tempfile.TemporaryDirectory() as temp_dir:
            temp_path = Path(temp_dir)

            # Create sample Python file
            python_file = temp_path / "example.py"
            python_file.write_text('''
def fibonacci(n):
    """Calculate fibonacci number."""
    if n <= 1:
        return n
    return fibonacci(n-1) + fibonacci(n-2)

class MathUtils:
    @staticmethod
    def factorial(n):
        if n <= 1:
            return 1
        return n * MathUtils.factorial(n-1)
''')

            # Create sample text file
            text_file = temp_path / "readme.txt"
            text_file.write_text("This is a sample text file for testing purposes.")

            yield temp_path

    @pytest.mark.skipif(
        os.environ.get("CI") == "true",
        reason="Skip integration tests in CI to avoid dependency issues",
    )
    def test_document_rag_with_ast_chunking(self, temp_code_dir):
        """The document RAG app succeeds with ``--enable-code-chunking``."""
        with tempfile.TemporaryDirectory() as index_dir:
            result = self._run_app(
                "document_rag.py",
                [
                    "--llm",
                    "simulated",
                    "--embedding-model",
                    "facebook/contriever",
                    "--embedding-mode",
                    "sentence-transformers",
                    "--index-dir",
                    index_dir,
                    "--data-dir",
                    str(temp_code_dir),
                    "--enable-code-chunking",
                    "--query",
                    "How does the fibonacci function work?",
                ],
            )

            # Should succeed even if astchunk is not available (fallback)
            assert result.returncode == 0, f"Command failed: {result.stderr}"

            output = result.stdout + result.stderr
            assert "Index saved to" in output or "Using existing index" in output

    @pytest.mark.skipif(
        os.environ.get("CI") == "true",
        reason="Skip integration tests in CI to avoid dependency issues",
    )
    def test_code_rag_application(self, temp_code_dir):
        """The specialized code RAG app succeeds and reports its chunking mode."""
        with tempfile.TemporaryDirectory() as index_dir:
            result = self._run_app(
                "code_rag.py",
                [
                    "--llm",
                    "simulated",
                    "--embedding-model",
                    "facebook/contriever",
                    "--index-dir",
                    index_dir,
                    "--repo-dir",
                    str(temp_code_dir),
                    "--query",
                    "What classes are defined in this code?",
                ],
            )

            # Should succeed
            assert result.returncode == 0, f"Command failed: {result.stderr}"

            output = result.stdout + result.stderr
            assert "Using AST-aware chunking" in output or "traditional chunking" in output
|
|
|
|
|
|
class TestErrorHandling:
    """Edge cases: empty input, degenerate sizes, missing metadata, empty content."""

    def test_text_chunking_empty_documents(self):
        """An empty document list produces an empty chunk list."""
        chunks = create_text_chunks([])
        assert chunks == []

    def test_text_chunking_invalid_parameters(self):
        """Zero-valued chunk sizes and overlaps do not make chunking raise."""
        docs = [MockDocument("test content")]

        # Every size and overlap is zero, which is degenerate input; the call
        # is expected to return normally rather than raise.
        chunks = create_text_chunks(
            docs, chunk_size=0, chunk_overlap=0, ast_chunk_size=0, ast_chunk_overlap=0
        )

        # Should still return some result
        assert isinstance(chunks, list)

    def test_create_ast_chunks_no_language(self):
        """AST chunking tolerates documents that carry no ``language`` metadata."""
        docs = [MockDocument("def test(): pass", "/test/script.py")]  # No language set

        chunks = create_ast_chunks(docs)

        # Should fall back to traditional chunking. The result may be empty
        # if the fallback yields nothing, so only the container and element
        # types are checked.
        assert isinstance(chunks, list)
        assert all(isinstance(chunk, str) for chunk in chunks)

    def test_create_ast_chunks_empty_content(self):
        """AST chunking of an empty Python document returns a list."""
        docs = [MockDocument("", "/test/script.py", {"language": "python"})]

        chunks = create_ast_chunks(docs)

        # Should handle empty content gracefully
        assert isinstance(chunks, list)
|
|
|
|
|
|
if __name__ == "__main__":
    # Propagate pytest's exit code so that running this file directly reports
    # failures to the caller (shell, CI step) instead of always exiting 0.
    sys.exit(pytest.main([__file__, "-v"]))
|