* feat(core): Add AST-aware code chunking with astchunk integration This PR introduces intelligent code chunking that preserves semantic boundaries (functions, classes, methods) for better code understanding in RAG applications. Key Features: - AST-aware chunking for Python, Java, C#, TypeScript files - Graceful fallback to traditional chunking for unsupported languages - New specialized code RAG application for repositories - Enhanced CLI with --use-ast-chunking flag - Comprehensive test suite with integration tests Technical Implementation: - New chunking_utils.py module with enhanced chunking logic - Extended base RAG framework with AST chunking arguments - Updated document RAG with --enable-code-chunking flag - CLI integration with proper error handling and fallback Benefits: - Better semantic understanding of code structure - Improved search quality for code-related queries - Maintains backward compatibility with existing workflows - Supports mixed content (code + documentation) seamlessly Dependencies: - Added astchunk and tree-sitter parsers to pyproject.toml - All dependencies are optional - fallback works without them Testing: - Comprehensive test suite in test_astchunk_integration.py - Integration tests with document RAG - Error handling and edge case coverage Documentation: - Updated README.md with AST chunking highlights - Added ASTCHUNK_INTEGRATION.md with complete guide - Updated features.md with new capabilities * Refactored chunk utils * Remove useless import * Update README.md * Update apps/chunking/utils.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * Update apps/code_rag.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * Fix issue * apply suggestion from @Copilot Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * Fixes after pr review * Fix tests not passing * Fix linter error for documentation files * Update .gitignore with unwanted files --------- Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Co-authored-by: Andy Lee <andylizf@outlook.com>
169 lines
5.4 KiB
Python
169 lines
5.4 KiB
Python
"""
|
|
Test document_rag functionality using pytest.
|
|
"""
|
|
|
|
import os
|
|
import subprocess
|
|
import sys
|
|
import tempfile
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
|
|
|
|
@pytest.fixture
|
|
def test_data_dir():
|
|
"""Return the path to test data directory."""
|
|
return Path("data")
|
|
|
|
|
|
@pytest.mark.skipif(
|
|
os.environ.get("CI") == "true", reason="Skip model tests in CI to avoid MPS memory issues"
|
|
)
|
|
def test_document_rag_simulated(test_data_dir):
|
|
"""Test document_rag with simulated LLM."""
|
|
with tempfile.TemporaryDirectory() as temp_dir:
|
|
# Use a subdirectory that doesn't exist yet to force index creation
|
|
index_dir = Path(temp_dir) / "test_index"
|
|
cmd = [
|
|
sys.executable,
|
|
"apps/document_rag.py",
|
|
"--llm",
|
|
"simulated",
|
|
"--embedding-model",
|
|
"facebook/contriever",
|
|
"--embedding-mode",
|
|
"sentence-transformers",
|
|
"--index-dir",
|
|
str(index_dir),
|
|
"--data-dir",
|
|
str(test_data_dir),
|
|
"--query",
|
|
"What is Pride and Prejudice about?",
|
|
]
|
|
|
|
env = os.environ.copy()
|
|
env["HF_HUB_DISABLE_SYMLINKS"] = "1"
|
|
env["TOKENIZERS_PARALLELISM"] = "false"
|
|
|
|
result = subprocess.run(cmd, capture_output=True, text=True, timeout=600, env=env)
|
|
|
|
# Check return code
|
|
assert result.returncode == 0, f"Command failed: {result.stderr}"
|
|
|
|
# Verify output
|
|
output = result.stdout + result.stderr
|
|
assert "Index saved to" in output or "Using existing index" in output
|
|
assert "This is a simulated answer" in output
|
|
|
|
|
|
@pytest.mark.skipif(
|
|
os.environ.get("CI") == "true",
|
|
reason="Skip AST chunking tests in CI to avoid dependency issues",
|
|
)
|
|
def test_document_rag_with_ast_chunking(test_data_dir):
|
|
"""Test document_rag with AST-aware chunking enabled."""
|
|
with tempfile.TemporaryDirectory() as temp_dir:
|
|
# Use a subdirectory that doesn't exist yet to force index creation
|
|
index_dir = Path(temp_dir) / "test_ast_index"
|
|
cmd = [
|
|
sys.executable,
|
|
"apps/document_rag.py",
|
|
"--llm",
|
|
"simulated",
|
|
"--embedding-model",
|
|
"facebook/contriever",
|
|
"--embedding-mode",
|
|
"sentence-transformers",
|
|
"--index-dir",
|
|
str(index_dir),
|
|
"--data-dir",
|
|
str(test_data_dir),
|
|
"--enable-code-chunking", # Enable AST chunking
|
|
"--query",
|
|
"What is Pride and Prejudice about?",
|
|
]
|
|
|
|
env = os.environ.copy()
|
|
env["HF_HUB_DISABLE_SYMLINKS"] = "1"
|
|
env["TOKENIZERS_PARALLELISM"] = "false"
|
|
|
|
result = subprocess.run(cmd, capture_output=True, text=True, timeout=600, env=env)
|
|
|
|
# Check return code
|
|
assert result.returncode == 0, f"Command failed: {result.stderr}"
|
|
|
|
# Verify output
|
|
output = result.stdout + result.stderr
|
|
assert "Index saved to" in output or "Using existing index" in output
|
|
assert "This is a simulated answer" in output
|
|
|
|
# Should mention AST chunking if code files are present
|
|
# (might not be relevant for the test data, but command should succeed)
|
|
|
|
|
|
@pytest.mark.skipif(not os.environ.get("OPENAI_API_KEY"), reason="OpenAI API key not available")
|
|
@pytest.mark.skipif(
|
|
os.environ.get("CI") == "true", reason="Skip OpenAI tests in CI to avoid API costs"
|
|
)
|
|
def test_document_rag_openai(test_data_dir):
|
|
"""Test document_rag with OpenAI embeddings."""
|
|
with tempfile.TemporaryDirectory() as temp_dir:
|
|
# Use a subdirectory that doesn't exist yet to force index creation
|
|
index_dir = Path(temp_dir) / "test_index_openai"
|
|
cmd = [
|
|
sys.executable,
|
|
"apps/document_rag.py",
|
|
"--llm",
|
|
"simulated", # Use simulated LLM to avoid GPT-4 costs
|
|
"--embedding-model",
|
|
"text-embedding-3-small",
|
|
"--embedding-mode",
|
|
"openai",
|
|
"--index-dir",
|
|
str(index_dir),
|
|
"--data-dir",
|
|
str(test_data_dir),
|
|
"--query",
|
|
"What is Pride and Prejudice about?",
|
|
]
|
|
|
|
env = os.environ.copy()
|
|
env["TOKENIZERS_PARALLELISM"] = "false"
|
|
|
|
result = subprocess.run(cmd, capture_output=True, text=True, timeout=600, env=env)
|
|
|
|
assert result.returncode == 0, f"Command failed: {result.stderr}"
|
|
|
|
# Verify cosine distance was used
|
|
output = result.stdout + result.stderr
|
|
assert any(
|
|
msg in output
|
|
for msg in [
|
|
"distance_metric='cosine'",
|
|
"Automatically setting distance_metric='cosine'",
|
|
"Using cosine distance",
|
|
]
|
|
)
|
|
|
|
|
|
def test_document_rag_error_handling(test_data_dir):
|
|
"""Test document_rag with invalid parameters."""
|
|
with tempfile.TemporaryDirectory() as temp_dir:
|
|
cmd = [
|
|
sys.executable,
|
|
"apps/document_rag.py",
|
|
"--llm",
|
|
"invalid_llm_type",
|
|
"--index-dir",
|
|
temp_dir,
|
|
"--data-dir",
|
|
str(test_data_dir),
|
|
]
|
|
|
|
result = subprocess.run(cmd, capture_output=True, text=True, timeout=60)
|
|
|
|
# Should fail with invalid LLM type
|
|
assert result.returncode != 0
|
|
assert "invalid choice" in result.stderr or "invalid_llm_type" in result.stderr
|