diff --git a/.gitignore b/.gitignore index 5b4523b..05885ca 100755 --- a/.gitignore +++ b/.gitignore @@ -94,3 +94,5 @@ batchtest.py tests/__pytest_cache__/ tests/__pycache__/ paru-bin/ + +benchmarks/data/ diff --git a/README.md b/README.md index c0f3a6c..132ea6a 100755 --- a/README.md +++ b/README.md @@ -176,6 +176,9 @@ response = chat.ask("How much storage does LEANN save?", top_k=1) LEANN supports RAG on various data sources including documents (`.pdf`, `.txt`, `.md`), Apple Mail, Google Search History, WeChat, and more. +**AST-Aware Code Chunking** - LEANN also features intelligent code chunking that preserves semantic boundaries (functions, classes, methods) for Python, Java, C#, and TypeScript files, providing improved code understanding compared to traditional text-based approaches. +๐Ÿ“– Read the [AST Chunking Guide โ†’](docs/ast_chunking_guide.md) to learn more. + ### Generation Model Setup LEANN supports multiple LLM providers for text generation (OpenAI API, HuggingFace, Ollama). @@ -294,6 +297,12 @@ python -m apps.document_rag --data-dir "~/Documents/Papers" --chunk-size 1024 # Filter only markdown and Python files with smaller chunks python -m apps.document_rag --data-dir "./docs" --chunk-size 256 --file-types .md .py + +# Enable AST-aware chunking for code files +python -m apps.document_rag --enable-code-chunking --data-dir "./my_project" + +# Or use the specialized code RAG for better code understanding +python -m apps.code_rag --repo-dir "./my_codebase" --query "How does authentication work?" ``` @@ -472,6 +481,7 @@ Once the index is built, you can ask questions like: **Key features:** - ๐Ÿ” **Semantic code search** across your entire project, fully local index and lightweight +- ๐Ÿง  **AST-aware chunking** preserves code structure (functions, classes) - ๐Ÿ“š **Context-aware assistance** for debugging and development - ๐Ÿš€ **Zero-config setup** with automatic language detection @@ -534,7 +544,8 @@ leann remove my-docs **Key CLI features:** - Auto-detects document formats (PDF, TXT, MD, DOCX, PPTX + code files) -- Smart text chunking with overlap +- **๐Ÿง  AST-aware chunking** for Python, Java, C#, TypeScript files +- Smart text chunking with overlap for all other content - Multiple LLM providers (Ollama, OpenAI, HuggingFace) - Organized index storage in `.leann/indexes/` (project-local) - Support for advanced search parameters @@ -646,6 +657,7 @@ Options: ```bash uv pip install -e ".[dev]" # Install dev dependencies python benchmarks/run_evaluation.py # Will auto-download evaluation data and run benchmarks +python benchmarks/run_evaluation.py benchmarks/data/indices/rpj_wiki/rpj_wiki --num-queries 2000 # After downloading data, you can run the benchmark with our biggest index ``` The evaluation script downloads data automatically on first run. The last three results were tested with partial personal data, and you can reproduce them with your own data! diff --git a/apps/base_rag_example.py b/apps/base_rag_example.py index fda77e2..be1be04 100644 --- a/apps/base_rag_example.py +++ b/apps/base_rag_example.py @@ -11,7 +11,6 @@ from typing import Any import dotenv from leann.api import LeannBuilder, LeannChat from leann.registry import register_project_directory -from llama_index.core.node_parser import SentenceSplitter dotenv.load_dotenv() @@ -109,6 +108,38 @@ class BaseRAGExample(ABC): help="Thinking budget for reasoning models (low/medium/high). 
Supported by GPT-Oss:20b and other reasoning models.", ) + # AST Chunking parameters + ast_group = parser.add_argument_group("AST Chunking Parameters") + ast_group.add_argument( + "--use-ast-chunking", + action="store_true", + help="Enable AST-aware chunking for code files (requires astchunk)", + ) + ast_group.add_argument( + "--ast-chunk-size", + type=int, + default=512, + help="Maximum characters per AST chunk (default: 512)", + ) + ast_group.add_argument( + "--ast-chunk-overlap", + type=int, + default=64, + help="Overlap between AST chunks (default: 64)", + ) + ast_group.add_argument( + "--code-file-extensions", + nargs="+", + default=None, + help="Additional code file extensions to process with AST chunking (e.g., .py .java .cs .ts)", + ) + ast_group.add_argument( + "--ast-fallback-traditional", + action="store_true", + default=True, + help="Fall back to traditional chunking if AST chunking fails (default: True)", + ) + # Search parameters search_group = parser.add_argument_group("Search Parameters") search_group.add_argument( @@ -309,21 +340,3 @@ class BaseRAGExample(ABC): await self.run_single_query(args, index_path, args.query) else: await self.run_interactive_chat(args, index_path) - - -def create_text_chunks(documents, chunk_size=256, chunk_overlap=25) -> list[str]: - """Helper function to create text chunks from documents.""" - node_parser = SentenceSplitter( - chunk_size=chunk_size, - chunk_overlap=chunk_overlap, - separator=" ", - paragraph_separator="\n\n", - ) - - all_texts = [] - for doc in documents: - nodes = node_parser.get_nodes_from_documents([doc]) - if nodes: - all_texts.extend(node.get_content() for node in nodes) - - return all_texts diff --git a/apps/chunking/__init__.py b/apps/chunking/__init__.py new file mode 100644 index 0000000..3cd5c0d --- /dev/null +++ b/apps/chunking/__init__.py @@ -0,0 +1,22 @@ +""" +Chunking utilities for LEANN RAG applications. +Provides AST-aware and traditional text chunking functionality. +""" + +from .utils import ( + CODE_EXTENSIONS, + create_ast_chunks, + create_text_chunks, + create_traditional_chunks, + detect_code_files, + get_language_from_extension, +) + +__all__ = [ + "CODE_EXTENSIONS", + "create_ast_chunks", + "create_text_chunks", + "create_traditional_chunks", + "detect_code_files", + "get_language_from_extension", +] diff --git a/apps/chunking/utils.py b/apps/chunking/utils.py new file mode 100644 index 0000000..9a19c63 --- /dev/null +++ b/apps/chunking/utils.py @@ -0,0 +1,320 @@ +""" +Enhanced chunking utilities with AST-aware code chunking support. +Provides unified interface for both traditional and AST-based text chunking. +""" + +import logging +from pathlib import Path +from typing import Optional + +from llama_index.core.node_parser import SentenceSplitter + +logger = logging.getLogger(__name__) + +# Code file extensions supported by astchunk +CODE_EXTENSIONS = { + ".py": "python", + ".java": "java", + ".cs": "csharp", + ".ts": "typescript", + ".tsx": "typescript", + ".js": "typescript", + ".jsx": "typescript", +} + +# Default chunk parameters for different content types +DEFAULT_CHUNK_PARAMS = { + "code": { + "max_chunk_size": 512, + "chunk_overlap": 64, + }, + "text": { + "chunk_size": 256, + "chunk_overlap": 128, + }, +} + + +def detect_code_files(documents, code_extensions=None) -> tuple[list, list]: + """ + Separate documents into code files and regular text files. 
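+    Documents whose file extension maps to a supported language are tagged with
+    "language" and "is_code" metadata; all other documents are treated as plain
+    text and routed to traditional chunking.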
+ + Args: + documents: List of LlamaIndex Document objects + code_extensions: Dict mapping file extensions to languages (defaults to CODE_EXTENSIONS) + + Returns: + Tuple of (code_documents, text_documents) + """ + if code_extensions is None: + code_extensions = CODE_EXTENSIONS + + code_docs = [] + text_docs = [] + + for doc in documents: + # Get file path from metadata + file_path = doc.metadata.get("file_path", "") + if not file_path: + # Fallback to file_name + file_path = doc.metadata.get("file_name", "") + + if file_path: + file_ext = Path(file_path).suffix.lower() + if file_ext in code_extensions: + # Add language info to metadata + doc.metadata["language"] = code_extensions[file_ext] + doc.metadata["is_code"] = True + code_docs.append(doc) + else: + doc.metadata["is_code"] = False + text_docs.append(doc) + else: + # If no file path, treat as text + doc.metadata["is_code"] = False + text_docs.append(doc) + + logger.info(f"Detected {len(code_docs)} code files and {len(text_docs)} text files") + return code_docs, text_docs + + +def get_language_from_extension(file_path: str) -> Optional[str]: + """Get the programming language from file extension.""" + ext = Path(file_path).suffix.lower() + return CODE_EXTENSIONS.get(ext) + + +def create_ast_chunks( + documents, + max_chunk_size: int = 512, + chunk_overlap: int = 64, + metadata_template: str = "default", +) -> list[str]: + """ + Create AST-aware chunks from code documents using astchunk. + + Args: + documents: List of code documents + max_chunk_size: Maximum characters per chunk + chunk_overlap: Number of AST nodes to overlap between chunks + metadata_template: Template for chunk metadata + + Returns: + List of text chunks with preserved code structure + """ + try: + from astchunk import ASTChunkBuilder + except ImportError as e: + logger.error(f"astchunk not available: {e}") + logger.info("Falling back to traditional chunking for code files") + return create_traditional_chunks(documents, max_chunk_size, chunk_overlap) + + all_chunks = [] + + for doc in documents: + # Get language from metadata (set by detect_code_files) + language = doc.metadata.get("language") + if not language: + logger.warning( + "No language detected for document, falling back to traditional chunking" + ) + traditional_chunks = create_traditional_chunks([doc], max_chunk_size, chunk_overlap) + all_chunks.extend(traditional_chunks) + continue + + try: + # Configure astchunk + configs = { + "max_chunk_size": max_chunk_size, + "language": language, + "metadata_template": metadata_template, + "chunk_overlap": chunk_overlap if chunk_overlap > 0 else 0, + } + + # Add repository-level metadata if available + repo_metadata = { + "file_path": doc.metadata.get("file_path", ""), + "file_name": doc.metadata.get("file_name", ""), + "creation_date": doc.metadata.get("creation_date", ""), + "last_modified_date": doc.metadata.get("last_modified_date", ""), + } + configs["repo_level_metadata"] = repo_metadata + + # Create chunk builder and process + chunk_builder = ASTChunkBuilder(**configs) + code_content = doc.get_content() + + if not code_content or not code_content.strip(): + logger.warning("Empty code content, skipping") + continue + + chunks = chunk_builder.chunkify(code_content) + + # Extract text content from chunks + for chunk in chunks: + if hasattr(chunk, "text"): + chunk_text = chunk.text + elif isinstance(chunk, dict) and "text" in chunk: + chunk_text = chunk["text"] + elif isinstance(chunk, str): + chunk_text = chunk + else: + # Try to convert to string + chunk_text = 
str(chunk) + + if chunk_text and chunk_text.strip(): + all_chunks.append(chunk_text.strip()) + + logger.info( + f"Created {len(chunks)} AST chunks from {language} file: {doc.metadata.get('file_name', 'unknown')}" + ) + + except Exception as e: + logger.warning(f"AST chunking failed for {language} file: {e}") + logger.info("Falling back to traditional chunking") + traditional_chunks = create_traditional_chunks([doc], max_chunk_size, chunk_overlap) + all_chunks.extend(traditional_chunks) + + return all_chunks + + +def create_traditional_chunks( + documents, chunk_size: int = 256, chunk_overlap: int = 128 +) -> list[str]: + """ + Create traditional text chunks using LlamaIndex SentenceSplitter. + + Args: + documents: List of documents to chunk + chunk_size: Size of each chunk in characters + chunk_overlap: Overlap between chunks + + Returns: + List of text chunks + """ + # Handle invalid chunk_size values + if chunk_size <= 0: + logger.warning(f"Invalid chunk_size={chunk_size}, using default value of 256") + chunk_size = 256 + + # Ensure chunk_overlap is not negative and not larger than chunk_size + if chunk_overlap < 0: + chunk_overlap = 0 + if chunk_overlap >= chunk_size: + chunk_overlap = chunk_size // 2 + + node_parser = SentenceSplitter( + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + separator=" ", + paragraph_separator="\n\n", + ) + + all_texts = [] + for doc in documents: + try: + nodes = node_parser.get_nodes_from_documents([doc]) + if nodes: + chunk_texts = [node.get_content() for node in nodes] + all_texts.extend(chunk_texts) + logger.debug(f"Created {len(chunk_texts)} traditional chunks from document") + except Exception as e: + logger.error(f"Traditional chunking failed for document: {e}") + # As last resort, add the raw content + content = doc.get_content() + if content and content.strip(): + all_texts.append(content.strip()) + + return all_texts + + +def create_text_chunks( + documents, + chunk_size: int = 256, + chunk_overlap: int = 128, + use_ast_chunking: bool = False, + ast_chunk_size: int = 512, + ast_chunk_overlap: int = 64, + code_file_extensions: Optional[list[str]] = None, + ast_fallback_traditional: bool = True, +) -> list[str]: + """ + Create text chunks from documents with optional AST support for code files. 
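+    When use_ast_chunking is True, documents recognized as code (by file
+    extension) are chunked with astchunk, while everything else falls back to
+    the SentenceSplitter-based traditional chunking.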
+ + Args: + documents: List of LlamaIndex Document objects + chunk_size: Size for traditional text chunks + chunk_overlap: Overlap for traditional text chunks + use_ast_chunking: Whether to use AST chunking for code files + ast_chunk_size: Size for AST chunks + ast_chunk_overlap: Overlap for AST chunks + code_file_extensions: Custom list of code file extensions + ast_fallback_traditional: Fall back to traditional chunking on AST errors + + Returns: + List of text chunks + """ + if not documents: + logger.warning("No documents provided for chunking") + return [] + + # Create a local copy of supported extensions for this function call + local_code_extensions = CODE_EXTENSIONS.copy() + + # Update supported extensions if provided + if code_file_extensions: + # Map extensions to languages (simplified mapping) + ext_mapping = { + ".py": "python", + ".java": "java", + ".cs": "c_sharp", + ".ts": "typescript", + ".tsx": "typescript", + } + for ext in code_file_extensions: + if ext.lower() not in local_code_extensions: + # Try to guess language from extension + if ext.lower() in ext_mapping: + local_code_extensions[ext.lower()] = ext_mapping[ext.lower()] + else: + logger.warning(f"Unsupported extension {ext}, will use traditional chunking") + + all_chunks = [] + + if use_ast_chunking: + # Separate code and text documents using local extensions + code_docs, text_docs = detect_code_files(documents, local_code_extensions) + + # Process code files with AST chunking + if code_docs: + logger.info(f"Processing {len(code_docs)} code files with AST chunking") + try: + ast_chunks = create_ast_chunks( + code_docs, max_chunk_size=ast_chunk_size, chunk_overlap=ast_chunk_overlap + ) + all_chunks.extend(ast_chunks) + logger.info(f"Created {len(ast_chunks)} AST chunks from code files") + except Exception as e: + logger.error(f"AST chunking failed: {e}") + if ast_fallback_traditional: + logger.info("Falling back to traditional chunking for code files") + traditional_code_chunks = create_traditional_chunks( + code_docs, chunk_size, chunk_overlap + ) + all_chunks.extend(traditional_code_chunks) + else: + raise + + # Process text files with traditional chunking + if text_docs: + logger.info(f"Processing {len(text_docs)} text files with traditional chunking") + text_chunks = create_traditional_chunks(text_docs, chunk_size, chunk_overlap) + all_chunks.extend(text_chunks) + logger.info(f"Created {len(text_chunks)} traditional chunks from text files") + else: + # Use traditional chunking for all files + logger.info(f"Processing {len(documents)} documents with traditional chunking") + all_chunks = create_traditional_chunks(documents, chunk_size, chunk_overlap) + + logger.info(f"Total chunks created: {len(all_chunks)}") + return all_chunks diff --git a/apps/code_rag.py b/apps/code_rag.py new file mode 100644 index 0000000..7518bb9 --- /dev/null +++ b/apps/code_rag.py @@ -0,0 +1,211 @@ +""" +Code RAG example using AST-aware chunking for optimal code understanding. +Specialized for code repositories with automatic language detection and +optimized chunking parameters. 
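+
+Typical usage (mirrors the README examples):
+    python -m apps.code_rag --repo-dir ./my_codebase
+    python -m apps.code_rag --repo-dir ./my_codebase --query "How does authentication work?"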
+""" + +import sys +from pathlib import Path + +# Add parent directory to path for imports +sys.path.insert(0, str(Path(__file__).parent)) + +from base_rag_example import BaseRAGExample +from chunking import CODE_EXTENSIONS, create_text_chunks +from llama_index.core import SimpleDirectoryReader + + +class CodeRAG(BaseRAGExample): + """Specialized RAG example for code repositories with AST-aware chunking.""" + + def __init__(self): + super().__init__( + name="Code", + description="Process and query code repositories with AST-aware chunking", + default_index_name="code_index", + ) + # Override defaults for code-specific usage + self.embedding_model_default = "facebook/contriever" # Good for code + self.max_items_default = -1 # Process all code files by default + + def _add_specific_arguments(self, parser): + """Add code-specific arguments.""" + code_group = parser.add_argument_group("Code Repository Parameters") + + code_group.add_argument( + "--repo-dir", + type=str, + default=".", + help="Code repository directory to index (default: current directory)", + ) + code_group.add_argument( + "--include-extensions", + nargs="+", + default=list(CODE_EXTENSIONS.keys()), + help="File extensions to include (default: supported code extensions)", + ) + code_group.add_argument( + "--exclude-dirs", + nargs="+", + default=[ + ".git", + "__pycache__", + "node_modules", + "venv", + ".venv", + "build", + "dist", + "target", + ], + help="Directories to exclude from indexing", + ) + code_group.add_argument( + "--max-file-size", + type=int, + default=1000000, # 1MB + help="Maximum file size in bytes to process (default: 1MB)", + ) + code_group.add_argument( + "--include-comments", + action="store_true", + help="Include comments in chunking (useful for documentation)", + ) + code_group.add_argument( + "--preserve-imports", + action="store_true", + default=True, + help="Try to preserve import statements in chunks (default: True)", + ) + + async def load_data(self, args) -> list[str]: + """Load code files and convert to AST-aware chunks.""" + print(f"๐Ÿ” Scanning code repository: {args.repo_dir}") + print(f"๐Ÿ“ Including extensions: {args.include_extensions}") + print(f"๐Ÿšซ Excluding directories: {args.exclude_dirs}") + + # Check if repository directory exists + repo_path = Path(args.repo_dir) + if not repo_path.exists(): + raise ValueError(f"Repository directory not found: {args.repo_dir}") + + # Load code files with filtering + reader_kwargs = { + "recursive": True, + "encoding": "utf-8", + "required_exts": args.include_extensions, + "exclude_hidden": True, + } + + # Create exclusion filter + def file_filter(file_path: str) -> bool: + """Filter out unwanted files and directories.""" + path = Path(file_path) + + # Check file size + try: + if path.stat().st_size > args.max_file_size: + print(f"โš ๏ธ Skipping large file: {path.name} ({path.stat().st_size} bytes)") + return False + except Exception: + return False + + # Check if in excluded directory + for exclude_dir in args.exclude_dirs: + if exclude_dir in path.parts: + return False + + return True + + try: + # Load documents with file filtering + documents = SimpleDirectoryReader( + args.repo_dir, + file_extractor=None, # Use default extractors + **reader_kwargs, + ).load_data(show_progress=True) + + # Apply custom filtering + filtered_docs = [] + for doc in documents: + file_path = doc.metadata.get("file_path", "") + if file_filter(file_path): + filtered_docs.append(doc) + + documents = filtered_docs + + except Exception as e: + print(f"โŒ Error loading 
code files: {e}") + return [] + + if not documents: + print( + f"โŒ No code files found in {args.repo_dir} with extensions {args.include_extensions}" + ) + return [] + + print(f"โœ… Loaded {len(documents)} code files") + + # Show breakdown by language/extension + ext_counts = {} + for doc in documents: + file_path = doc.metadata.get("file_path", "") + if file_path: + ext = Path(file_path).suffix.lower() + ext_counts[ext] = ext_counts.get(ext, 0) + 1 + + print("๐Ÿ“Š Files by extension:") + for ext, count in sorted(ext_counts.items()): + print(f" {ext}: {count} files") + + # Use AST-aware chunking by default for code + print( + f"๐Ÿง  Using AST-aware chunking (chunk_size: {args.ast_chunk_size}, overlap: {args.ast_chunk_overlap})" + ) + + all_texts = create_text_chunks( + documents, + chunk_size=256, # Fallback for non-code files + chunk_overlap=64, + use_ast_chunking=True, # Always use AST for code RAG + ast_chunk_size=args.ast_chunk_size, + ast_chunk_overlap=args.ast_chunk_overlap, + code_file_extensions=args.include_extensions, + ast_fallback_traditional=True, + ) + + # Apply max_items limit if specified + if args.max_items > 0 and len(all_texts) > args.max_items: + print(f"โณ Limiting to {args.max_items} chunks (from {len(all_texts)})") + all_texts = all_texts[: args.max_items] + + print(f"โœ… Generated {len(all_texts)} code chunks") + return all_texts + + +if __name__ == "__main__": + import asyncio + + # Example queries for code RAG + print("\n๐Ÿ’ป Code RAG Example") + print("=" * 50) + print("\nExample queries you can try:") + print("- 'How does the embedding computation work?'") + print("- 'What are the main classes in this codebase?'") + print("- 'Show me the search implementation'") + print("- 'How is error handling implemented?'") + print("- 'What design patterns are used?'") + print("- 'Explain the chunking logic'") + print("\n๐Ÿš€ Features:") + print("- โœ… AST-aware chunking preserves code structure") + print("- โœ… Automatic language detection") + print("- โœ… Smart filtering of large files and common excludes") + print("- โœ… Optimized for code understanding") + print("\nUsage examples:") + print(" python -m apps.code_rag --repo-dir ./my_project") + print( + " python -m apps.code_rag --include-extensions .py .js --query 'How does authentication work?'" + ) + print("\nOr run without --query for interactive mode\n") + + rag = CodeRAG() + asyncio.run(rag.run()) diff --git a/apps/document_rag.py b/apps/document_rag.py index 1ac15c7..8472f6f 100644 --- a/apps/document_rag.py +++ b/apps/document_rag.py @@ -9,7 +9,8 @@ from pathlib import Path # Add parent directory to path for imports sys.path.insert(0, str(Path(__file__).parent)) -from base_rag_example import BaseRAGExample, create_text_chunks +from base_rag_example import BaseRAGExample +from chunking import create_text_chunks from llama_index.core import SimpleDirectoryReader @@ -44,6 +45,11 @@ class DocumentRAG(BaseRAGExample): doc_group.add_argument( "--chunk-overlap", type=int, default=128, help="Text chunk overlap (default: 128)" ) + doc_group.add_argument( + "--enable-code-chunking", + action="store_true", + help="Enable AST-aware chunking for code files in the data directory", + ) async def load_data(self, args) -> list[str]: """Load documents and convert to text chunks.""" @@ -76,9 +82,22 @@ class DocumentRAG(BaseRAGExample): print(f"Loaded {len(documents)} documents") - # Convert to text chunks + # Determine chunking strategy + use_ast = args.enable_code_chunking or getattr(args, "use_ast_chunking", False) + + if 
use_ast: + print("Using AST-aware chunking for code files") + + # Convert to text chunks with optional AST support all_texts = create_text_chunks( - documents, chunk_size=args.chunk_size, chunk_overlap=args.chunk_overlap + documents, + chunk_size=args.chunk_size, + chunk_overlap=args.chunk_overlap, + use_ast_chunking=use_ast, + ast_chunk_size=getattr(args, "ast_chunk_size", 512), + ast_chunk_overlap=getattr(args, "ast_chunk_overlap", 64), + code_file_extensions=getattr(args, "code_file_extensions", None), + ast_fallback_traditional=getattr(args, "ast_fallback_traditional", True), ) # Apply max_items limit if specified @@ -102,6 +121,10 @@ if __name__ == "__main__": print( "- 'What is the problem of developing pan gu model Huawei meets? (็›˜ๅคๅคงๆจกๅž‹ๅผ€ๅ‘ไธญ้‡ๅˆฐไป€ไนˆ้—ฎ้ข˜?)'" ) + print("\n๐Ÿš€ NEW: Code-aware chunking available!") + print("- Use --enable-code-chunking to enable AST-aware chunking for code files") + print("- Supports Python, Java, C#, TypeScript files") + print("- Better semantic understanding of code structure") print("\nOr run without --query for interactive mode\n") rag = DocumentRAG() diff --git a/benchmarks/data/.gitattributes b/benchmarks/data/.gitattributes deleted file mode 100644 index 4fb7c03..0000000 --- a/benchmarks/data/.gitattributes +++ /dev/null @@ -1,82 +0,0 @@ -*.7z filter=lfs diff=lfs merge=lfs -text -*.arrow filter=lfs diff=lfs merge=lfs -text -*.bin filter=lfs diff=lfs merge=lfs -text -*.bz2 filter=lfs diff=lfs merge=lfs -text -*.ckpt filter=lfs diff=lfs merge=lfs -text -*.ftz filter=lfs diff=lfs merge=lfs -text -*.gz filter=lfs diff=lfs merge=lfs -text -*.h5 filter=lfs diff=lfs merge=lfs -text -*.joblib filter=lfs diff=lfs merge=lfs -text -*.lfs.* filter=lfs diff=lfs merge=lfs -text -*.lz4 filter=lfs diff=lfs merge=lfs -text -*.mds filter=lfs diff=lfs merge=lfs -text -*.mlmodel filter=lfs diff=lfs merge=lfs -text -*.model filter=lfs diff=lfs merge=lfs -text -*.msgpack filter=lfs diff=lfs merge=lfs -text -*.npy filter=lfs diff=lfs merge=lfs -text -*.npz filter=lfs diff=lfs merge=lfs -text -*.onnx filter=lfs diff=lfs merge=lfs -text -*.ot filter=lfs diff=lfs merge=lfs -text -*.parquet filter=lfs diff=lfs merge=lfs -text -*.pb filter=lfs diff=lfs merge=lfs -text -*.pickle filter=lfs diff=lfs merge=lfs -text -*.pkl filter=lfs diff=lfs merge=lfs -text -*.pt filter=lfs diff=lfs merge=lfs -text -*.pth filter=lfs diff=lfs merge=lfs -text -*.rar filter=lfs diff=lfs merge=lfs -text -*.safetensors filter=lfs diff=lfs merge=lfs -text -saved_model/**/* filter=lfs diff=lfs merge=lfs -text -*.tar.* filter=lfs diff=lfs merge=lfs -text -*.tar filter=lfs diff=lfs merge=lfs -text -*.tflite filter=lfs diff=lfs merge=lfs -text -*.tgz filter=lfs diff=lfs merge=lfs -text -*.wasm filter=lfs diff=lfs merge=lfs -text -*.xz filter=lfs diff=lfs merge=lfs -text -*.zip filter=lfs diff=lfs merge=lfs -text -*.zst filter=lfs diff=lfs merge=lfs -text -*tfevents* filter=lfs diff=lfs merge=lfs -text -# Audio files - uncompressed -*.pcm filter=lfs diff=lfs merge=lfs -text -*.sam filter=lfs diff=lfs merge=lfs -text -*.raw filter=lfs diff=lfs merge=lfs -text -# Audio files - compressed -*.aac filter=lfs diff=lfs merge=lfs -text -*.flac filter=lfs diff=lfs merge=lfs -text -*.mp3 filter=lfs diff=lfs merge=lfs -text -*.ogg filter=lfs diff=lfs merge=lfs -text -*.wav filter=lfs diff=lfs merge=lfs -text -# Image files - uncompressed -*.bmp filter=lfs diff=lfs merge=lfs -text -*.gif filter=lfs diff=lfs merge=lfs -text -*.png filter=lfs diff=lfs merge=lfs -text -*.tiff filter=lfs diff=lfs 
merge=lfs -text -# Image files - compressed -*.jpg filter=lfs diff=lfs merge=lfs -text -*.jpeg filter=lfs diff=lfs merge=lfs -text -*.webp filter=lfs diff=lfs merge=lfs -text -# Video files - compressed -*.mp4 filter=lfs diff=lfs merge=lfs -text -*.webm filter=lfs diff=lfs merge=lfs -text -ground_truth/dpr/id_map.json filter=lfs diff=lfs merge=lfs -text -indices/dpr/dpr_diskann.passages.idx filter=lfs diff=lfs merge=lfs -text -indices/dpr/dpr_diskann.passages.jsonl filter=lfs diff=lfs merge=lfs -text -indices/dpr/dpr_diskann_disk.index filter=lfs diff=lfs merge=lfs -text -indices/dpr/leann.labels.map filter=lfs diff=lfs merge=lfs -text -indices/rpj_wiki/leann.labels.map filter=lfs diff=lfs merge=lfs -text -indices/rpj_wiki/rpj_wiki.index filter=lfs diff=lfs merge=lfs -text -indices/rpj_wiki/rpj_wiki.passages.0.idx filter=lfs diff=lfs merge=lfs -text -indices/rpj_wiki/rpj_wiki.passages.0.jsonl filter=lfs diff=lfs merge=lfs -text -indices/rpj_wiki/rpj_wiki.passages.1.idx filter=lfs diff=lfs merge=lfs -text -indices/rpj_wiki/rpj_wiki.passages.1.jsonl filter=lfs diff=lfs merge=lfs -text -indices/rpj_wiki/rpj_wiki.passages.2.idx filter=lfs diff=lfs merge=lfs -text -indices/rpj_wiki/rpj_wiki.passages.2.jsonl filter=lfs diff=lfs merge=lfs -text -indices/rpj_wiki/rpj_wiki.passages.3.idx filter=lfs diff=lfs merge=lfs -text -indices/rpj_wiki/rpj_wiki.passages.3.jsonl filter=lfs diff=lfs merge=lfs -text -indices/rpj_wiki/rpj_wiki.passages.4.idx filter=lfs diff=lfs merge=lfs -text -indices/rpj_wiki/rpj_wiki.passages.4.jsonl filter=lfs diff=lfs merge=lfs -text -indices/rpj_wiki/rpj_wiki.passages.5.idx filter=lfs diff=lfs merge=lfs -text -indices/rpj_wiki/rpj_wiki.passages.5.jsonl filter=lfs diff=lfs merge=lfs -text -indices/rpj_wiki/rpj_wiki.passages.6.idx filter=lfs diff=lfs merge=lfs -text -indices/rpj_wiki/rpj_wiki.passages.6.jsonl filter=lfs diff=lfs merge=lfs -text -indices/rpj_wiki/rpj_wiki.passages.7.idx filter=lfs diff=lfs merge=lfs -text -indices/rpj_wiki/rpj_wiki.passages.7.jsonl filter=lfs diff=lfs merge=lfs -text diff --git a/docs/ast_chunking_guide.md b/docs/ast_chunking_guide.md new file mode 100644 index 0000000..dd5be37 --- /dev/null +++ b/docs/ast_chunking_guide.md @@ -0,0 +1,128 @@ +# AST-Aware Code chunking guide + +## Overview + +This guide covers best practices for using AST-aware code chunking in LEANN. AST chunking provides better semantic understanding of code structure compared to traditional text-based chunking. + +## Quick Start + +### Basic Usage + +```bash +# Enable AST chunking for mixed content (code + docs) +python -m apps.document_rag --enable-code-chunking --data-dir ./my_project + +# Specialized code repository indexing +python -m apps.code_rag --repo-dir ./my_codebase + +# Global CLI with AST support +leann build my-code-index --docs ./src --use-ast-chunking +``` + +### Installation + +```bash +# Install LEANN with AST chunking support +uv pip install -e "." 
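+
+# astchunk and the tree-sitter grammars are declared as dependencies in the
+# root pyproject.toml, so the editable install above already pulls them in.
+# If you manage dependencies yourself, they can also be installed explicitly:
+uv pip install astchunk tree-sitter tree-sitter-python tree-sitter-java tree-sitter-c-sharp tree-sitter-typescript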
+``` + +## Best Practices + +### When to Use AST Chunking + +โœ… **Recommended for:** +- Code repositories with multiple languages +- Mixed documentation and code content +- Complex codebases with deep function/class hierarchies +- When working with Claude Code for code assistance + +โŒ **Not recommended for:** +- Pure text documents +- Very large files (>1MB) +- Languages not supported by tree-sitter + +### Optimal Configuration + +```bash +# Recommended settings for most codebases +python -m apps.code_rag \ + --repo-dir ./src \ + --ast-chunk-size 768 \ + --ast-chunk-overlap 96 \ + --exclude-dirs .git __pycache__ node_modules build dist +``` + +### Supported Languages + +| Extension | Language | Status | +|-----------|----------|--------| +| `.py` | Python | โœ… Full support | +| `.java` | Java | โœ… Full support | +| `.cs` | C# | โœ… Full support | +| `.ts`, `.tsx` | TypeScript | โœ… Full support | +| `.js`, `.jsx` | JavaScript | โœ… Via TypeScript parser | + +## Integration Examples + +### Document RAG with Code Support + +```python +# Enable code chunking in document RAG +python -m apps.document_rag \ + --enable-code-chunking \ + --data-dir ./project \ + --query "How does authentication work in the codebase?" +``` + +### Claude Code Integration + +When using with Claude Code MCP server, AST chunking provides better context for: +- Code completion and suggestions +- Bug analysis and debugging +- Architecture understanding +- Refactoring assistance + +## Troubleshooting + +### Common Issues + +1. **Fallback to Traditional Chunking** + - Normal behavior for unsupported languages + - Check logs for specific language support + +2. **Performance with Large Files** + - Adjust `--max-file-size` parameter + - Use `--exclude-dirs` to skip unnecessary directories + +3. **Quality Issues** + - Try different `--ast-chunk-size` values (512, 768, 1024) + - Adjust overlap for better context preservation + +### Debug Mode + +```bash +export LEANN_LOG_LEVEL=DEBUG +python -m apps.code_rag --repo-dir ./my_code +``` + +## Migration from Traditional Chunking + +Existing workflows continue to work without changes. To enable AST chunking: + +```bash +# Before +python -m apps.document_rag --chunk-size 256 + +# After (maintains traditional chunking for non-code files) +python -m apps.document_rag --enable-code-chunking --chunk-size 256 --ast-chunk-size 768 +``` + +## References + +- [astchunk GitHub Repository](https://github.com/yilinjz/astchunk) +- [LEANN MCP Integration](../packages/leann-mcp/README.md) +- [Research Paper](https://arxiv.org/html/2506.15655v1) + +--- + +**Note**: AST chunking maintains full backward compatibility while enhancing code understanding capabilities. 
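+
+## Appendix: Programmatic Usage
+
+The chunking helpers used by the bundled apps can also be called directly from
+Python. The sketch below is a minimal example under a couple of assumptions: the
+`apps/` directory is added to `sys.path` manually (the bundled apps do this
+themselves), and `./my_project` is a placeholder path for your own repository.
+
+```python
+import sys
+from pathlib import Path
+
+# Make the bundled apps/ package importable (adjust the path to your checkout).
+sys.path.insert(0, str(Path("apps")))
+
+from llama_index.core import SimpleDirectoryReader
+
+from chunking import create_text_chunks  # apps/chunking
+
+# Load a mixed repository of code and docs.
+documents = SimpleDirectoryReader("./my_project", recursive=True).load_data()
+
+# Route code files through AST chunking; everything else goes through the
+# SentenceSplitter-based traditional chunking.
+chunks = create_text_chunks(
+    documents,
+    chunk_size=256,
+    chunk_overlap=128,
+    use_ast_chunking=True,
+    ast_chunk_size=768,
+    ast_chunk_overlap=96,
+    ast_fallback_traditional=True,
+)
+print(f"Created {len(chunks)} chunks")
+```
+
+The resulting list of text chunks can then be passed to `LeannBuilder` the same
+way the example apps do.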
diff --git a/docs/features.md b/docs/features.md index da4e495..0a7f9dc 100644 --- a/docs/features.md +++ b/docs/features.md @@ -3,6 +3,7 @@ ## ๐Ÿ”ฅ Core Features - **๐Ÿ”„ Real-time Embeddings** - Eliminate heavy embedding storage with dynamic computation using optimized ZMQ servers and highly optimized search paradigm (overlapping and batching) with highly optimized embedding engine +- **๐Ÿง  AST-Aware Code Chunking** - Intelligent code chunking that preserves semantic boundaries (functions, classes, methods) for Python, Java, C#, and TypeScript files - **๐Ÿ“ˆ Scalable Architecture** - Handles millions of documents on consumer hardware; the larger your dataset, the more LEANN can save - **๐ŸŽฏ Graph Pruning** - Advanced techniques to minimize the storage overhead of vector search to a limited footprint - **๐Ÿ—๏ธ Pluggable Backends** - HNSW/FAISS (default), with optional DiskANN for large-scale deployments diff --git a/packages/leann-backend-diskann/leann_backend_diskann/diskann_embedding_server.py b/packages/leann-backend-diskann/leann_backend_diskann/diskann_embedding_server.py index 2b1e326..8389ddf 100644 --- a/packages/leann-backend-diskann/leann_backend_diskann/diskann_embedding_server.py +++ b/packages/leann-backend-diskann/leann_backend_diskann/diskann_embedding_server.py @@ -83,9 +83,7 @@ def create_diskann_embedding_server( logger.info(f"Loading PassageManager with metadata_file_path: {passages_file}") passages = PassageManager(meta["passage_sources"], metadata_file_path=passages_file) - logger.info( - f"Loaded PassageManager with {len(passages.global_offset_map)} passages from metadata" - ) + logger.info(f"Loaded PassageManager with {len(passages)} passages from metadata") # Import protobuf after ensuring the path is correct try: diff --git a/packages/leann-backend-diskann/pyproject.toml b/packages/leann-backend-diskann/pyproject.toml index 68abe56..a98396a 100644 --- a/packages/leann-backend-diskann/pyproject.toml +++ b/packages/leann-backend-diskann/pyproject.toml @@ -4,8 +4,8 @@ build-backend = "scikit_build_core.build" [project] name = "leann-backend-diskann" -version = "0.3.0" -dependencies = ["leann-core==0.3.0", "numpy", "protobuf>=3.19.0"] +version = "0.3.2" +dependencies = ["leann-core==0.3.2", "numpy", "protobuf>=3.19.0"] [tool.scikit-build] # Key: simplified CMake path diff --git a/packages/leann-backend-hnsw/leann_backend_hnsw/hnsw_embedding_server.py b/packages/leann-backend-hnsw/leann_backend_hnsw/hnsw_embedding_server.py index 7c472ad..1a2dc29 100644 --- a/packages/leann-backend-hnsw/leann_backend_hnsw/hnsw_embedding_server.py +++ b/packages/leann-backend-hnsw/leann_backend_hnsw/hnsw_embedding_server.py @@ -90,9 +90,7 @@ def create_hnsw_embedding_server( embedding_dim: int = int(meta.get("dimensions", 0)) except Exception: embedding_dim = 0 - logger.info( - f"Loaded PassageManager with {len(passages.global_offset_map)} passages from metadata" - ) + logger.info(f"Loaded PassageManager with {len(passages)} passages from metadata") # (legacy ZMQ thread removed; using shutdown-capable server only) diff --git a/packages/leann-backend-hnsw/pyproject.toml b/packages/leann-backend-hnsw/pyproject.toml index cf37549..0543bb3 100644 --- a/packages/leann-backend-hnsw/pyproject.toml +++ b/packages/leann-backend-hnsw/pyproject.toml @@ -6,10 +6,10 @@ build-backend = "scikit_build_core.build" [project] name = "leann-backend-hnsw" -version = "0.3.0" +version = "0.3.2" description = "Custom-built HNSW (Faiss) backend for the Leann toolkit." 
dependencies = [ - "leann-core==0.3.0", + "leann-core==0.3.2", "numpy", "pyzmq>=23.0.0", "msgpack>=1.0.0", diff --git a/packages/leann-core/pyproject.toml b/packages/leann-core/pyproject.toml index 1879513..c47aa90 100644 --- a/packages/leann-core/pyproject.toml +++ b/packages/leann-core/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "leann-core" -version = "0.3.0" +version = "0.3.2" description = "Core API and plugin system for LEANN" readme = "README.md" requires-python = ">=3.9" diff --git a/packages/leann-core/src/leann/api.py b/packages/leann-core/src/leann/api.py index d1e66ec..47f1d5a 100644 --- a/packages/leann-core/src/leann/api.py +++ b/packages/leann-core/src/leann/api.py @@ -119,9 +119,12 @@ class PassageManager: def __init__( self, passage_sources: list[dict[str, Any]], metadata_file_path: Optional[str] = None ): - self.offset_maps = {} - self.passage_files = {} - self.global_offset_map = {} # Combined map for fast lookup + self.offset_maps: dict[str, dict[str, int]] = {} + self.passage_files: dict[str, str] = {} + # Avoid materializing a single gigantic global map to reduce memory + # footprint on very large corpora (e.g., 60M+ passages). Instead, keep + # per-shard maps and do a lightweight per-shard lookup on demand. + self._total_count: int = 0 # Derive index base name for standard sibling fallbacks, e.g., .passages.* index_name_base = None @@ -142,12 +145,25 @@ class PassageManager: default_name: Optional[str], source_dict: dict[str, Any], ) -> list[Path]: + """ + Build an ordered list of candidate paths. For relative paths specified in + metadata, prefer resolution relative to the metadata file directory first, + then fall back to CWD-based resolution, and finally to conventional + sibling defaults (e.g., .passages.idx / .jsonl). + """ candidates: list[Path] = [] - # 1) Primary as-is (absolute or relative) + # 1) Primary path if primary: p = Path(primary) - candidates.append(p if p.is_absolute() else (Path.cwd() / p)) - # 2) metadata-relative explicit relative key + if p.is_absolute(): + candidates.append(p) + else: + # Prefer metadata-relative resolution for relative paths + if metadata_file_path: + candidates.append(Path(metadata_file_path).parent / p) + # Also consider CWD-relative as a fallback for legacy layouts + candidates.append(Path.cwd() / p) + # 2) metadata-relative explicit relative key (if present) if metadata_file_path and source_dict.get(relative_key): candidates.append(Path(metadata_file_path).parent / source_dict[relative_key]) # 3) metadata-relative standard sibling filename @@ -177,23 +193,28 @@ class PassageManager: raise FileNotFoundError(f"Passage index file not found: {index_file}") with open(index_file, "rb") as f: - offset_map = pickle.load(f) + offset_map: dict[str, int] = pickle.load(f) self.offset_maps[passage_file] = offset_map self.passage_files[passage_file] = passage_file - - # Build global map for O(1) lookup - for passage_id, offset in offset_map.items(): - self.global_offset_map[passage_id] = (passage_file, offset) + self._total_count += len(offset_map) def get_passage(self, passage_id: str) -> dict[str, Any]: - if passage_id in self.global_offset_map: - passage_file, offset = self.global_offset_map[passage_id] - # Lazy file opening - only open when needed - with open(passage_file, encoding="utf-8") as f: - f.seek(offset) - return json.loads(f.readline()) + # Fast path: check each shard map (there are typically few shards). 
+ # This avoids building a massive combined dict while keeping lookups + # bounded by the number of shards. + for passage_file, offset_map in self.offset_maps.items(): + try: + offset = offset_map[passage_id] + with open(passage_file, encoding="utf-8") as f: + f.seek(offset) + return json.loads(f.readline()) + except KeyError: + continue raise KeyError(f"Passage ID not found: {passage_id}") + def __len__(self) -> int: + return self._total_count + class LeannBuilder: def __init__( @@ -587,7 +608,9 @@ class LeannSearcher: logger.info(f" Additional kwargs: {kwargs}") # Smart top_k detection and adjustment - total_docs = len(self.passage_manager.global_offset_map) + # Use PassageManager length (sum of shard sizes) to avoid + # depending on a massive combined map + total_docs = len(self.passage_manager) original_top_k = top_k if top_k > total_docs: top_k = total_docs diff --git a/packages/leann-core/src/leann/cli.py b/packages/leann-core/src/leann/cli.py index 36a03af..5a2611a 100644 --- a/packages/leann-core/src/leann/cli.py +++ b/packages/leann-core/src/leann/cli.py @@ -1,7 +1,8 @@ import argparse import asyncio +import sys from pathlib import Path -from typing import Optional, Union +from typing import Any, Optional, Union from llama_index.core import SimpleDirectoryReader from llama_index.core.node_parser import SentenceSplitter @@ -180,6 +181,29 @@ Examples: default=50, help="Code chunk overlap (default: 50)", ) + build_parser.add_argument( + "--use-ast-chunking", + action="store_true", + help="Enable AST-aware chunking for code files (requires astchunk)", + ) + build_parser.add_argument( + "--ast-chunk-size", + type=int, + default=768, + help="AST chunk size in characters (default: 768)", + ) + build_parser.add_argument( + "--ast-chunk-overlap", + type=int, + default=96, + help="AST chunk overlap in characters (default: 96)", + ) + build_parser.add_argument( + "--ast-fallback-traditional", + action="store_true", + default=True, + help="Fall back to traditional chunking if AST chunking fails (default: True)", + ) # Search command search_parser = subparsers.add_parser("search", help="Search documents") @@ -206,6 +230,11 @@ Examples: default="global", help="Pruning strategy (default: global)", ) + search_parser.add_argument( + "--non-interactive", + action="store_true", + help="Non-interactive mode: automatically select index without prompting", + ) # Ask command ask_parser = subparsers.add_parser("ask", help="Ask questions") @@ -405,13 +434,9 @@ Examples: print("๐Ÿ’ก Get started:") print(" leann build my-docs --docs ./documents") else: - projects_count = len( - [ - p - for p in valid_projects - if (p / ".leann" / "indexes").exists() - and list((p / ".leann" / "indexes").iterdir()) - ] + # Count only projects that have at least one discoverable index + projects_count = sum( + 1 for p in valid_projects if len(self._discover_indexes_in_project(p)) > 0 ) print(f"๐Ÿ“Š Total: {total_indexes} indexes across {projects_count} projects") @@ -461,26 +486,35 @@ Examples: ) # 2. 
Apps format: *.leann.meta.json files anywhere in the project + cli_indexes_dir = project_path / ".leann" / "indexes" for meta_file in project_path.rglob("*.leann.meta.json"): if meta_file.is_file(): - # Extract index name from filename (remove .leann.meta.json extension) - index_name = meta_file.name.replace(".leann.meta.json", "") + # Skip CLI-built indexes (which store meta under .leann/indexes//) + try: + if cli_indexes_dir.exists() and cli_indexes_dir in meta_file.parents: + continue + except Exception: + pass + # Use the parent directory name as the app index display name + display_name = meta_file.parent.name + # Extract file base used to store files + file_base = meta_file.name.replace(".leann.meta.json", "") # Apps indexes are considered complete if the .leann.meta.json file exists status = "โœ…" - # Calculate total size of all related files + # Calculate total size of all related files (use file base) size_mb = 0 try: index_dir = meta_file.parent - for related_file in index_dir.glob(f"{index_name}.leann*"): + for related_file in index_dir.glob(f"{file_base}.leann*"): size_mb += related_file.stat().st_size / (1024 * 1024) except (OSError, PermissionError): pass indexes.append( { - "name": index_name, + "name": display_name, "type": "app", "status": status, "size_mb": size_mb, @@ -534,13 +568,79 @@ Examples: if not project_path.exists(): continue + # 1) CLI-format index under .leann/indexes/ index_dir = project_path / ".leann" / "indexes" / index_name if index_dir.exists(): is_current = project_path == current_path matches.append( - {"project_path": project_path, "index_dir": index_dir, "is_current": is_current} + { + "project_path": project_path, + "index_dir": index_dir, + "is_current": is_current, + "kind": "cli", + } ) + # 2) App-format indexes + # We support two ways of addressing apps: + # a) by the file base (e.g., `pdf_documents`) + # b) by the parent directory name (e.g., `new_txt`) + seen_app_meta = set() + + # 2a) by file base + for meta_file in project_path.rglob(f"{index_name}.leann.meta.json"): + if meta_file.is_file(): + # Skip CLI-built indexes' meta under .leann/indexes + try: + cli_indexes_dir = project_path / ".leann" / "indexes" + if cli_indexes_dir.exists() and cli_indexes_dir in meta_file.parents: + continue + except Exception: + pass + is_current = project_path == current_path + key = (str(project_path), str(meta_file)) + if key in seen_app_meta: + continue + seen_app_meta.add(key) + matches.append( + { + "project_path": project_path, + "files_dir": meta_file.parent, + "meta_file": meta_file, + "is_current": is_current, + "kind": "app", + "display_name": meta_file.parent.name, + "file_base": meta_file.name.replace(".leann.meta.json", ""), + } + ) + + # 2b) by parent directory name + for meta_file in project_path.rglob("*.leann.meta.json"): + if meta_file.is_file() and meta_file.parent.name == index_name: + # Skip CLI-built indexes' meta under .leann/indexes + try: + cli_indexes_dir = project_path / ".leann" / "indexes" + if cli_indexes_dir.exists() and cli_indexes_dir in meta_file.parents: + continue + except Exception: + pass + is_current = project_path == current_path + key = (str(project_path), str(meta_file)) + if key in seen_app_meta: + continue + seen_app_meta.add(key) + matches.append( + { + "project_path": project_path, + "files_dir": meta_file.parent, + "meta_file": meta_file, + "is_current": is_current, + "kind": "app", + "display_name": meta_file.parent.name, + "file_base": meta_file.name.replace(".leann.meta.json", ""), + } + ) + # Sort: current 
project first, then by project name matches.sort(key=lambda x: (not x["is_current"], x["project_path"].name)) return matches @@ -548,8 +648,8 @@ Examples: def _remove_single_match(self, match, index_name: str, force: bool): """Handle removal when only one match is found""" project_path = match["project_path"] - index_dir = match["index_dir"] is_current = match["is_current"] + kind = match.get("kind", "cli") if is_current: location_info = "current project" @@ -560,7 +660,10 @@ Examples: print(f"โœ… Found 1 index named '{index_name}':") print(f" {emoji} Location: {location_info}") - print(f" ๐Ÿ“ Path: {project_path}") + if kind == "cli": + print(f" ๐Ÿ“ Path: {project_path / '.leann' / 'indexes' / index_name}") + else: + print(f" ๐Ÿ“ Meta: {match['meta_file']}") if not force: if not is_current: @@ -572,9 +675,22 @@ Examples: print(" โŒ Removal cancelled.") return False - return self._delete_index_directory( - index_dir, index_name, project_path if not is_current else None - ) + if kind == "cli": + return self._delete_index_directory( + match["index_dir"], + index_name, + project_path if not is_current else None, + is_app=False, + ) + else: + return self._delete_index_directory( + match["files_dir"], + match.get("display_name", index_name), + project_path if not is_current else None, + is_app=True, + meta_file=match.get("meta_file"), + app_file_base=match.get("file_base"), + ) def _remove_from_multiple_matches(self, matches, index_name: str, force: bool): """Handle removal when multiple matches are found""" @@ -585,19 +701,34 @@ Examples: for i, match in enumerate(matches, 1): project_path = match["project_path"] is_current = match["is_current"] + kind = match.get("kind", "cli") if is_current: - print(f" {i}. ๐Ÿ  Current project") - print(f" ๐Ÿ“ {project_path}") + print(f" {i}. ๐Ÿ  Current project ({'CLI' if kind == 'cli' else 'APP'})") else: - print(f" {i}. ๐Ÿ“‚ {project_path.name}") - print(f" ๐Ÿ“ {project_path}") + print(f" {i}. ๐Ÿ“‚ {project_path.name} ({'CLI' if kind == 'cli' else 'APP'})") + + # Show path details + if kind == "cli": + print(f" ๐Ÿ“ {project_path / '.leann' / 'indexes' / index_name}") + else: + print(f" ๐Ÿ“ {match['meta_file']}") # Show size info try: - size_mb = sum( - f.stat().st_size for f in match["index_dir"].iterdir() if f.is_file() - ) / (1024 * 1024) + if kind == "cli": + size_mb = sum( + f.stat().st_size for f in match["index_dir"].iterdir() if f.is_file() + ) / (1024 * 1024) + else: + file_base = match.get("file_base") + size_mb = 0.0 + if file_base: + size_mb = sum( + f.stat().st_size + for f in match["files_dir"].glob(f"{file_base}.leann*") + if f.is_file() + ) / (1024 * 1024) print(f" ๐Ÿ“ฆ Size: {size_mb:.1f} MB") except (OSError, PermissionError): pass @@ -621,8 +752,8 @@ Examples: if 0 <= choice_idx < len(matches): selected_match = matches[choice_idx] project_path = selected_match["project_path"] - index_dir = selected_match["index_dir"] is_current = selected_match["is_current"] + kind = selected_match.get("kind", "cli") location = "current project" if is_current else f"'{project_path.name}' project" print(f" ๐ŸŽฏ Selected: Remove from {location}") @@ -635,9 +766,22 @@ Examples: print(" โŒ Confirmation failed. 
Removal cancelled.") return False - return self._delete_index_directory( - index_dir, index_name, project_path if not is_current else None - ) + if kind == "cli": + return self._delete_index_directory( + selected_match["index_dir"], + index_name, + project_path if not is_current else None, + is_app=False, + ) + else: + return self._delete_index_directory( + selected_match["files_dir"], + selected_match.get("display_name", index_name), + project_path if not is_current else None, + is_app=True, + meta_file=selected_match.get("meta_file"), + app_file_base=selected_match.get("file_base"), + ) else: print(" โŒ Invalid choice. Removal cancelled.") return False @@ -647,21 +791,65 @@ Examples: return False def _delete_index_directory( - self, index_dir: Path, index_name: str, project_path: Optional[Path] = None + self, + index_dir: Path, + index_display_name: str, + project_path: Optional[Path] = None, + is_app: bool = False, + meta_file: Optional[Path] = None, + app_file_base: Optional[str] = None, ): - """Actually delete the index directory""" + """Delete a CLI index directory or APP index files safely.""" try: - import shutil + if is_app: + removed = 0 + errors = 0 + # Delete only files that belong to this app index (based on file base) + pattern_base = app_file_base or "" + for f in index_dir.glob(f"{pattern_base}.leann*"): + try: + f.unlink() + removed += 1 + except Exception: + errors += 1 + # Best-effort: also remove the meta file if specified and still exists + if meta_file and meta_file.exists(): + try: + meta_file.unlink() + removed += 1 + except Exception: + errors += 1 - shutil.rmtree(index_dir) - - if project_path: - print(f"โœ… Index '{index_name}' removed from {project_path.name}") + if removed > 0 and errors == 0: + if project_path: + print( + f"โœ… App index '{index_display_name}' removed from {project_path.name}" + ) + else: + print(f"โœ… App index '{index_display_name}' removed successfully") + return True + elif removed > 0 and errors > 0: + print( + f"โš ๏ธ App index '{index_display_name}' partially removed (some files couldn't be deleted)" + ) + return True + else: + print( + f"โŒ No files found to remove for app index '{index_display_name}' in {index_dir}" + ) + return False else: - print(f"โœ… Index '{index_name}' removed successfully") - return True + import shutil + + shutil.rmtree(index_dir) + + if project_path: + print(f"โœ… Index '{index_display_name}' removed from {project_path.name}") + else: + print(f"โœ… Index '{index_display_name}' removed successfully") + return True except Exception as e: - print(f"โŒ Error removing index '{index_name}': {e}") + print(f"โŒ Error removing index '{index_display_name}': {e}") return False def load_documents( @@ -669,6 +857,7 @@ Examples: docs_paths: Union[str, list], custom_file_types: Union[str, None] = None, include_hidden: bool = False, + args: Optional[dict[str, Any]] = None, ): # Handle both single path (string) and multiple paths (list) for backward compatibility if isinstance(docs_paths, str): @@ -974,18 +1163,50 @@ Examples: } print("start chunking documents") - # Add progress bar for document chunking - for doc in tqdm(documents, desc="Chunking documents", unit="doc"): - # Check if this is a code file based on source path - source_path = doc.metadata.get("source", "") - is_code_file = any(source_path.endswith(ext) for ext in code_file_exts) - # Use appropriate parser based on file type - parser = self.code_parser if is_code_file else self.node_parser - nodes = parser.get_nodes_from_documents([doc]) + # Check if AST 
chunking is requested + use_ast = getattr(args, "use_ast_chunking", False) - for node in nodes: - all_texts.append(node.get_content()) + if use_ast: + print("๐Ÿง  Using AST-aware chunking for code files") + try: + # Import enhanced chunking utilities + # Add apps directory to path to import chunking utilities + apps_dir = Path(__file__).parent.parent.parent.parent.parent / "apps" + if apps_dir.exists(): + sys.path.insert(0, str(apps_dir)) + + from chunking import create_text_chunks + + # Use enhanced chunking with AST support + all_texts = create_text_chunks( + documents, + chunk_size=self.node_parser.chunk_size, + chunk_overlap=self.node_parser.chunk_overlap, + use_ast_chunking=True, + ast_chunk_size=getattr(args, "ast_chunk_size", 768), + ast_chunk_overlap=getattr(args, "ast_chunk_overlap", 96), + code_file_extensions=None, # Use defaults + ast_fallback_traditional=getattr(args, "ast_fallback_traditional", True), + ) + + except ImportError as e: + print(f"โš ๏ธ AST chunking not available ({e}), falling back to traditional chunking") + use_ast = False + + if not use_ast: + # Use traditional chunking logic + for doc in tqdm(documents, desc="Chunking documents", unit="doc"): + # Check if this is a code file based on source path + source_path = doc.metadata.get("source", "") + is_code_file = any(source_path.endswith(ext) for ext in code_file_exts) + + # Use appropriate parser based on file type + parser = self.code_parser if is_code_file else self.node_parser + nodes = parser.get_nodes_from_documents([doc]) + + for node in nodes: + all_texts.append(node.get_content()) print(f"Loaded {len(documents)} documents, {len(all_texts)} chunks") return all_texts @@ -1052,7 +1273,7 @@ Examples: ) all_texts = self.load_documents( - docs_paths, args.file_types, include_hidden=args.include_hidden + docs_paths, args.file_types, include_hidden=args.include_hidden, args=args ) if not all_texts: print("No documents found") @@ -1085,13 +1306,101 @@ Examples: async def search_documents(self, args): index_name = args.index_name query = args.query - index_path = self.get_index_path(index_name) - if not self.index_exists(index_name): - print( - f"Index '{index_name}' not found. Use 'leann build {index_name} --docs [ ...]' to create it." - ) - return + # First try to find the index in current project + index_path = self.get_index_path(index_name) + if self.index_exists(index_name): + # Found in current project, use it + pass + else: + # Search across all registered projects (like list_indexes does) + all_matches = self._find_all_matching_indexes(index_name) + if not all_matches: + print( + f"Index '{index_name}' not found. Use 'leann build {index_name} --docs [ ...]' to create it." 
+ ) + return + elif len(all_matches) == 1: + # Found exactly one match, use it + match = all_matches[0] + if match["kind"] == "cli": + index_path = str(match["index_dir"] / "documents.leann") + else: + # App format: use the meta file to construct the path + meta_file = match["meta_file"] + file_base = match["file_base"] + index_path = str(meta_file.parent / f"{file_base}.leann") + + project_info = ( + "current project" + if match["is_current"] + else f"project '{match['project_path'].name}'" + ) + print(f"Using index '{index_name}' from {project_info}") + else: + # Multiple matches found + if args.non_interactive: + # Non-interactive mode: automatically select the best match + # Priority: current project first, then first available + current_matches = [m for m in all_matches if m["is_current"]] + if current_matches: + match = current_matches[0] + location_desc = "current project" + else: + match = all_matches[0] + location_desc = f"project '{match['project_path'].name}'" + + if match["kind"] == "cli": + index_path = str(match["index_dir"] / "documents.leann") + else: + meta_file = match["meta_file"] + file_base = match["file_base"] + index_path = str(meta_file.parent / f"{file_base}.leann") + + print( + f"Found {len(all_matches)} indexes named '{index_name}', using index from {location_desc}" + ) + else: + # Interactive mode: ask user to choose + print(f"Found {len(all_matches)} indexes named '{index_name}':") + for i, match in enumerate(all_matches, 1): + project_path = match["project_path"] + is_current = match["is_current"] + kind = match.get("kind", "cli") + + if is_current: + print( + f" {i}. ๐Ÿ  Current project ({'CLI' if kind == 'cli' else 'APP'})" + ) + else: + print( + f" {i}. ๐Ÿ“‚ {project_path.name} ({'CLI' if kind == 'cli' else 'APP'})" + ) + + try: + choice = input(f"Which index to search? (1-{len(all_matches)}): ").strip() + choice_idx = int(choice) - 1 + if 0 <= choice_idx < len(all_matches): + match = all_matches[choice_idx] + if match["kind"] == "cli": + index_path = str(match["index_dir"] / "documents.leann") + else: + meta_file = match["meta_file"] + file_base = match["file_base"] + index_path = str(meta_file.parent / f"{file_base}.leann") + + project_info = ( + "current project" + if match["is_current"] + else f"project '{match['project_path'].name}'" + ) + print(f"Using index '{index_name}' from {project_info}") + else: + print("Invalid choice. Aborting search.") + return + except (ValueError, KeyboardInterrupt): + print("Invalid input. 
Aborting search.") + return searcher = LeannSearcher(index_path=index_path) results = searcher.search( diff --git a/packages/leann-core/src/leann/embedding_server_manager.py b/packages/leann-core/src/leann/embedding_server_manager.py index 05c8639..3d7c31e 100644 --- a/packages/leann-core/src/leann/embedding_server_manager.py +++ b/packages/leann-core/src/leann/embedding_server_manager.py @@ -192,6 +192,7 @@ class EmbeddingServerManager: stderr_target = None # Direct to console for visible logs # Start embedding server subprocess + logger.info(f"Starting server process with command: {' '.join(command)}") self.server_process = subprocess.Popen( command, cwd=project_root, diff --git a/packages/leann-core/src/leann/mcp.py b/packages/leann-core/src/leann/mcp.py index a413883..d057788 100755 --- a/packages/leann-core/src/leann/mcp.py +++ b/packages/leann-core/src/leann/mcp.py @@ -94,7 +94,7 @@ def handle_request(request): }, } - # Build simplified command + # Build simplified command with non-interactive flag for MCP compatibility cmd = [ "leann", "search", @@ -102,6 +102,7 @@ def handle_request(request): args["query"], f"--top-k={args.get('top_k', 5)}", f"--complexity={args.get('complexity', 32)}", + "--non-interactive", ] result = subprocess.run(cmd, capture_output=True, text=True) diff --git a/packages/leann/pyproject.toml b/packages/leann/pyproject.toml index 5013060..52d0bc9 100644 --- a/packages/leann/pyproject.toml +++ b/packages/leann/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "leann" -version = "0.3.0" +version = "0.3.2" description = "LEANN - The smallest vector index in the world. RAG Everything with LEANN!" readme = "README.md" requires-python = ">=3.9" diff --git a/pyproject.toml b/pyproject.toml index 3267332..d738017 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,6 +46,13 @@ dependencies = [ "pathspec>=0.12.1", "nbconvert>=7.16.6", "gitignore-parser>=0.1.12", + # AST-aware code chunking dependencies + "astchunk>=0.1.0", + "tree-sitter>=0.20.0", + "tree-sitter-python>=0.20.0", + "tree-sitter-java>=0.20.0", + "tree-sitter-c-sharp>=0.20.0", + "tree-sitter-typescript>=0.20.0", ] [project.optional-dependencies] diff --git a/tests/test_astchunk_integration.py b/tests/test_astchunk_integration.py new file mode 100644 index 0000000..df34521 --- /dev/null +++ b/tests/test_astchunk_integration.py @@ -0,0 +1,397 @@ +""" +Test suite for astchunk integration with LEANN. +Tests AST-aware chunking functionality, language detection, and fallback mechanisms. 
+""" + +import os +import subprocess +import sys +import tempfile +from pathlib import Path +from unittest.mock import patch + +import pytest + +# Add apps directory to path for imports +sys.path.insert(0, str(Path(__file__).parent.parent / "apps")) + +from typing import Optional + +from chunking import ( + create_ast_chunks, + create_text_chunks, + create_traditional_chunks, + detect_code_files, + get_language_from_extension, +) + + +class MockDocument: + """Mock LlamaIndex Document for testing.""" + + def __init__(self, content: str, file_path: str = "", metadata: Optional[dict] = None): + self.content = content + self.metadata = metadata or {} + if file_path: + self.metadata["file_path"] = file_path + + def get_content(self) -> str: + return self.content + + +class TestCodeFileDetection: + """Test code file detection and language mapping.""" + + def test_detect_code_files_python(self): + """Test detection of Python files.""" + docs = [ + MockDocument("print('hello')", "/path/to/file.py"), + MockDocument("This is text", "/path/to/file.txt"), + ] + + code_docs, text_docs = detect_code_files(docs) + + assert len(code_docs) == 1 + assert len(text_docs) == 1 + assert code_docs[0].metadata["language"] == "python" + assert code_docs[0].metadata["is_code"] is True + assert text_docs[0].metadata["is_code"] is False + + def test_detect_code_files_multiple_languages(self): + """Test detection of multiple programming languages.""" + docs = [ + MockDocument("def func():", "/path/to/script.py"), + MockDocument("public class Test {}", "/path/to/Test.java"), + MockDocument("interface ITest {}", "/path/to/test.ts"), + MockDocument("using System;", "/path/to/Program.cs"), + MockDocument("Regular text content", "/path/to/document.txt"), + ] + + code_docs, text_docs = detect_code_files(docs) + + assert len(code_docs) == 4 + assert len(text_docs) == 1 + + languages = [doc.metadata["language"] for doc in code_docs] + assert "python" in languages + assert "java" in languages + assert "typescript" in languages + assert "csharp" in languages + + def test_detect_code_files_no_file_path(self): + """Test handling of documents without file paths.""" + docs = [ + MockDocument("some content"), + MockDocument("other content", metadata={"some_key": "value"}), + ] + + code_docs, text_docs = detect_code_files(docs) + + assert len(code_docs) == 0 + assert len(text_docs) == 2 + for doc in text_docs: + assert doc.metadata["is_code"] is False + + def test_get_language_from_extension(self): + """Test language detection from file extensions.""" + assert get_language_from_extension("test.py") == "python" + assert get_language_from_extension("Test.java") == "java" + assert get_language_from_extension("component.tsx") == "typescript" + assert get_language_from_extension("Program.cs") == "csharp" + assert get_language_from_extension("document.txt") is None + assert get_language_from_extension("") is None + + +class TestChunkingFunctions: + """Test various chunking functionality.""" + + def test_create_traditional_chunks(self): + """Test traditional text chunking.""" + docs = [ + MockDocument( + "This is a test document. It has multiple sentences. We want to test chunking." 
+            )
+        ]
+
+        chunks = create_traditional_chunks(docs, chunk_size=50, chunk_overlap=10)
+
+        assert len(chunks) > 0
+        assert all(isinstance(chunk, str) for chunk in chunks)
+        assert all(len(chunk.strip()) > 0 for chunk in chunks)
+
+    def test_create_traditional_chunks_empty_docs(self):
+        """Test traditional chunking with empty documents."""
+        chunks = create_traditional_chunks([], chunk_size=50, chunk_overlap=10)
+        assert chunks == []
+
+    @pytest.mark.skipif(
+        os.environ.get("CI") == "true",
+        reason="Skip astchunk tests in CI - dependency may not be available",
+    )
+    def test_create_ast_chunks_with_astchunk_available(self):
+        """Test AST chunking when astchunk is available."""
+        python_code = '''
+def hello_world():
+    """Print hello world message."""
+    print("Hello, World!")
+
+def add_numbers(a, b):
+    """Add two numbers and return the result."""
+    return a + b
+
+class Calculator:
+    """A simple calculator class."""
+
+    def __init__(self):
+        self.history = []
+
+    def add(self, a, b):
+        result = a + b
+        self.history.append(f"{a} + {b} = {result}")
+        return result
+'''
+
+        docs = [MockDocument(python_code, "/test/calculator.py", {"language": "python"})]
+
+        try:
+            chunks = create_ast_chunks(docs, max_chunk_size=200, chunk_overlap=50)
+
+            # Should have multiple chunks due to different functions/classes
+            assert len(chunks) > 0
+            assert all(isinstance(chunk, str) for chunk in chunks)
+            assert all(len(chunk.strip()) > 0 for chunk in chunks)
+
+            # Check that code structure is somewhat preserved
+            combined_content = " ".join(chunks)
+            assert "def hello_world" in combined_content
+            assert "class Calculator" in combined_content
+
+        except ImportError:
+            # astchunk not available, should fall back to traditional chunking
+            chunks = create_ast_chunks(docs, max_chunk_size=200, chunk_overlap=50)
+            assert len(chunks) > 0  # Should still get chunks from fallback
+
+    def test_create_ast_chunks_fallback_to_traditional(self):
+        """Test AST chunking falls back to traditional when astchunk is not available."""
+        docs = [MockDocument("def test(): pass", "/test/script.py", {"language": "python"})]
+
+        # Simulate astchunk being unavailable so the ImportError fallback path runs
+        # (assumes create_ast_chunks imports astchunk lazily inside the function)
+        with patch.dict(sys.modules, {"astchunk": None}):
+            chunks = create_ast_chunks(docs)
+
+        # Should still return chunks via the traditional-chunking fallback
+        assert isinstance(chunks, list)
+        assert all(isinstance(chunk, str) for chunk in chunks)
+
+    def test_create_text_chunks_traditional_mode(self):
+        """Test text chunking in traditional mode."""
+        docs = [
+            MockDocument("def test(): pass", "/test/script.py"),
+            MockDocument("This is regular text.", "/test/doc.txt"),
+        ]
+
+        chunks = create_text_chunks(docs, use_ast_chunking=False, chunk_size=50, chunk_overlap=10)
+
+        assert len(chunks) > 0
+        assert all(isinstance(chunk, str) for chunk in chunks)
+
+    def test_create_text_chunks_ast_mode(self):
+        """Test text chunking in AST mode."""
+        docs = [
+            MockDocument("def test(): pass", "/test/script.py"),
+            MockDocument("This is regular text.", "/test/doc.txt"),
+        ]
+
+        chunks = create_text_chunks(
+            docs,
+            use_ast_chunking=True,
+            ast_chunk_size=100,
+            ast_chunk_overlap=20,
+            chunk_size=50,
+            chunk_overlap=10,
+        )
+
+        assert len(chunks) > 0
+        assert all(isinstance(chunk, str) for chunk in chunks)
+
+    def test_create_text_chunks_custom_extensions(self):
+        """Test text chunking with custom code file extensions."""
+        docs = [
+            MockDocument("function test() {}", "/test/script.js"),  # Not in default extensions
+
MockDocument("Regular text", "/test/doc.txt"), + ] + + # First without custom extensions - should treat .js as text + chunks_without = create_text_chunks(docs, use_ast_chunking=True, code_file_extensions=None) + + # Then with custom extensions - should treat .js as code + chunks_with = create_text_chunks( + docs, use_ast_chunking=True, code_file_extensions=[".js", ".jsx"] + ) + + # Both should return chunks + assert len(chunks_without) > 0 + assert len(chunks_with) > 0 + + +class TestIntegrationWithDocumentRAG: + """Integration tests with the document RAG system.""" + + @pytest.fixture + def temp_code_dir(self): + """Create a temporary directory with sample code files.""" + with tempfile.TemporaryDirectory() as temp_dir: + temp_path = Path(temp_dir) + + # Create sample Python file + python_file = temp_path / "example.py" + python_file.write_text(''' +def fibonacci(n): + """Calculate fibonacci number.""" + if n <= 1: + return n + return fibonacci(n-1) + fibonacci(n-2) + +class MathUtils: + @staticmethod + def factorial(n): + if n <= 1: + return 1 + return n * MathUtils.factorial(n-1) +''') + + # Create sample text file + text_file = temp_path / "readme.txt" + text_file.write_text("This is a sample text file for testing purposes.") + + yield temp_path + + @pytest.mark.skipif( + os.environ.get("CI") == "true", + reason="Skip integration tests in CI to avoid dependency issues", + ) + def test_document_rag_with_ast_chunking(self, temp_code_dir): + """Test document RAG with AST chunking enabled.""" + with tempfile.TemporaryDirectory() as index_dir: + cmd = [ + sys.executable, + "apps/document_rag.py", + "--llm", + "simulated", + "--embedding-model", + "facebook/contriever", + "--embedding-mode", + "sentence-transformers", + "--index-dir", + index_dir, + "--data-dir", + str(temp_code_dir), + "--enable-code-chunking", + "--query", + "How does the fibonacci function work?", + ] + + env = os.environ.copy() + env["HF_HUB_DISABLE_SYMLINKS"] = "1" + env["TOKENIZERS_PARALLELISM"] = "false" + + try: + result = subprocess.run( + cmd, + capture_output=True, + text=True, + timeout=300, # 5 minutes + env=env, + ) + + # Should succeed even if astchunk is not available (fallback) + assert result.returncode == 0, f"Command failed: {result.stderr}" + + output = result.stdout + result.stderr + assert "Index saved to" in output or "Using existing index" in output + + except subprocess.TimeoutExpired: + pytest.skip("Test timed out - likely due to model download in CI") + + @pytest.mark.skipif( + os.environ.get("CI") == "true", + reason="Skip integration tests in CI to avoid dependency issues", + ) + def test_code_rag_application(self, temp_code_dir): + """Test the specialized code RAG application.""" + with tempfile.TemporaryDirectory() as index_dir: + cmd = [ + sys.executable, + "apps/code_rag.py", + "--llm", + "simulated", + "--embedding-model", + "facebook/contriever", + "--index-dir", + index_dir, + "--repo-dir", + str(temp_code_dir), + "--query", + "What classes are defined in this code?", + ] + + env = os.environ.copy() + env["HF_HUB_DISABLE_SYMLINKS"] = "1" + env["TOKENIZERS_PARALLELISM"] = "false" + + try: + result = subprocess.run(cmd, capture_output=True, text=True, timeout=300, env=env) + + # Should succeed + assert result.returncode == 0, f"Command failed: {result.stderr}" + + output = result.stdout + result.stderr + assert "Using AST-aware chunking" in output or "traditional chunking" in output + + except subprocess.TimeoutExpired: + pytest.skip("Test timed out - likely due to model download in CI") + 
+ +class TestErrorHandling: + """Test error handling and edge cases.""" + + def test_text_chunking_empty_documents(self): + """Test text chunking with empty document list.""" + chunks = create_text_chunks([]) + assert chunks == [] + + def test_text_chunking_invalid_parameters(self): + """Test text chunking with invalid parameters.""" + docs = [MockDocument("test content")] + + # Should handle negative chunk sizes gracefully + chunks = create_text_chunks( + docs, chunk_size=0, chunk_overlap=0, ast_chunk_size=0, ast_chunk_overlap=0 + ) + + # Should still return some result + assert isinstance(chunks, list) + + def test_create_ast_chunks_no_language(self): + """Test AST chunking with documents missing language metadata.""" + docs = [MockDocument("def test(): pass", "/test/script.py")] # No language set + + chunks = create_ast_chunks(docs) + + # Should fall back to traditional chunking + assert isinstance(chunks, list) + assert len(chunks) >= 0 # May be empty if fallback also fails + + def test_create_ast_chunks_empty_content(self): + """Test AST chunking with empty content.""" + docs = [MockDocument("", "/test/script.py", {"language": "python"})] + + chunks = create_ast_chunks(docs) + + # Should handle empty content gracefully + assert isinstance(chunks, list) + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/tests/test_document_rag.py b/tests/test_document_rag.py index ddd0bfd..091767a 100644 --- a/tests/test_document_rag.py +++ b/tests/test_document_rag.py @@ -57,6 +57,51 @@ def test_document_rag_simulated(test_data_dir): assert "This is a simulated answer" in output +@pytest.mark.skipif( + os.environ.get("CI") == "true", + reason="Skip AST chunking tests in CI to avoid dependency issues", +) +def test_document_rag_with_ast_chunking(test_data_dir): + """Test document_rag with AST-aware chunking enabled.""" + with tempfile.TemporaryDirectory() as temp_dir: + # Use a subdirectory that doesn't exist yet to force index creation + index_dir = Path(temp_dir) / "test_ast_index" + cmd = [ + sys.executable, + "apps/document_rag.py", + "--llm", + "simulated", + "--embedding-model", + "facebook/contriever", + "--embedding-mode", + "sentence-transformers", + "--index-dir", + str(index_dir), + "--data-dir", + str(test_data_dir), + "--enable-code-chunking", # Enable AST chunking + "--query", + "What is Pride and Prejudice about?", + ] + + env = os.environ.copy() + env["HF_HUB_DISABLE_SYMLINKS"] = "1" + env["TOKENIZERS_PARALLELISM"] = "false" + + result = subprocess.run(cmd, capture_output=True, text=True, timeout=600, env=env) + + # Check return code + assert result.returncode == 0, f"Command failed: {result.stderr}" + + # Verify output + output = result.stdout + result.stderr + assert "Index saved to" in output or "Using existing index" in output + assert "This is a simulated answer" in output + + # Should mention AST chunking if code files are present + # (might not be relevant for the test data, but command should succeed) + + @pytest.mark.skipif(not os.environ.get("OPENAI_API_KEY"), reason="OpenAI API key not available") @pytest.mark.skipif( os.environ.get("CI") == "true", reason="Skip OpenAI tests in CI to avoid API costs" diff --git a/uv.lock b/uv.lock index 38db529..f36903b 100644 --- a/uv.lock +++ b/uv.lock @@ -195,7 +195,7 @@ version = "0.1.4" source = { registry = "https://pypi.org/simple" } sdist = { url = "https://files.pythonhosted.org/packages/35/5d/752690df9ef5b76e169e68d6a129fa6d08a7100ca7f754c89495db3c6019/appnope-0.1.4.tar.gz", hash = 
"sha256:1de3860566df9caf38f01f86f65e0e13e379af54f9e4bee1e66b48f2efffd1ee", size = 4170, upload-time = "2024-02-06T09:43:11.258Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/81/29/5ecc3a15d5a33e31b26c11426c45c501e439cb865d0bff96315d86443b78/appnope-0.1.4-py2.py3-none-any.whl", hash = "sha256:502575ee11cd7a28c0205f379b525beefebab9d161b7c964670864014ed7213c", size = 4321, upload-time = "2024-02-06T09:43:09.663Z" }, + { url = "https://files.pythonhosted.org/packages/81/29/5ecc3a15d5a33e31b26c11426c45c501e439cb865d0bff96315d86443b78/appnope-0.1.4-py2.py3-none-any.whl", hash = "sha256:502575ee11cd7a28c0205f379b525beefebab9d161b7c964670864014ed7213c", size = 4321 }, ] [[package]] @@ -2058,7 +2058,7 @@ wheels = [ [[package]] name = "leann-backend-diskann" -version = "0.3.0" +version = "0.2.9" source = { editable = "packages/leann-backend-diskann" } dependencies = [ { name = "leann-core" }, @@ -2070,14 +2070,14 @@ dependencies = [ [package.metadata] requires-dist = [ - { name = "leann-core", specifier = "==0.3.0" }, + { name = "leann-core", specifier = "==0.2.9" }, { name = "numpy" }, { name = "protobuf", specifier = ">=3.19.0" }, ] [[package]] name = "leann-backend-hnsw" -version = "0.3.0" +version = "0.2.9" source = { editable = "packages/leann-backend-hnsw" } dependencies = [ { name = "leann-core" }, @@ -2090,7 +2090,7 @@ dependencies = [ [package.metadata] requires-dist = [ - { name = "leann-core", specifier = "==0.3.0" }, + { name = "leann-core", specifier = "==0.2.9" }, { name = "msgpack", specifier = ">=1.0.0" }, { name = "numpy" }, { name = "pyzmq", specifier = ">=23.0.0" }, @@ -2098,7 +2098,7 @@ requires-dist = [ [[package]] name = "leann-core" -version = "0.3.0" +version = "0.2.9" source = { editable = "packages/leann-core" } dependencies = [ { name = "accelerate" }, @@ -2164,6 +2164,7 @@ name = "leann-workspace" version = "0.1.0" source = { editable = "." 
} dependencies = [ + { name = "astchunk" }, { name = "boto3" }, { name = "colorama" }, { name = "datasets" }, @@ -2198,7 +2199,6 @@ dependencies = [ { name = "sglang" }, { name = "torch" }, { name = "tqdm" }, - { name = "typer" }, ] [package.optional-dependencies] @@ -2231,6 +2231,7 @@ test = [ [package.metadata] requires-dist = [ + { name = "astchunk", specifier = ">=0.1.0" }, { name = "beautifulsoup4", marker = "extra == 'documents'", specifier = ">=4.13.0" }, { name = "black", marker = "extra == 'dev'", specifier = ">=23.0" }, { name = "boto3" }, @@ -2280,7 +2281,6 @@ requires-dist = [ { name = "sglang" }, { name = "torch" }, { name = "tqdm" }, - { name = "typer", specifier = ">=0.12.3" }, ] provides-extras = ["dev", "test", "diskann", "documents"] @@ -4427,18 +4427,18 @@ version = "4.30.0" source = { registry = "https://pypi.org/simple" } sdist = { url = "https://files.pythonhosted.org/packages/a1/14/838b3ba247a0ba92e4df5d23f2bea9478edcfd72b78a39d6ca36ccd84ad2/pypdfium2-4.30.0.tar.gz", hash = "sha256:48b5b7e5566665bc1015b9d69c1ebabe21f6aee468b509531c3c8318eeee2e16", size = 140239, upload-time = "2024-05-09T18:33:17.552Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/c7/9a/c8ff5cc352c1b60b0b97642ae734f51edbab6e28b45b4fcdfe5306ee3c83/pypdfium2-4.30.0-py3-none-macosx_10_13_x86_64.whl", hash = "sha256:b33ceded0b6ff5b2b93bc1fe0ad4b71aa6b7e7bd5875f1ca0cdfb6ba6ac01aab", size = 2837254, upload-time = "2024-05-09T18:32:48.653Z" }, - { url = "https://files.pythonhosted.org/packages/21/8b/27d4d5409f3c76b985f4ee4afe147b606594411e15ac4dc1c3363c9a9810/pypdfium2-4.30.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:4e55689f4b06e2d2406203e771f78789bd4f190731b5d57383d05cf611d829de", size = 2707624, upload-time = "2024-05-09T18:32:51.458Z" }, - { url = "https://files.pythonhosted.org/packages/11/63/28a73ca17c24b41a205d658e177d68e198d7dde65a8c99c821d231b6ee3d/pypdfium2-4.30.0-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4e6e50f5ce7f65a40a33d7c9edc39f23140c57e37144c2d6d9e9262a2a854854", size = 2793126, upload-time = "2024-05-09T18:32:53.581Z" }, - { url = "https://files.pythonhosted.org/packages/d1/96/53b3ebf0955edbd02ac6da16a818ecc65c939e98fdeb4e0958362bd385c8/pypdfium2-4.30.0-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3d0dd3ecaffd0b6dbda3da663220e705cb563918249bda26058c6036752ba3a2", size = 2591077, upload-time = "2024-05-09T18:32:55.99Z" }, - { url = "https://files.pythonhosted.org/packages/ec/ee/0394e56e7cab8b5b21f744d988400948ef71a9a892cbeb0b200d324ab2c7/pypdfium2-4.30.0-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cc3bf29b0db8c76cdfaac1ec1cde8edf211a7de7390fbf8934ad2aa9b4d6dfad", size = 2864431, upload-time = "2024-05-09T18:32:57.911Z" }, - { url = "https://files.pythonhosted.org/packages/65/cd/3f1edf20a0ef4a212a5e20a5900e64942c5a374473671ac0780eaa08ea80/pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f1f78d2189e0ddf9ac2b7a9b9bd4f0c66f54d1389ff6c17e9fd9dc034d06eb3f", size = 2812008, upload-time = "2024-05-09T18:32:59.886Z" }, - { url = "https://files.pythonhosted.org/packages/c8/91/2d517db61845698f41a2a974de90762e50faeb529201c6b3574935969045/pypdfium2-4.30.0-py3-none-musllinux_1_1_aarch64.whl", hash = "sha256:5eda3641a2da7a7a0b2f4dbd71d706401a656fea521b6b6faa0675b15d31a163", size = 6181543, upload-time = "2024-05-09T18:33:02.597Z" }, - { url = 
"https://files.pythonhosted.org/packages/ba/c4/ed1315143a7a84b2c7616569dfb472473968d628f17c231c39e29ae9d780/pypdfium2-4.30.0-py3-none-musllinux_1_1_i686.whl", hash = "sha256:0dfa61421b5eb68e1188b0b2231e7ba35735aef2d867d86e48ee6cab6975195e", size = 6175911, upload-time = "2024-05-09T18:33:05.376Z" }, - { url = "https://files.pythonhosted.org/packages/7a/c4/9e62d03f414e0e3051c56d5943c3bf42aa9608ede4e19dc96438364e9e03/pypdfium2-4.30.0-py3-none-musllinux_1_1_x86_64.whl", hash = "sha256:f33bd79e7a09d5f7acca3b0b69ff6c8a488869a7fab48fdf400fec6e20b9c8be", size = 6267430, upload-time = "2024-05-09T18:33:08.067Z" }, - { url = "https://files.pythonhosted.org/packages/90/47/eda4904f715fb98561e34012826e883816945934a851745570521ec89520/pypdfium2-4.30.0-py3-none-win32.whl", hash = "sha256:ee2410f15d576d976c2ab2558c93d392a25fb9f6635e8dd0a8a3a5241b275e0e", size = 2775951, upload-time = "2024-05-09T18:33:10.567Z" }, - { url = "https://files.pythonhosted.org/packages/25/bd/56d9ec6b9f0fc4e0d95288759f3179f0fcd34b1a1526b75673d2f6d5196f/pypdfium2-4.30.0-py3-none-win_amd64.whl", hash = "sha256:90dbb2ac07be53219f56be09961eb95cf2473f834d01a42d901d13ccfad64b4c", size = 2892098, upload-time = "2024-05-09T18:33:13.107Z" }, - { url = "https://files.pythonhosted.org/packages/be/7a/097801205b991bc3115e8af1edb850d30aeaf0118520b016354cf5ccd3f6/pypdfium2-4.30.0-py3-none-win_arm64.whl", hash = "sha256:119b2969a6d6b1e8d55e99caaf05290294f2d0fe49c12a3f17102d01c441bd29", size = 2752118, upload-time = "2024-05-09T18:33:15.489Z" }, + { url = "https://files.pythonhosted.org/packages/c7/9a/c8ff5cc352c1b60b0b97642ae734f51edbab6e28b45b4fcdfe5306ee3c83/pypdfium2-4.30.0-py3-none-macosx_10_13_x86_64.whl", hash = "sha256:b33ceded0b6ff5b2b93bc1fe0ad4b71aa6b7e7bd5875f1ca0cdfb6ba6ac01aab", size = 2837254 }, + { url = "https://files.pythonhosted.org/packages/21/8b/27d4d5409f3c76b985f4ee4afe147b606594411e15ac4dc1c3363c9a9810/pypdfium2-4.30.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:4e55689f4b06e2d2406203e771f78789bd4f190731b5d57383d05cf611d829de", size = 2707624 }, + { url = "https://files.pythonhosted.org/packages/11/63/28a73ca17c24b41a205d658e177d68e198d7dde65a8c99c821d231b6ee3d/pypdfium2-4.30.0-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4e6e50f5ce7f65a40a33d7c9edc39f23140c57e37144c2d6d9e9262a2a854854", size = 2793126 }, + { url = "https://files.pythonhosted.org/packages/d1/96/53b3ebf0955edbd02ac6da16a818ecc65c939e98fdeb4e0958362bd385c8/pypdfium2-4.30.0-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3d0dd3ecaffd0b6dbda3da663220e705cb563918249bda26058c6036752ba3a2", size = 2591077 }, + { url = "https://files.pythonhosted.org/packages/ec/ee/0394e56e7cab8b5b21f744d988400948ef71a9a892cbeb0b200d324ab2c7/pypdfium2-4.30.0-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cc3bf29b0db8c76cdfaac1ec1cde8edf211a7de7390fbf8934ad2aa9b4d6dfad", size = 2864431 }, + { url = "https://files.pythonhosted.org/packages/65/cd/3f1edf20a0ef4a212a5e20a5900e64942c5a374473671ac0780eaa08ea80/pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f1f78d2189e0ddf9ac2b7a9b9bd4f0c66f54d1389ff6c17e9fd9dc034d06eb3f", size = 2812008 }, + { url = "https://files.pythonhosted.org/packages/c8/91/2d517db61845698f41a2a974de90762e50faeb529201c6b3574935969045/pypdfium2-4.30.0-py3-none-musllinux_1_1_aarch64.whl", hash = "sha256:5eda3641a2da7a7a0b2f4dbd71d706401a656fea521b6b6faa0675b15d31a163", size = 6181543 }, + { url = 
"https://files.pythonhosted.org/packages/ba/c4/ed1315143a7a84b2c7616569dfb472473968d628f17c231c39e29ae9d780/pypdfium2-4.30.0-py3-none-musllinux_1_1_i686.whl", hash = "sha256:0dfa61421b5eb68e1188b0b2231e7ba35735aef2d867d86e48ee6cab6975195e", size = 6175911 }, + { url = "https://files.pythonhosted.org/packages/7a/c4/9e62d03f414e0e3051c56d5943c3bf42aa9608ede4e19dc96438364e9e03/pypdfium2-4.30.0-py3-none-musllinux_1_1_x86_64.whl", hash = "sha256:f33bd79e7a09d5f7acca3b0b69ff6c8a488869a7fab48fdf400fec6e20b9c8be", size = 6267430 }, + { url = "https://files.pythonhosted.org/packages/90/47/eda4904f715fb98561e34012826e883816945934a851745570521ec89520/pypdfium2-4.30.0-py3-none-win32.whl", hash = "sha256:ee2410f15d576d976c2ab2558c93d392a25fb9f6635e8dd0a8a3a5241b275e0e", size = 2775951 }, + { url = "https://files.pythonhosted.org/packages/25/bd/56d9ec6b9f0fc4e0d95288759f3179f0fcd34b1a1526b75673d2f6d5196f/pypdfium2-4.30.0-py3-none-win_amd64.whl", hash = "sha256:90dbb2ac07be53219f56be09961eb95cf2473f834d01a42d901d13ccfad64b4c", size = 2892098 }, + { url = "https://files.pythonhosted.org/packages/be/7a/097801205b991bc3115e8af1edb850d30aeaf0118520b016354cf5ccd3f6/pypdfium2-4.30.0-py3-none-win_arm64.whl", hash = "sha256:119b2969a6d6b1e8d55e99caaf05290294f2d0fe49c12a3f17102d01c441bd29", size = 2752118 }, ] [[package]] @@ -5824,7 +5824,7 @@ dependencies = [ ] sdist = { url = "https://files.pythonhosted.org/packages/fb/4b/3341d2fade52634d877476f4ed5fa8f7bf3f1e867bfba76f0fb341e2885f/transformers-4.54.0.tar.gz", hash = "sha256:843da4d66a573cef3d1b2e7a1d767e77da054621e69d9f3faff761e55a1f8203", size = 9510412, upload-time = "2025-07-25T18:58:20.826Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/cc/34/4d82dc596764de9d14285f8ed53b50896bf05fbbcd71a82c6d174b3ab8c7/transformers-4.54.0-py3-none-any.whl", hash = "sha256:c96e607f848625965b76c677b2c2576f2c7b7097c1c5292b281919d90675a25e", size = 11176597, upload-time = "2025-07-25T18:58:17.677Z" }, + { url = "https://files.pythonhosted.org/packages/cc/34/4d82dc596764de9d14285f8ed53b50896bf05fbbcd71a82c6d174b3ab8c7/transformers-4.54.0-py3-none-any.whl", hash = "sha256:c96e607f848625965b76c677b2c2576f2c7b7097c1c5292b281919d90675a25e", size = 11176597 }, ] [[package]] @@ -5835,28 +5835,12 @@ dependencies = [ { name = "setuptools" }, ] wheels = [ - { url = "https://files.pythonhosted.org/packages/8d/a9/549e51e9b1b2c9b854fd761a1d23df0ba2fbc60bd0c13b489ffa518cfcb7/triton-3.3.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b74db445b1c562844d3cfad6e9679c72e93fdfb1a90a24052b03bb5c49d1242e", size = 155600257, upload-time = "2025-05-29T23:39:36.085Z" }, - { url = "https://files.pythonhosted.org/packages/21/2f/3e56ea7b58f80ff68899b1dbe810ff257c9d177d288c6b0f55bf2fe4eb50/triton-3.3.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b31e3aa26f8cb3cc5bf4e187bf737cbacf17311e1112b781d4a059353dfd731b", size = 155689937, upload-time = "2025-05-29T23:39:44.182Z" }, - { url = "https://files.pythonhosted.org/packages/24/5f/950fb373bf9c01ad4eb5a8cd5eaf32cdf9e238c02f9293557a2129b9c4ac/triton-3.3.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9999e83aba21e1a78c1f36f21bce621b77bcaa530277a50484a7cb4a822f6e43", size = 155669138, upload-time = "2025-05-29T23:39:51.771Z" }, - { url = "https://files.pythonhosted.org/packages/74/1f/dfb531f90a2d367d914adfee771babbd3f1a5b26c3f5fbc458dee21daa78/triton-3.3.1-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = 
"sha256:b89d846b5a4198317fec27a5d3a609ea96b6d557ff44b56c23176546023c4240", size = 155673035, upload-time = "2025-05-29T23:40:02.468Z" }, - { url = "https://files.pythonhosted.org/packages/28/71/bd20ffcb7a64c753dc2463489a61bf69d531f308e390ad06390268c4ea04/triton-3.3.1-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a3198adb9d78b77818a5388bff89fa72ff36f9da0bc689db2f0a651a67ce6a42", size = 155735832, upload-time = "2025-05-29T23:40:10.522Z" }, - { url = "https://files.pythonhosted.org/packages/6d/81/ac4d50af22f594c4cb7c84fd2ad5ba1e0c03e2a83fe3483ddd79edcd7ec7/triton-3.3.1-cp39-cp39-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f6139aeb04a146b0b8e0fbbd89ad1e65861c57cfed881f21d62d3cb94a36bab7", size = 155596799, upload-time = "2025-05-29T23:40:18.949Z" }, -] - -[[package]] -name = "typer" -version = "0.16.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "click", version = "8.1.8", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, - { name = "click", version = "8.2.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, - { name = "rich" }, - { name = "shellingham" }, - { name = "typing-extensions" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/c5/8c/7d682431efca5fd290017663ea4588bf6f2c6aad085c7f108c5dbc316e70/typer-0.16.0.tar.gz", hash = "sha256:af377ffaee1dbe37ae9440cb4e8f11686ea5ce4e9bae01b84ae7c63b87f1dd3b", size = 102625, upload-time = "2025-05-26T14:30:31.824Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/76/42/3efaf858001d2c2913de7f354563e3a3a2f0decae3efe98427125a8f441e/typer-0.16.0-py3-none-any.whl", hash = "sha256:1f79bed11d4d02d4310e3c1b7ba594183bcedb0ac73b27a9e5f28f6fb5b98855", size = 46317, upload-time = "2025-05-26T14:30:30.523Z" }, + { url = "https://files.pythonhosted.org/packages/8d/a9/549e51e9b1b2c9b854fd761a1d23df0ba2fbc60bd0c13b489ffa518cfcb7/triton-3.3.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b74db445b1c562844d3cfad6e9679c72e93fdfb1a90a24052b03bb5c49d1242e", size = 155600257 }, + { url = "https://files.pythonhosted.org/packages/21/2f/3e56ea7b58f80ff68899b1dbe810ff257c9d177d288c6b0f55bf2fe4eb50/triton-3.3.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b31e3aa26f8cb3cc5bf4e187bf737cbacf17311e1112b781d4a059353dfd731b", size = 155689937 }, + { url = "https://files.pythonhosted.org/packages/24/5f/950fb373bf9c01ad4eb5a8cd5eaf32cdf9e238c02f9293557a2129b9c4ac/triton-3.3.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9999e83aba21e1a78c1f36f21bce621b77bcaa530277a50484a7cb4a822f6e43", size = 155669138 }, + { url = "https://files.pythonhosted.org/packages/74/1f/dfb531f90a2d367d914adfee771babbd3f1a5b26c3f5fbc458dee21daa78/triton-3.3.1-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b89d846b5a4198317fec27a5d3a609ea96b6d557ff44b56c23176546023c4240", size = 155673035 }, + { url = "https://files.pythonhosted.org/packages/28/71/bd20ffcb7a64c753dc2463489a61bf69d531f308e390ad06390268c4ea04/triton-3.3.1-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a3198adb9d78b77818a5388bff89fa72ff36f9da0bc689db2f0a651a67ce6a42", size = 155735832 }, + { url = "https://files.pythonhosted.org/packages/6d/81/ac4d50af22f594c4cb7c84fd2ad5ba1e0c03e2a83fe3483ddd79edcd7ec7/triton-3.3.1-cp39-cp39-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = 
"sha256:f6139aeb04a146b0b8e0fbbd89ad1e65861c57cfed881f21d62d3cb94a36bab7", size = 155596799 }, ] [[package]]