From e93c0dec6f671d96e815a6b4d213dcfa0aaae855 Mon Sep 17 00:00:00 2001
From: Andy Lee
Date: Wed, 17 Sep 2025 18:44:00 -0700
Subject: [PATCH] [Fix] Enable AST chunking when installed (package chunking utils) (#101)

* fix(core): package chunking utils for AST chunking; re-export in apps; CLI imports packaged utils

* style

* chore: fix ruff warnings (RUF059, F401)

* style
---
 apps/chunking/__init__.py                      |  44 ++++--
 apps/history_data/history.py                   |   2 +-
 .../leann-core/src/leann/chunking_utils.py     | 140 +++---------------
 packages/leann-core/src/leann/cli.py           |  14 +-
 4 files changed, 59 insertions(+), 141 deletions(-)
 rename apps/chunking/utils.py => packages/leann-core/src/leann/chunking_utils.py (56%)

diff --git a/apps/chunking/__init__.py b/apps/chunking/__init__.py
index 3cd5c0d..2f323e3 100644
--- a/apps/chunking/__init__.py
+++ b/apps/chunking/__init__.py
@@ -1,16 +1,38 @@
-"""
-Chunking utilities for LEANN RAG applications.
-Provides AST-aware and traditional text chunking functionality.
+"""Unified chunking utilities facade.
+
+This module re-exports the packaged utilities from `leann.chunking_utils` so
+that both repo apps (importing `chunking`) and installed wheels share a
+single implementation. When running from the repo without installation, it
+adds the `packages/leann-core/src` directory to `sys.path` as a fallback.
 """
 
-from .utils import (
-    CODE_EXTENSIONS,
-    create_ast_chunks,
-    create_text_chunks,
-    create_traditional_chunks,
-    detect_code_files,
-    get_language_from_extension,
-)
+import sys
+from pathlib import Path
+
+try:
+    from leann.chunking_utils import (
+        CODE_EXTENSIONS,
+        create_ast_chunks,
+        create_text_chunks,
+        create_traditional_chunks,
+        detect_code_files,
+        get_language_from_extension,
+    )
+except Exception:  # pragma: no cover - best-effort fallback for dev environment
+    repo_root = Path(__file__).resolve().parents[2]
+    leann_src = repo_root / "packages" / "leann-core" / "src"
+    if leann_src.exists():
+        sys.path.insert(0, str(leann_src))
+        from leann.chunking_utils import (
+            CODE_EXTENSIONS,
+            create_ast_chunks,
+            create_text_chunks,
+            create_traditional_chunks,
+            detect_code_files,
+            get_language_from_extension,
+        )
+    else:
+        raise
 
 __all__ = [
     "CODE_EXTENSIONS",
diff --git a/apps/history_data/history.py b/apps/history_data/history.py
index bb2eac1..f7e23c7 100644
--- a/apps/history_data/history.py
+++ b/apps/history_data/history.py
@@ -74,7 +74,7 @@ class ChromeHistoryReader(BaseReader):
                 if count >= max_count and max_count > 0:
                     break
 
-                last_visit, url, title, visit_count, typed_count, hidden = row
+                last_visit, url, title, visit_count, typed_count, _hidden = row
 
                 # Create document content with metadata embedded in text
                 doc_content = f"""
diff --git a/apps/chunking/utils.py b/packages/leann-core/src/leann/chunking_utils.py
similarity index 56%
rename from apps/chunking/utils.py
rename to packages/leann-core/src/leann/chunking_utils.py
index 9a19c63..db80a39 100644
--- a/apps/chunking/utils.py
+++ b/packages/leann-core/src/leann/chunking_utils.py
@@ -1,6 +1,6 @@
 """
 Enhanced chunking utilities with AST-aware code chunking support.
-Provides unified interface for both traditional and AST-based text chunking.
+Packaged within leann-core so installed wheels can import it reliably.
""" import logging @@ -22,30 +22,9 @@ CODE_EXTENSIONS = { ".jsx": "typescript", } -# Default chunk parameters for different content types -DEFAULT_CHUNK_PARAMS = { - "code": { - "max_chunk_size": 512, - "chunk_overlap": 64, - }, - "text": { - "chunk_size": 256, - "chunk_overlap": 128, - }, -} - def detect_code_files(documents, code_extensions=None) -> tuple[list, list]: - """ - Separate documents into code files and regular text files. - - Args: - documents: List of LlamaIndex Document objects - code_extensions: Dict mapping file extensions to languages (defaults to CODE_EXTENSIONS) - - Returns: - Tuple of (code_documents, text_documents) - """ + """Separate documents into code files and regular text files.""" if code_extensions is None: code_extensions = CODE_EXTENSIONS @@ -53,16 +32,10 @@ def detect_code_files(documents, code_extensions=None) -> tuple[list, list]: text_docs = [] for doc in documents: - # Get file path from metadata - file_path = doc.metadata.get("file_path", "") - if not file_path: - # Fallback to file_name - file_path = doc.metadata.get("file_name", "") - + file_path = doc.metadata.get("file_path", "") or doc.metadata.get("file_name", "") if file_path: file_ext = Path(file_path).suffix.lower() if file_ext in code_extensions: - # Add language info to metadata doc.metadata["language"] = code_extensions[file_ext] doc.metadata["is_code"] = True code_docs.append(doc) @@ -70,7 +43,6 @@ def detect_code_files(documents, code_extensions=None) -> tuple[list, list]: doc.metadata["is_code"] = False text_docs.append(doc) else: - # If no file path, treat as text doc.metadata["is_code"] = False text_docs.append(doc) @@ -79,7 +51,7 @@ def detect_code_files(documents, code_extensions=None) -> tuple[list, list]: def get_language_from_extension(file_path: str) -> Optional[str]: - """Get the programming language from file extension.""" + """Return language string from a filename/extension using CODE_EXTENSIONS.""" ext = Path(file_path).suffix.lower() return CODE_EXTENSIONS.get(ext) @@ -90,40 +62,26 @@ def create_ast_chunks( chunk_overlap: int = 64, metadata_template: str = "default", ) -> list[str]: - """ - Create AST-aware chunks from code documents using astchunk. + """Create AST-aware chunks from code documents using astchunk. - Args: - documents: List of code documents - max_chunk_size: Maximum characters per chunk - chunk_overlap: Number of AST nodes to overlap between chunks - metadata_template: Template for chunk metadata - - Returns: - List of text chunks with preserved code structure + Falls back to traditional chunking if astchunk is unavailable. 
""" try: - from astchunk import ASTChunkBuilder + from astchunk import ASTChunkBuilder # optional dependency except ImportError as e: logger.error(f"astchunk not available: {e}") logger.info("Falling back to traditional chunking for code files") return create_traditional_chunks(documents, max_chunk_size, chunk_overlap) all_chunks = [] - for doc in documents: - # Get language from metadata (set by detect_code_files) language = doc.metadata.get("language") if not language: - logger.warning( - "No language detected for document, falling back to traditional chunking" - ) - traditional_chunks = create_traditional_chunks([doc], max_chunk_size, chunk_overlap) - all_chunks.extend(traditional_chunks) + logger.warning("No language detected; falling back to traditional chunking") + all_chunks.extend(create_traditional_chunks([doc], max_chunk_size, chunk_overlap)) continue try: - # Configure astchunk configs = { "max_chunk_size": max_chunk_size, "language": language, @@ -131,7 +89,6 @@ def create_ast_chunks( "chunk_overlap": chunk_overlap if chunk_overlap > 0 else 0, } - # Add repository-level metadata if available repo_metadata = { "file_path": doc.metadata.get("file_path", ""), "file_name": doc.metadata.get("file_name", ""), @@ -140,17 +97,13 @@ def create_ast_chunks( } configs["repo_level_metadata"] = repo_metadata - # Create chunk builder and process chunk_builder = ASTChunkBuilder(**configs) code_content = doc.get_content() - if not code_content or not code_content.strip(): logger.warning("Empty code content, skipping") continue chunks = chunk_builder.chunkify(code_content) - - # Extract text content from chunks for chunk in chunks: if hasattr(chunk, "text"): chunk_text = chunk.text @@ -159,7 +112,6 @@ def create_ast_chunks( elif isinstance(chunk, str): chunk_text = chunk else: - # Try to convert to string chunk_text = str(chunk) if chunk_text and chunk_text.strip(): @@ -168,12 +120,10 @@ def create_ast_chunks( logger.info( f"Created {len(chunks)} AST chunks from {language} file: {doc.metadata.get('file_name', 'unknown')}" ) - except Exception as e: logger.warning(f"AST chunking failed for {language} file: {e}") logger.info("Falling back to traditional chunking") - traditional_chunks = create_traditional_chunks([doc], max_chunk_size, chunk_overlap) - all_chunks.extend(traditional_chunks) + all_chunks.extend(create_traditional_chunks([doc], max_chunk_size, chunk_overlap)) return all_chunks @@ -181,23 +131,10 @@ def create_ast_chunks( def create_traditional_chunks( documents, chunk_size: int = 256, chunk_overlap: int = 128 ) -> list[str]: - """ - Create traditional text chunks using LlamaIndex SentenceSplitter. 
-
-    Args:
-        documents: List of documents to chunk
-        chunk_size: Size of each chunk in characters
-        chunk_overlap: Overlap between chunks
-
-    Returns:
-        List of text chunks
-    """
-    # Handle invalid chunk_size values
+    """Create traditional text chunks using LlamaIndex SentenceSplitter."""
     if chunk_size <= 0:
         logger.warning(f"Invalid chunk_size={chunk_size}, using default value of 256")
         chunk_size = 256
-
-    # Ensure chunk_overlap is not negative and not larger than chunk_size
     if chunk_overlap < 0:
         chunk_overlap = 0
     if chunk_overlap >= chunk_size:
@@ -215,12 +152,9 @@ def create_traditional_chunks(
         try:
             nodes = node_parser.get_nodes_from_documents([doc])
             if nodes:
-                chunk_texts = [node.get_content() for node in nodes]
-                all_texts.extend(chunk_texts)
-                logger.debug(f"Created {len(chunk_texts)} traditional chunks from document")
+                all_texts.extend(node.get_content() for node in nodes)
         except Exception as e:
             logger.error(f"Traditional chunking failed for document: {e}")
-            # As last resort, add the raw content
             content = doc.get_content()
             if content and content.strip():
                 all_texts.append(content.strip())
@@ -238,32 +172,13 @@ def create_text_chunks(
     code_file_extensions: Optional[list[str]] = None,
     ast_fallback_traditional: bool = True,
 ) -> list[str]:
-    """
-    Create text chunks from documents with optional AST support for code files.
-
-    Args:
-        documents: List of LlamaIndex Document objects
-        chunk_size: Size for traditional text chunks
-        chunk_overlap: Overlap for traditional text chunks
-        use_ast_chunking: Whether to use AST chunking for code files
-        ast_chunk_size: Size for AST chunks
-        ast_chunk_overlap: Overlap for AST chunks
-        code_file_extensions: Custom list of code file extensions
-        ast_fallback_traditional: Fall back to traditional chunking on AST errors
-
-    Returns:
-        List of text chunks
-    """
+    """Create text chunks from documents with optional AST support for code files."""
     if not documents:
         logger.warning("No documents provided for chunking")
         return []
 
-    # Create a local copy of supported extensions for this function call
     local_code_extensions = CODE_EXTENSIONS.copy()
-
-    # Update supported extensions if provided
     if code_file_extensions:
-        # Map extensions to languages (simplified mapping)
         ext_mapping = {
             ".py": "python",
             ".java": "java",
@@ -273,47 +188,32 @@ def create_text_chunks(
         }
         for ext in code_file_extensions:
             if ext.lower() not in local_code_extensions:
-                # Try to guess language from extension
                 if ext.lower() in ext_mapping:
                     local_code_extensions[ext.lower()] = ext_mapping[ext.lower()]
                 else:
                     logger.warning(f"Unsupported extension {ext}, will use traditional chunking")
 
     all_chunks = []
-
     if use_ast_chunking:
-        # Separate code and text documents using local extensions
        code_docs, text_docs = detect_code_files(documents, local_code_extensions)
         if code_docs:
-            logger.info(f"Processing {len(code_docs)} code files with AST chunking")
             try:
-                ast_chunks = create_ast_chunks(
-                    code_docs, max_chunk_size=ast_chunk_size, chunk_overlap=ast_chunk_overlap
+                all_chunks.extend(
+                    create_ast_chunks(
+                        code_docs, max_chunk_size=ast_chunk_size, chunk_overlap=ast_chunk_overlap
+                    )
                 )
-                all_chunks.extend(ast_chunks)
-                logger.info(f"Created {len(ast_chunks)} AST chunks from code files")
             except Exception as e:
                 logger.error(f"AST chunking failed: {e}")
                 if ast_fallback_traditional:
-                    logger.info("Falling back to traditional chunking for code files")
-                    traditional_code_chunks = create_traditional_chunks(
-                        code_docs, chunk_size, chunk_overlap
+                    all_chunks.extend(
+                        create_traditional_chunks(code_docs, chunk_size, chunk_overlap)
                     )
-                    all_chunks.extend(traditional_code_chunks)
                 else:
                     raise
-
-        # Process text files with traditional chunking
         if text_docs:
-            logger.info(f"Processing {len(text_docs)} text files with traditional chunking")
-            text_chunks = create_traditional_chunks(text_docs, chunk_size, chunk_overlap)
-            all_chunks.extend(text_chunks)
-            logger.info(f"Created {len(text_chunks)} traditional chunks from text files")
+            all_chunks.extend(create_traditional_chunks(text_docs, chunk_size, chunk_overlap))
     else:
-        # Use traditional chunking for all files
-        logger.info(f"Processing {len(documents)} documents with traditional chunking")
         all_chunks = create_traditional_chunks(documents, chunk_size, chunk_overlap)
 
     logger.info(f"Total chunks created: {len(all_chunks)}")
diff --git a/packages/leann-core/src/leann/cli.py b/packages/leann-core/src/leann/cli.py
index caad276..2d514e2 100644
--- a/packages/leann-core/src/leann/cli.py
+++ b/packages/leann-core/src/leann/cli.py
@@ -1,6 +1,5 @@
 import argparse
 import asyncio
-import sys
 from pathlib import Path
 from typing import Any, Optional, Union
 
@@ -1216,13 +1215,8 @@ Examples:
         if use_ast:
             print("🧠 Using AST-aware chunking for code files")
             try:
-                # Import enhanced chunking utilities
-                # Add apps directory to path to import chunking utilities
-                apps_dir = Path(__file__).parent.parent.parent.parent.parent / "apps"
-                if apps_dir.exists():
-                    sys.path.insert(0, str(apps_dir))
-
-                from chunking import create_text_chunks
+                # Import enhanced chunking utilities from packaged module
+                from .chunking_utils import create_text_chunks
 
                 # Use enhanced chunking with AST support
                 all_texts = create_text_chunks(
@@ -1237,7 +1231,9 @@ Examples:
                 )
 
             except ImportError as e:
-                print(f"⚠️ AST chunking not available ({e}), falling back to traditional chunking")
+                print(
+                    f"⚠️ AST chunking utilities not available in package ({e}), falling back to traditional chunking"
+                )
                 use_ast = False
 
         if not use_ast:
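
Reviewer note (not part of the patch): with the utilities packaged under
`leann.chunking_utils`, an installed wheel can drive AST-aware chunking
directly, with no `sys.path` manipulation. Below is a minimal sketch,
assuming `leann-core` and `llama-index-core` are installed and that
`create_text_chunks` keeps the keyword arguments shown in the diff; the
file name `sample.py` is made up for illustration.

    # Sketch: exercise the packaged chunking utilities end to end.
    from llama_index.core import Document

    from leann.chunking_utils import create_text_chunks

    # detect_code_files() routes documents by metadata["file_path"] /
    # metadata["file_name"], so the .py extension below selects the AST path.
    docs = [
        Document(
            text="def add(a, b):\n    return a + b\n",
            metadata={"file_path": "sample.py", "file_name": "sample.py"},
        )
    ]

    # With use_ast_chunking=True, code files go through astchunk when it is
    # installed; otherwise the utilities fall back to traditional chunking.
    chunks = create_text_chunks(docs, use_ast_chunking=True)
    print(f"created {len(chunks)} chunks")

Keeping the implementation in `leann.chunking_utils` and reducing
`apps/chunking/__init__.py` to a thin facade means the CLI, the repo apps,
and installed wheels all exercise the same chunking code path.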