From e93c0dec6f671d96e815a6b4d213dcfa0aaae855 Mon Sep 17 00:00:00 2001
From: Andy Lee
Date: Wed, 17 Sep 2025 18:44:00 -0700
Subject: [PATCH] [Fix] Enable AST chunking when installed (package chunking utils) (#101)

* fix(core): package chunking utils for AST chunking; re-export in apps; CLI imports packaged utils

* style

* chore: fix ruff warnings (RUF059, F401)

* style
---
 apps/chunking/__init__.py                      |  44 ++++--
 apps/history_data/history.py                   |   2 +-
 .../leann-core/src/leann/chunking_utils.py     | 140 +++---------------
 packages/leann-core/src/leann/cli.py           |  14 +-
 4 files changed, 59 insertions(+), 141 deletions(-)
 rename apps/chunking/utils.py => packages/leann-core/src/leann/chunking_utils.py (56%)

diff --git a/apps/chunking/__init__.py b/apps/chunking/__init__.py
index 3cd5c0d..2f323e3 100644
--- a/apps/chunking/__init__.py
+++ b/apps/chunking/__init__.py
@@ -1,16 +1,38 @@
-"""
-Chunking utilities for LEANN RAG applications.
-Provides AST-aware and traditional text chunking functionality.
+"""Unified chunking utilities facade.
+
+This module re-exports the packaged utilities from `leann.chunking_utils` so
+that both repo apps (importing `chunking`) and installed wheels share a
+single implementation. When running from the repo without installation, it
+adds the `packages/leann-core/src` directory to `sys.path` as a fallback.
 """
 
-from .utils import (
-    CODE_EXTENSIONS,
-    create_ast_chunks,
-    create_text_chunks,
-    create_traditional_chunks,
-    detect_code_files,
-    get_language_from_extension,
-)
+import sys
+from pathlib import Path
+
+try:
+    from leann.chunking_utils import (
+        CODE_EXTENSIONS,
+        create_ast_chunks,
+        create_text_chunks,
+        create_traditional_chunks,
+        detect_code_files,
+        get_language_from_extension,
+    )
+except Exception:  # pragma: no cover - best-effort fallback for dev environment
+    repo_root = Path(__file__).resolve().parents[2]
+    leann_src = repo_root / "packages" / "leann-core" / "src"
+    if leann_src.exists():
+        sys.path.insert(0, str(leann_src))
+        from leann.chunking_utils import (
+            CODE_EXTENSIONS,
+            create_ast_chunks,
+            create_text_chunks,
+            create_traditional_chunks,
+            detect_code_files,
+            get_language_from_extension,
+        )
+    else:
+        raise
 
 __all__ = [
     "CODE_EXTENSIONS",
diff --git a/apps/history_data/history.py b/apps/history_data/history.py
index bb2eac1..f7e23c7 100644
--- a/apps/history_data/history.py
+++ b/apps/history_data/history.py
@@ -74,7 +74,7 @@ class ChromeHistoryReader(BaseReader):
                 if count >= max_count and max_count > 0:
                     break
 
-                last_visit, url, title, visit_count, typed_count, hidden = row
+                last_visit, url, title, visit_count, typed_count, _hidden = row
 
                 # Create document content with metadata embedded in text
                 doc_content = f"""
diff --git a/apps/chunking/utils.py b/packages/leann-core/src/leann/chunking_utils.py
similarity index 56%
rename from apps/chunking/utils.py
rename to packages/leann-core/src/leann/chunking_utils.py
index 9a19c63..db80a39 100644
--- a/apps/chunking/utils.py
+++ b/packages/leann-core/src/leann/chunking_utils.py
@@ -1,6 +1,6 @@
 """
 Enhanced chunking utilities with AST-aware code chunking support.
-Provides unified interface for both traditional and AST-based text chunking.
+Packaged within leann-core so installed wheels can import it reliably.
""" import logging @@ -22,30 +22,9 @@ CODE_EXTENSIONS = { ".jsx": "typescript", } -# Default chunk parameters for different content types -DEFAULT_CHUNK_PARAMS = { - "code": { - "max_chunk_size": 512, - "chunk_overlap": 64, - }, - "text": { - "chunk_size": 256, - "chunk_overlap": 128, - }, -} - def detect_code_files(documents, code_extensions=None) -> tuple[list, list]: - """ - Separate documents into code files and regular text files. - - Args: - documents: List of LlamaIndex Document objects - code_extensions: Dict mapping file extensions to languages (defaults to CODE_EXTENSIONS) - - Returns: - Tuple of (code_documents, text_documents) - """ + """Separate documents into code files and regular text files.""" if code_extensions is None: code_extensions = CODE_EXTENSIONS @@ -53,16 +32,10 @@ def detect_code_files(documents, code_extensions=None) -> tuple[list, list]: text_docs = [] for doc in documents: - # Get file path from metadata - file_path = doc.metadata.get("file_path", "") - if not file_path: - # Fallback to file_name - file_path = doc.metadata.get("file_name", "") - + file_path = doc.metadata.get("file_path", "") or doc.metadata.get("file_name", "") if file_path: file_ext = Path(file_path).suffix.lower() if file_ext in code_extensions: - # Add language info to metadata doc.metadata["language"] = code_extensions[file_ext] doc.metadata["is_code"] = True code_docs.append(doc) @@ -70,7 +43,6 @@ def detect_code_files(documents, code_extensions=None) -> tuple[list, list]: doc.metadata["is_code"] = False text_docs.append(doc) else: - # If no file path, treat as text doc.metadata["is_code"] = False text_docs.append(doc) @@ -79,7 +51,7 @@ def detect_code_files(documents, code_extensions=None) -> tuple[list, list]: def get_language_from_extension(file_path: str) -> Optional[str]: - """Get the programming language from file extension.""" + """Return language string from a filename/extension using CODE_EXTENSIONS.""" ext = Path(file_path).suffix.lower() return CODE_EXTENSIONS.get(ext) @@ -90,40 +62,26 @@ def create_ast_chunks( chunk_overlap: int = 64, metadata_template: str = "default", ) -> list[str]: - """ - Create AST-aware chunks from code documents using astchunk. + """Create AST-aware chunks from code documents using astchunk. - Args: - documents: List of code documents - max_chunk_size: Maximum characters per chunk - chunk_overlap: Number of AST nodes to overlap between chunks - metadata_template: Template for chunk metadata - - Returns: - List of text chunks with preserved code structure + Falls back to traditional chunking if astchunk is unavailable. 
""" try: - from astchunk import ASTChunkBuilder + from astchunk import ASTChunkBuilder # optional dependency except ImportError as e: logger.error(f"astchunk not available: {e}") logger.info("Falling back to traditional chunking for code files") return create_traditional_chunks(documents, max_chunk_size, chunk_overlap) all_chunks = [] - for doc in documents: - # Get language from metadata (set by detect_code_files) language = doc.metadata.get("language") if not language: - logger.warning( - "No language detected for document, falling back to traditional chunking" - ) - traditional_chunks = create_traditional_chunks([doc], max_chunk_size, chunk_overlap) - all_chunks.extend(traditional_chunks) + logger.warning("No language detected; falling back to traditional chunking") + all_chunks.extend(create_traditional_chunks([doc], max_chunk_size, chunk_overlap)) continue try: - # Configure astchunk configs = { "max_chunk_size": max_chunk_size, "language": language, @@ -131,7 +89,6 @@ def create_ast_chunks( "chunk_overlap": chunk_overlap if chunk_overlap > 0 else 0, } - # Add repository-level metadata if available repo_metadata = { "file_path": doc.metadata.get("file_path", ""), "file_name": doc.metadata.get("file_name", ""), @@ -140,17 +97,13 @@ def create_ast_chunks( } configs["repo_level_metadata"] = repo_metadata - # Create chunk builder and process chunk_builder = ASTChunkBuilder(**configs) code_content = doc.get_content() - if not code_content or not code_content.strip(): logger.warning("Empty code content, skipping") continue chunks = chunk_builder.chunkify(code_content) - - # Extract text content from chunks for chunk in chunks: if hasattr(chunk, "text"): chunk_text = chunk.text @@ -159,7 +112,6 @@ def create_ast_chunks( elif isinstance(chunk, str): chunk_text = chunk else: - # Try to convert to string chunk_text = str(chunk) if chunk_text and chunk_text.strip(): @@ -168,12 +120,10 @@ def create_ast_chunks( logger.info( f"Created {len(chunks)} AST chunks from {language} file: {doc.metadata.get('file_name', 'unknown')}" ) - except Exception as e: logger.warning(f"AST chunking failed for {language} file: {e}") logger.info("Falling back to traditional chunking") - traditional_chunks = create_traditional_chunks([doc], max_chunk_size, chunk_overlap) - all_chunks.extend(traditional_chunks) + all_chunks.extend(create_traditional_chunks([doc], max_chunk_size, chunk_overlap)) return all_chunks @@ -181,23 +131,10 @@ def create_ast_chunks( def create_traditional_chunks( documents, chunk_size: int = 256, chunk_overlap: int = 128 ) -> list[str]: - """ - Create traditional text chunks using LlamaIndex SentenceSplitter. 
-
-    Args:
-        documents: List of documents to chunk
-        chunk_size: Size of each chunk in characters
-        chunk_overlap: Overlap between chunks
-
-    Returns:
-        List of text chunks
-    """
-    # Handle invalid chunk_size values
+    """Create traditional text chunks using LlamaIndex SentenceSplitter."""
     if chunk_size <= 0:
         logger.warning(f"Invalid chunk_size={chunk_size}, using default value of 256")
         chunk_size = 256
-
-    # Ensure chunk_overlap is not negative and not larger than chunk_size
     if chunk_overlap < 0:
         chunk_overlap = 0
     if chunk_overlap >= chunk_size:
@@ -215,12 +152,9 @@ def create_traditional_chunks(
         try:
             nodes = node_parser.get_nodes_from_documents([doc])
             if nodes:
-                chunk_texts = [node.get_content() for node in nodes]
-                all_texts.extend(chunk_texts)
-                logger.debug(f"Created {len(chunk_texts)} traditional chunks from document")
+                all_texts.extend(node.get_content() for node in nodes)
         except Exception as e:
             logger.error(f"Traditional chunking failed for document: {e}")
-            # As last resort, add the raw content
             content = doc.get_content()
             if content and content.strip():
                 all_texts.append(content.strip())
@@ -238,32 +172,13 @@ def create_text_chunks(
     code_file_extensions: Optional[list[str]] = None,
     ast_fallback_traditional: bool = True,
 ) -> list[str]:
-    """
-    Create text chunks from documents with optional AST support for code files.
-
-    Args:
-        documents: List of LlamaIndex Document objects
-        chunk_size: Size for traditional text chunks
-        chunk_overlap: Overlap for traditional text chunks
-        use_ast_chunking: Whether to use AST chunking for code files
-        ast_chunk_size: Size for AST chunks
-        ast_chunk_overlap: Overlap for AST chunks
-        code_file_extensions: Custom list of code file extensions
-        ast_fallback_traditional: Fall back to traditional chunking on AST errors
-
-    Returns:
-        List of text chunks
-    """
+    """Create text chunks from documents with optional AST support for code files."""
     if not documents:
         logger.warning("No documents provided for chunking")
         return []
 
-    # Create a local copy of supported extensions for this function call
     local_code_extensions = CODE_EXTENSIONS.copy()
-
-    # Update supported extensions if provided
     if code_file_extensions:
-        # Map extensions to languages (simplified mapping)
         ext_mapping = {
             ".py": "python",
             ".java": "java",
@@ -273,47 +188,32 @@ def create_text_chunks(
         }
         for ext in code_file_extensions:
             if ext.lower() not in local_code_extensions:
-                # Try to guess language from extension
                 if ext.lower() in ext_mapping:
                     local_code_extensions[ext.lower()] = ext_mapping[ext.lower()]
                 else:
                     logger.warning(f"Unsupported extension {ext}, will use traditional chunking")
 
     all_chunks = []
-
     if use_ast_chunking:
-        # Separate code and text documents using local extensions
        code_docs, text_docs = detect_code_files(documents, local_code_extensions)
         if code_docs:
-            logger.info(f"Processing {len(code_docs)} code files with AST chunking")
             try:
-                ast_chunks = create_ast_chunks(
-                    code_docs, max_chunk_size=ast_chunk_size, chunk_overlap=ast_chunk_overlap
+                all_chunks.extend(
+                    create_ast_chunks(
+                        code_docs, max_chunk_size=ast_chunk_size, chunk_overlap=ast_chunk_overlap
+                    )
                 )
-                all_chunks.extend(ast_chunks)
-                logger.info(f"Created {len(ast_chunks)} AST chunks from code files")
             except Exception as e:
                 logger.error(f"AST chunking failed: {e}")
                 if ast_fallback_traditional:
-                    logger.info("Falling back to traditional chunking for code files")
-                    traditional_code_chunks = create_traditional_chunks(
-                        code_docs, chunk_size, chunk_overlap
+                    all_chunks.extend(
+                        create_traditional_chunks(code_docs, chunk_size, chunk_overlap)
                     )
-                    all_chunks.extend(traditional_code_chunks)
                 else:
                     raise
-
-        # Process text files with traditional chunking
         if text_docs:
-            logger.info(f"Processing {len(text_docs)} text files with traditional chunking")
-            text_chunks = create_traditional_chunks(text_docs, chunk_size, chunk_overlap)
-            all_chunks.extend(text_chunks)
-            logger.info(f"Created {len(text_chunks)} traditional chunks from text files")
+            all_chunks.extend(create_traditional_chunks(text_docs, chunk_size, chunk_overlap))
     else:
-        # Use traditional chunking for all files
-        logger.info(f"Processing {len(documents)} documents with traditional chunking")
         all_chunks = create_traditional_chunks(documents, chunk_size, chunk_overlap)
 
     logger.info(f"Total chunks created: {len(all_chunks)}")
diff --git a/packages/leann-core/src/leann/cli.py b/packages/leann-core/src/leann/cli.py
index caad276..2d514e2 100644
--- a/packages/leann-core/src/leann/cli.py
+++ b/packages/leann-core/src/leann/cli.py
@@ -1,6 +1,5 @@
 import argparse
 import asyncio
-import sys
 from pathlib import Path
 from typing import Any, Optional, Union
 
@@ -1216,13 +1215,8 @@ Examples:
         if use_ast:
             print("🧠 Using AST-aware chunking for code files")
             try:
-                # Import enhanced chunking utilities
-                # Add apps directory to path to import chunking utilities
-                apps_dir = Path(__file__).parent.parent.parent.parent.parent / "apps"
-                if apps_dir.exists():
-                    sys.path.insert(0, str(apps_dir))
-
-                from chunking import create_text_chunks
+                # Import enhanced chunking utilities from packaged module
+                from .chunking_utils import create_text_chunks
 
                 # Use enhanced chunking with AST support
                 all_texts = create_text_chunks(
@@ -1237,7 +1231,9 @@ Examples:
                 )
 
             except ImportError as e:
-                print(f"⚠️ AST chunking not available ({e}), falling back to traditional chunking")
+                print(
+                    f"⚠️ AST chunking utilities not available in package ({e}), falling back to traditional chunking"
+                )
                 use_ast = False
 
         if not use_ast:
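
Reviewer note (not part of the patch): with the utilities packaged under
`leann.chunking_utils`, an installed wheel can drive AST-aware chunking
directly, with no `sys.path` manipulation. Below is a minimal sketch,
assuming `leann-core` and `llama-index-core` are installed and that
`create_text_chunks` keeps the keyword arguments shown in the diff; the
file name `sample.py` is made up for illustration.

    # Sketch: exercise the packaged chunking utilities end to end.
    from llama_index.core import Document

    from leann.chunking_utils import create_text_chunks

    # detect_code_files() routes documents by metadata["file_path"] /
    # metadata["file_name"], so the .py extension below selects the AST path.
    docs = [
        Document(
            text="def add(a, b):\n    return a + b\n",
            metadata={"file_path": "sample.py", "file_name": "sample.py"},
        )
    ]

    # With use_ast_chunking=True, code files go through astchunk when it is
    # installed; otherwise the utilities fall back to traditional chunking.
    chunks = create_text_chunks(docs, use_ast_chunking=True)
    print(f"created {len(chunks)} chunks")

Keeping the implementation in `leann.chunking_utils` and reducing
`apps/chunking/__init__.py` to a thin facade means the CLI, the repo apps,
and installed wheels all exercise the same chunking code path.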