diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml
new file mode 100644
index 0000000..b18bb62
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/bug_report.yml
@@ -0,0 +1,50 @@
+name: Bug Report
+description: Report a bug in LEANN
+labels: ["bug"]
+
+body:
+  - type: textarea
+    id: description
+    attributes:
+      label: What happened?
+      description: A clear description of the bug
+    validations:
+      required: true
+
+  - type: textarea
+    id: reproduce
+    attributes:
+      label: How to reproduce
+      placeholder: |
+        1. Install with...
+        2. Run command...
+        3. See error
+    validations:
+      required: true
+
+  - type: textarea
+    id: error
+    attributes:
+      label: Error message
+      description: Paste any error messages
+      render: shell
+
+  - type: input
+    id: version
+    attributes:
+      label: LEANN Version
+      placeholder: "0.1.0"
+    validations:
+      required: true
+
+  - type: dropdown
+    id: os
+    attributes:
+      label: Operating System
+      options:
+        - macOS
+        - Linux
+        - Windows
+        - Docker
+    validations:
+      required: true
diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml
new file mode 100644
index 0000000..ce0f898
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/config.yml
@@ -0,0 +1,8 @@
+blank_issues_enabled: true
+contact_links:
+  - name: Documentation
+    url: https://github.com/LEANN-RAG/LEANN-RAG/tree/main/docs
+    about: Read the docs first
+  - name: Discussions
+    url: https://github.com/LEANN-RAG/LEANN-RAG/discussions
+    about: Ask questions and share ideas
diff --git a/.github/ISSUE_TEMPLATE/feature_request.yml b/.github/ISSUE_TEMPLATE/feature_request.yml
new file mode 100644
index 0000000..0b01b1d
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/feature_request.yml
@@ -0,0 +1,27 @@
+name: Feature Request
+description: Suggest a new feature for LEANN
+labels: ["enhancement"]
+
+body:
+  - type: textarea
+    id: problem
+    attributes:
+      label: What problem does this solve?
+      description: Describe the problem or need
+    validations:
+      required: true
+
+  - type: textarea
+    id: solution
+    attributes:
+      label: Proposed solution
+      description: How would you like this to work?
+    validations:
+      required: true
+
+  - type: textarea
+    id: example
+    attributes:
+      label: Example usage
+      description: Show how the API might look
+      render: python
diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md
new file mode 100644
index 0000000..e482f32
--- /dev/null
+++ b/.github/pull_request_template.md
@@ -0,0 +1,13 @@
+## What does this PR do?
+
+
+
+## Related Issues
+
+Fixes #
+
+## Checklist
+
+- [ ] Tests pass (`uv run pytest`)
+- [ ] Code formatted (`ruff format` and `ruff check`)
+- [ ] Pre-commit hooks pass (`pre-commit run --all-files`)
diff --git a/.gitmodules b/.gitmodules
index aa2e98e..359164c 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -16,5 +16,4 @@
 	url = https://github.com/zeromq/libzmq.git
 [submodule "packages/astchunk-leann"]
 	path = packages/astchunk-leann
-	url = git@github.com:yichuan-w/astchunk-leann.git
-	branch = main
+	url = https://github.com/yichuan-w/astchunk-leann.git
diff --git a/apps/chunking/__init__.py b/apps/chunking/__init__.py
index 3cd5c0d..2f323e3 100644
--- a/apps/chunking/__init__.py
+++ b/apps/chunking/__init__.py
@@ -1,16 +1,38 @@
-"""
-Chunking utilities for LEANN RAG applications.
-Provides AST-aware and traditional text chunking functionality.
+"""Unified chunking utilities facade.
+
+This module re-exports the packaged utilities from `leann.chunking_utils` so
+that both repo apps (importing `chunking`) and installed wheels share a
+single implementation. When running from the repo without installation, it
+adds the `packages/leann-core/src` directory to `sys.path` as a fallback.
 """
 
-from .utils import (
-    CODE_EXTENSIONS,
-    create_ast_chunks,
-    create_text_chunks,
-    create_traditional_chunks,
-    detect_code_files,
-    get_language_from_extension,
-)
+import sys
+from pathlib import Path
+
+try:
+    from leann.chunking_utils import (
+        CODE_EXTENSIONS,
+        create_ast_chunks,
+        create_text_chunks,
+        create_traditional_chunks,
+        detect_code_files,
+        get_language_from_extension,
+    )
+except Exception:  # pragma: no cover - best-effort fallback for dev environment
+    repo_root = Path(__file__).resolve().parents[2]
+    leann_src = repo_root / "packages" / "leann-core" / "src"
+    if leann_src.exists():
+        sys.path.insert(0, str(leann_src))
+        from leann.chunking_utils import (
+            CODE_EXTENSIONS,
+            create_ast_chunks,
+            create_text_chunks,
+            create_traditional_chunks,
+            detect_code_files,
+            get_language_from_extension,
+        )
+    else:
+        raise
 
 __all__ = [
     "CODE_EXTENSIONS",
diff --git a/apps/history_data/history.py b/apps/history_data/history.py
index 2af29e8..f7e23c7 100644
--- a/apps/history_data/history.py
+++ b/apps/history_data/history.py
@@ -74,7 +74,7 @@ class ChromeHistoryReader(BaseReader):
                 if count >= max_count and max_count > 0:
                     break
 
-                last_visit, url, title, visit_count, typed_count, _ = row
+                last_visit, url, title, visit_count, typed_count, _hidden = row
 
                 # Create document content with metadata embedded in text
                 doc_content = f"""
diff --git a/packages/leann-backend-diskann/pyproject.toml b/packages/leann-backend-diskann/pyproject.toml
index 07be0ac..51c31d6 100644
--- a/packages/leann-backend-diskann/pyproject.toml
+++ b/packages/leann-backend-diskann/pyproject.toml
@@ -4,8 +4,8 @@ build-backend = "scikit_build_core.build"
 
 [project]
 name = "leann-backend-diskann"
-version = "0.3.3"
-dependencies = ["leann-core==0.3.3", "numpy", "protobuf>=3.19.0"]
+version = "0.3.4"
+dependencies = ["leann-core==0.3.4", "numpy", "protobuf>=3.19.0"]
 
 [tool.scikit-build]
 # Key: simplified CMake path
diff --git a/packages/leann-backend-hnsw/pyproject.toml b/packages/leann-backend-hnsw/pyproject.toml
index 3456ac8..2bfa307 100644
--- a/packages/leann-backend-hnsw/pyproject.toml
+++ b/packages/leann-backend-hnsw/pyproject.toml
@@ -6,10 +6,10 @@ build-backend = "scikit_build_core.build"
 
 [project]
 name = "leann-backend-hnsw"
-version = "0.3.3"
+version = "0.3.4"
 description = "Custom-built HNSW (Faiss) backend for the Leann toolkit."
 dependencies = [
-    "leann-core==0.3.3",
+    "leann-core==0.3.4",
     "numpy",
     "pyzmq>=23.0.0",
     "msgpack>=1.0.0",
diff --git a/packages/leann-core/pyproject.toml b/packages/leann-core/pyproject.toml
index 82a65d9..71f6bb7 100644
--- a/packages/leann-core/pyproject.toml
+++ b/packages/leann-core/pyproject.toml
@@ -4,7 +4,7 @@
 
 [project]
 name = "leann-core"
-version = "0.3.3"
+version = "0.3.4"
 description = "Core API and plugin system for LEANN"
 readme = "README.md"
 requires-python = ">=3.9"
diff --git a/apps/chunking/utils.py b/packages/leann-core/src/leann/chunking_utils.py
similarity index 56%
rename from apps/chunking/utils.py
rename to packages/leann-core/src/leann/chunking_utils.py
index 9a19c63..db80a39 100644
--- a/apps/chunking/utils.py
+++ b/packages/leann-core/src/leann/chunking_utils.py
@@ -1,6 +1,6 @@
 """
 Enhanced chunking utilities with AST-aware code chunking support.
-Provides unified interface for both traditional and AST-based text chunking.
+Packaged within leann-core so installed wheels can import it reliably.
 """
 
 import logging
@@ -22,30 +22,9 @@ CODE_EXTENSIONS = {
     ".jsx": "typescript",
 }
 
-# Default chunk parameters for different content types
-DEFAULT_CHUNK_PARAMS = {
-    "code": {
-        "max_chunk_size": 512,
-        "chunk_overlap": 64,
-    },
-    "text": {
-        "chunk_size": 256,
-        "chunk_overlap": 128,
-    },
-}
-
 
 def detect_code_files(documents, code_extensions=None) -> tuple[list, list]:
-    """
-    Separate documents into code files and regular text files.
-
-    Args:
-        documents: List of LlamaIndex Document objects
-        code_extensions: Dict mapping file extensions to languages (defaults to CODE_EXTENSIONS)
-
-    Returns:
-        Tuple of (code_documents, text_documents)
-    """
+    """Separate documents into code files and regular text files."""
     if code_extensions is None:
         code_extensions = CODE_EXTENSIONS
 
@@ -53,16 +32,10 @@ def detect_code_files(documents, code_extensions=None) -> tuple[list, list]:
     code_docs = []
     text_docs = []
     for doc in documents:
-        # Get file path from metadata
-        file_path = doc.metadata.get("file_path", "")
-        if not file_path:
-            # Fallback to file_name
-            file_path = doc.metadata.get("file_name", "")
-
+        file_path = doc.metadata.get("file_path", "") or doc.metadata.get("file_name", "")
         if file_path:
             file_ext = Path(file_path).suffix.lower()
             if file_ext in code_extensions:
-                # Add language info to metadata
                 doc.metadata["language"] = code_extensions[file_ext]
                 doc.metadata["is_code"] = True
                 code_docs.append(doc)
@@ -70,7 +43,6 @@ def detect_code_files(documents, code_extensions=None) -> tuple[list, list]:
             else:
                 doc.metadata["is_code"] = False
                 text_docs.append(doc)
         else:
-            # If no file path, treat as text
             doc.metadata["is_code"] = False
             text_docs.append(doc)
 
@@ -79,7 +51,7 @@
 
 def get_language_from_extension(file_path: str) -> Optional[str]:
-    """Get the programming language from file extension."""
+    """Return language string from a filename/extension using CODE_EXTENSIONS."""
     ext = Path(file_path).suffix.lower()
     return CODE_EXTENSIONS.get(ext)
 
 
@@ -90,40 +62,26 @@ def create_ast_chunks(
     chunk_overlap: int = 64,
     metadata_template: str = "default",
 ) -> list[str]:
-    """
-    Create AST-aware chunks from code documents using astchunk.
+    """Create AST-aware chunks from code documents using astchunk.
 
-    Args:
-        documents: List of code documents
-        max_chunk_size: Maximum characters per chunk
-        chunk_overlap: Number of AST nodes to overlap between chunks
-        metadata_template: Template for chunk metadata
-
-    Returns:
-        List of text chunks with preserved code structure
+    Falls back to traditional chunking if astchunk is unavailable.
     """
     try:
-        from astchunk import ASTChunkBuilder
+        from astchunk import ASTChunkBuilder  # optional dependency
     except ImportError as e:
         logger.error(f"astchunk not available: {e}")
         logger.info("Falling back to traditional chunking for code files")
         return create_traditional_chunks(documents, max_chunk_size, chunk_overlap)
 
     all_chunks = []
-
     for doc in documents:
-        # Get language from metadata (set by detect_code_files)
        language = doc.metadata.get("language")
         if not language:
-            logger.warning(
-                "No language detected for document, falling back to traditional chunking"
-            )
-            traditional_chunks = create_traditional_chunks([doc], max_chunk_size, chunk_overlap)
-            all_chunks.extend(traditional_chunks)
+            logger.warning("No language detected; falling back to traditional chunking")
+            all_chunks.extend(create_traditional_chunks([doc], max_chunk_size, chunk_overlap))
             continue
 
         try:
-            # Configure astchunk
             configs = {
                 "max_chunk_size": max_chunk_size,
                 "language": language,
                 "metadata_template": metadata_template,
                 "chunk_overlap": chunk_overlap if chunk_overlap > 0 else 0,
             }
@@ -131,7 +89,6 @@ def create_ast_chunks(
 
-            # Add repository-level metadata if available
             repo_metadata = {
                 "file_path": doc.metadata.get("file_path", ""),
                 "file_name": doc.metadata.get("file_name", ""),
             }
             configs["repo_level_metadata"] = repo_metadata
@@ -140,17 +97,13 @@ def create_ast_chunks(
-            # Create chunk builder and process
             chunk_builder = ASTChunkBuilder(**configs)
             code_content = doc.get_content()
-
             if not code_content or not code_content.strip():
                 logger.warning("Empty code content, skipping")
                 continue
 
             chunks = chunk_builder.chunkify(code_content)
-
-            # Extract text content from chunks
             for chunk in chunks:
                 if hasattr(chunk, "text"):
                     chunk_text = chunk.text
@@ -159,7 +112,6 @@ def create_ast_chunks(
                 elif isinstance(chunk, str):
                     chunk_text = chunk
                 else:
-                    # Try to convert to string
                     chunk_text = str(chunk)
 
                 if chunk_text and chunk_text.strip():
@@ -168,12 +120,10 @@ def create_ast_chunks(
                     all_chunks.append(chunk_text.strip())
 
             logger.info(
                 f"Created {len(chunks)} AST chunks from {language} file: {doc.metadata.get('file_name', 'unknown')}"
             )
-
         except Exception as e:
             logger.warning(f"AST chunking failed for {language} file: {e}")
             logger.info("Falling back to traditional chunking")
-            traditional_chunks = create_traditional_chunks([doc], max_chunk_size, chunk_overlap)
-            all_chunks.extend(traditional_chunks)
+            all_chunks.extend(create_traditional_chunks([doc], max_chunk_size, chunk_overlap))
 
     return all_chunks
 
@@ -181,23 +131,10 @@
 def create_traditional_chunks(
     documents, chunk_size: int = 256, chunk_overlap: int = 128
 ) -> list[str]:
-    """
-    Create traditional text chunks using LlamaIndex SentenceSplitter.
-
-    Args:
-        documents: List of documents to chunk
-        chunk_size: Size of each chunk in characters
-        chunk_overlap: Overlap between chunks
-
-    Returns:
-        List of text chunks
-    """
-    # Handle invalid chunk_size values
+    """Create traditional text chunks using LlamaIndex SentenceSplitter."""
     if chunk_size <= 0:
         logger.warning(f"Invalid chunk_size={chunk_size}, using default value of 256")
         chunk_size = 256
-
-    # Ensure chunk_overlap is not negative and not larger than chunk_size
     if chunk_overlap < 0:
         chunk_overlap = 0
     if chunk_overlap >= chunk_size:
@@ -215,12 +152,9 @@ def create_traditional_chunks(
         try:
             nodes = node_parser.get_nodes_from_documents([doc])
             if nodes:
-                chunk_texts = [node.get_content() for node in nodes]
-                all_texts.extend(chunk_texts)
-                logger.debug(f"Created {len(chunk_texts)} traditional chunks from document")
+                all_texts.extend(node.get_content() for node in nodes)
         except Exception as e:
             logger.error(f"Traditional chunking failed for document: {e}")
-            # As last resort, add the raw content
             content = doc.get_content()
             if content and content.strip():
                 all_texts.append(content.strip())
@@ -238,32 +172,13 @@ def create_text_chunks(
     code_file_extensions: Optional[list[str]] = None,
     ast_fallback_traditional: bool = True,
 ) -> list[str]:
-    """
-    Create text chunks from documents with optional AST support for code files.
-
-    Args:
-        documents: List of LlamaIndex Document objects
-        chunk_size: Size for traditional text chunks
-        chunk_overlap: Overlap for traditional text chunks
-        use_ast_chunking: Whether to use AST chunking for code files
-        ast_chunk_size: Size for AST chunks
-        ast_chunk_overlap: Overlap for AST chunks
-        code_file_extensions: Custom list of code file extensions
-        ast_fallback_traditional: Fall back to traditional chunking on AST errors
-
-    Returns:
-        List of text chunks
-    """
+    """Create text chunks from documents with optional AST support for code files."""
     if not documents:
         logger.warning("No documents provided for chunking")
         return []
 
-    # Create a local copy of supported extensions for this function call
     local_code_extensions = CODE_EXTENSIONS.copy()
-
-    # Update supported extensions if provided
     if code_file_extensions:
-        # Map extensions to languages (simplified mapping)
         ext_mapping = {
             ".py": "python",
             ".java": "java",
@@ -273,47 +188,32 @@ def create_text_chunks(
         }
         for ext in code_file_extensions:
             if ext.lower() not in local_code_extensions:
-                # Try to guess language from extension
                 if ext.lower() in ext_mapping:
                     local_code_extensions[ext.lower()] = ext_mapping[ext.lower()]
                 else:
                     logger.warning(f"Unsupported extension {ext}, will use traditional chunking")
 
     all_chunks = []
-
     if use_ast_chunking:
-        # Separate code and text documents using local extensions
         code_docs, text_docs = detect_code_files(documents, local_code_extensions)
-
-        # Process code files with AST chunking
         if code_docs:
-            logger.info(f"Processing {len(code_docs)} code files with AST chunking")
             try:
-                ast_chunks = create_ast_chunks(
-                    code_docs, max_chunk_size=ast_chunk_size, chunk_overlap=ast_chunk_overlap
+                all_chunks.extend(
+                    create_ast_chunks(
+                        code_docs, max_chunk_size=ast_chunk_size, chunk_overlap=ast_chunk_overlap
+                    )
                 )
-                all_chunks.extend(ast_chunks)
-                logger.info(f"Created {len(ast_chunks)} AST chunks from code files")
             except Exception as e:
                 logger.error(f"AST chunking failed: {e}")
                 if ast_fallback_traditional:
-                    logger.info("Falling back to traditional chunking for code files")
-                    traditional_code_chunks = create_traditional_chunks(
-                        code_docs, chunk_size, chunk_overlap
+                    all_chunks.extend(
+                        create_traditional_chunks(code_docs, chunk_size, chunk_overlap)
                     )
-                    all_chunks.extend(traditional_code_chunks)
                 else:
                     raise
-
-        # Process text files with traditional chunking
         if text_docs:
-            logger.info(f"Processing {len(text_docs)} text files with traditional chunking")
-            text_chunks = create_traditional_chunks(text_docs, chunk_size, chunk_overlap)
-            all_chunks.extend(text_chunks)
-            logger.info(f"Created {len(text_chunks)} traditional chunks from text files")
+            all_chunks.extend(create_traditional_chunks(text_docs, chunk_size, chunk_overlap))
     else:
-        # Use traditional chunking for all files
-        logger.info(f"Processing {len(documents)} documents with traditional chunking")
         all_chunks = create_traditional_chunks(documents, chunk_size, chunk_overlap)
 
     logger.info(f"Total chunks created: {len(all_chunks)}")
diff --git a/packages/leann-core/src/leann/cli.py b/packages/leann-core/src/leann/cli.py
index caad276..2d514e2 100644
--- a/packages/leann-core/src/leann/cli.py
+++ b/packages/leann-core/src/leann/cli.py
@@ -1,6 +1,5 @@
 import argparse
 import asyncio
-import sys
 from pathlib import Path
 from typing import Any, Optional, Union
 
@@ -1216,13 +1215,8 @@ Examples:
         if use_ast:
             print("🧠 Using AST-aware chunking for code files")
             try:
-                # Import enhanced chunking utilities
-                # Add apps directory to path to import chunking utilities
-                apps_dir = Path(__file__).parent.parent.parent.parent.parent / "apps"
-                if apps_dir.exists():
-                    sys.path.insert(0, str(apps_dir))
-
-                from chunking import create_text_chunks
+                # Import enhanced chunking utilities from packaged module
+                from .chunking_utils import create_text_chunks
 
                 # Use enhanced chunking with AST support
                 all_texts = create_text_chunks(
@@ -1237,7 +1231,9 @@ Examples:
                 )
 
             except ImportError as e:
-                print(f"⚠️ AST chunking not available ({e}), falling back to traditional chunking")
+                print(
+                    f"⚠️ AST chunking utilities not available in package ({e}), falling back to traditional chunking"
+                )
                 use_ast = False
 
     if not use_ast:
diff --git a/packages/leann/pyproject.toml b/packages/leann/pyproject.toml
index 41c54d0..5106f9e 100644
--- a/packages/leann/pyproject.toml
+++ b/packages/leann/pyproject.toml
@@ -4,7 +4,7 @@
 
 [project]
 name = "leann"
-version = "0.3.3"
+version = "0.3.4"
 description = "LEANN - The smallest vector index in the world. RAG Everything with LEANN!"
 readme = "README.md"
 requires-python = ">=3.9"
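After this change, both the repo apps and installed wheels resolve the chunking helpers through `leann.chunking_utils`, which is what `leann.cli` now imports instead of splicing the repo's `apps/` directory onto `sys.path`. A minimal usage sketch of the packaged entry point, assuming an installed `leann-core` wheel plus LlamaIndex for document loading (the `SimpleDirectoryReader` call and the `./my_project` path are illustrative, not part of this diff):

```python
# Sketch: exercising the packaged chunking utilities after this PR.
# Assumes leann-core and llama-index are installed; "./my_project" is a
# placeholder path.
from llama_index.core import SimpleDirectoryReader

from leann.chunking_utils import create_text_chunks

# Load a mix of code and text files as LlamaIndex Document objects.
documents = SimpleDirectoryReader("./my_project").load_data()

# Code files (detected via CODE_EXTENSIONS) go through AST-aware chunking;
# everything else uses the SentenceSplitter path. If astchunk is not
# installed, create_ast_chunks falls back to traditional chunking.
chunks = create_text_chunks(
    documents,
    chunk_size=256,
    chunk_overlap=128,
    use_ast_chunking=True,
    ast_chunk_size=512,
    ast_chunk_overlap=64,
)
print(f"Created {len(chunks)} chunks")
```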