diff --git a/.github/workflows/build-reusable.yml b/.github/workflows/build-reusable.yml index f0b7b24..d0bfb16 100644 --- a/.github/workflows/build-reusable.yml +++ b/.github/workflows/build-reusable.yml @@ -54,6 +54,17 @@ jobs: python: '3.12' - os: ubuntu-22.04 python: '3.13' + # ARM64 Linux builds + - os: ubuntu-24.04-arm + python: '3.9' + - os: ubuntu-24.04-arm + python: '3.10' + - os: ubuntu-24.04-arm + python: '3.11' + - os: ubuntu-24.04-arm + python: '3.12' + - os: ubuntu-24.04-arm + python: '3.13' - os: macos-14 python: '3.9' - os: macos-14 @@ -108,13 +119,46 @@ jobs: pkg-config libabsl-dev libaio-dev libprotobuf-dev \ patchelf - # Install Intel MKL for DiskANN - wget -q https://registrationcenter-download.intel.com/akdlm/IRC_NAS/79153e0f-74d7-45af-b8c2-258941adf58a/intel-onemkl-2025.0.0.940.sh - sudo sh intel-onemkl-2025.0.0.940.sh -a --components intel.oneapi.lin.mkl.devel --action install --eula accept -s - source /opt/intel/oneapi/setvars.sh - echo "MKLROOT=/opt/intel/oneapi/mkl/latest" >> $GITHUB_ENV - echo "LD_LIBRARY_PATH=/opt/intel/oneapi/compiler/latest/linux/compiler/lib/intel64_lin" >> $GITHUB_ENV - echo "LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/intel/oneapi/mkl/latest/lib/intel64" >> $GITHUB_ENV + # Debug: Show system information + echo "πŸ” System Information:" + echo "Architecture: $(uname -m)" + echo "OS: $(uname -a)" + echo "CPU info: $(lscpu | head -5)" + + # Install math library based on architecture + ARCH=$(uname -m) + echo "πŸ” Setting up math library for architecture: $ARCH" + + if [[ "$ARCH" == "x86_64" ]]; then + # Install Intel MKL for DiskANN on x86_64 + echo "πŸ“¦ Installing Intel MKL for x86_64..." 
+ wget -q https://registrationcenter-download.intel.com/akdlm/IRC_NAS/79153e0f-74d7-45af-b8c2-258941adf58a/intel-onemkl-2025.0.0.940.sh + sudo sh intel-onemkl-2025.0.0.940.sh -a --components intel.oneapi.lin.mkl.devel --action install --eula accept -s + source /opt/intel/oneapi/setvars.sh + echo "MKLROOT=/opt/intel/oneapi/mkl/latest" >> "$GITHUB_ENV" + # GITHUB_ENV keeps only the last assignment per variable, so export one combined LD_LIBRARY_PATH (no empty entry when unset) + echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH:+$LD_LIBRARY_PATH:}/opt/intel/oneapi/compiler/latest/linux/compiler/lib/intel64_lin:/opt/intel/oneapi/mkl/latest/lib/intel64" >> "$GITHUB_ENV" + echo "βœ… Intel MKL installed for x86_64" + + # Debug: Check MKL installation + echo "πŸ” MKL Installation Check:" + ls -la /opt/intel/oneapi/mkl/latest/ || echo "MKL directory not found" + ls -la /opt/intel/oneapi/mkl/latest/lib/ || echo "MKL lib directory not found" + + elif [[ "$ARCH" == "aarch64" ]]; then + # Use OpenBLAS for ARM64 (MKL installer not compatible with ARM64) + echo "πŸ“¦ Installing OpenBLAS for ARM64..." + sudo apt-get install -y libopenblas-dev liblapack-dev liblapacke-dev + echo "βœ… OpenBLAS installed for ARM64" + + # Debug: Check OpenBLAS installation + echo "πŸ” OpenBLAS Installation Check:" + dpkg -l | grep openblas || echo "OpenBLAS package not found" + ls -la /usr/lib/aarch64-linux-gnu/openblas/ || echo "OpenBLAS directory not found" + fi + + # Debug: Show final library paths + echo "πŸ” Final LD_LIBRARY_PATH: $LD_LIBRARY_PATH" - name: Install system dependencies (macOS) if: runner.os == 'macOS' diff --git a/.gitignore b/.gitignore index ab892ee..9b106b1 100755 --- a/.gitignore +++ b/.gitignore @@ -22,6 +22,7 @@ demo/experiment_results/**/*.json *.sh *.txt !CMakeLists.txt +!llms.txt latency_breakdown*.json experiment_results/eval_results/diskann/*.json aws/ diff --git a/.gitmodules b/.gitmodules index c1cd540..aa2e98e 100644 --- a/.gitmodules +++ b/.gitmodules @@ -14,3 +14,7 @@ [submodule "packages/leann-backend-hnsw/third_party/libzmq"] path = packages/leann-backend-hnsw/third_party/libzmq
url = https://github.com/zeromq/libzmq.git +[submodule "packages/astchunk-leann"] + path = packages/astchunk-leann + url = git@github.com:yichuan-w/astchunk-leann.git + branch = main diff --git a/README.md b/README.md index 6b35801..90c0b33 100755 --- a/README.md +++ b/README.md @@ -8,6 +8,8 @@ Platform MIT License MCP Integration + Join Slack + Join WeChat group

@@ -654,6 +656,19 @@ results = searcher.search( πŸ“– **[Complete Metadata filtering guide β†’](docs/metadata_filtering.md)** +### πŸ” Grep Search + +For exact text matching instead of semantic search, use the `use_grep` parameter: + +```python +# Exact text search +results = searcher.search("banana‑crocodile", use_grep=True, top_k=1) +``` + +**Use cases**: Finding specific code patterns, error messages, function names, or exact phrases where semantic similarity isn't needed. + +πŸ“– **[Complete grep search guide β†’](docs/grep_search.md)** + ## πŸ—οΈ Architecture & How It Works

diff --git a/assets/wechat_user_group.JPG b/assets/wechat_user_group.JPG new file mode 100644 index 0000000..ab6236e Binary files /dev/null and b/assets/wechat_user_group.JPG differ diff --git a/docs/ast_chunking_guide.md b/docs/ast_chunking_guide.md index dd5be37..34d7ccb 100644 --- a/docs/ast_chunking_guide.md +++ b/docs/ast_chunking_guide.md @@ -26,6 +26,21 @@ leann build my-code-index --docs ./src --use-ast-chunking uv pip install -e "." ``` +#### For normal users (PyPI install) +- Use `pip install leann` or `uv pip install leann`. +- `astchunk` is pulled automatically from PyPI as a dependency; no extra steps. + +#### For developers (from source, editable) +```bash +git clone https://github.com/yichuan-w/LEANN.git leann +cd leann +git submodule update --init --recursive +uv sync +``` +- This repo vendors `astchunk` as a git submodule at `packages/astchunk-leann` (our fork). +- `[tool.uv.sources]` maps the `astchunk` package to that path in editable mode. +- You can edit code under `packages/astchunk-leann` and Python will use your changes immediately (no separate `pip install astchunk` needed). + ## Best Practices ### When to Use AST Chunking diff --git a/docs/grep_search.md b/docs/grep_search.md new file mode 100644 index 0000000..4fe002f --- /dev/null +++ b/docs/grep_search.md @@ -0,0 +1,149 @@ +# LEANN Grep Search Usage Guide + +## Overview + +LEANN's grep search functionality provides exact text matching for finding specific code patterns, error messages, function names, or exact phrases in your indexed documents. 
+ +## Basic Usage + +### Simple Grep Search + +```python +from leann.api import LeannSearcher + +searcher = LeannSearcher("your_index_path") + +# Exact text search +results = searcher.search("def authenticate_user", use_grep=True, top_k=5) + +for result in results: + print(f"Score: {result.score}") + print(f"Text: {result.text[:100]}...") + print("-" * 40) +``` + +### Comparison: Semantic vs Grep Search + +```python +# Semantic search - finds conceptually similar content +semantic_results = searcher.search("machine learning algorithms", top_k=3) + +# Grep search - finds exact text matches +grep_results = searcher.search("def train_model", use_grep=True, top_k=3) +``` + +## When to Use Grep Search + +### Use Cases + +- **Code Search**: Finding specific function definitions, class names, or variable references +- **Error Debugging**: Locating exact error messages or stack traces +- **Documentation**: Finding specific API endpoints or exact terminology + +### Examples + +```python +# Find function definitions +functions = searcher.search("def __init__", use_grep=True) + +# Find import statements +imports = searcher.search("from sklearn import", use_grep=True) + +# Find specific error types +errors = searcher.search("FileNotFoundError", use_grep=True) + +# Find TODO comments +todos = searcher.search("TODO:", use_grep=True) + +# Find configuration entries +configs = searcher.search("server_port=", use_grep=True) +``` + +## Technical Details + +### How It Works + +1. **File Location**: Grep search operates on the raw text stored in `.jsonl` files +2. **Command Execution**: Uses the system `grep` command with case-insensitive search +3. **Result Processing**: Parses JSON lines and extracts text and metadata +4. 
**Scoring**: Simple frequency-based scoring based on query term occurrences + +### Search Process + +``` +Query: "def train_model" + ↓ +grep -i -n "def train_model" documents.leann.passages.jsonl + ↓ +Parse matching JSON lines + ↓ +Calculate scores based on term frequency + ↓ +Return top_k results +``` + +### Scoring Algorithm + +```python +# Term frequency in document +score = text.lower().count(query.lower()) +``` + +Results are ranked by score (highest first), with higher scores indicating more occurrences of the search term. + +## Error Handling + +### Common Issues + +#### Grep Command Not Found +``` +RuntimeError: grep command not found. Please install grep or use semantic search. +``` + +**Solution**: Install grep on your system: +- **Ubuntu/Debian**: `sudo apt-get install grep` +- **macOS**: grep is pre-installed +- **Windows**: Use WSL or install grep via Git Bash/MSYS2 + +#### No Results Found +```python +# Check if your query exists in the raw data +results = searcher.search("your_query", use_grep=True) +if not results: + print("No exact matches found. Try:") + print("1. Check spelling and case") + print("2. Use partial terms") + print("3. Switch to semantic search") +``` + +## Complete Example + +```python +#!/usr/bin/env python3 +""" +Grep Search Example +Demonstrates grep search for exact text matching. +""" + +from leann.api import LeannSearcher + +def demonstrate_grep_search(): + # Initialize searcher + searcher = LeannSearcher("my_index") + + print("=== Function Search ===") + functions = searcher.search("def __init__", use_grep=True, top_k=5) + for i, result in enumerate(functions, 1): + print(f"{i}. 
Score: {result.score}") + print(f" Preview: {result.text[:60]}...") + print() + + print("=== Error Search ===") + errors = searcher.search("FileNotFoundError", use_grep=True, top_k=3) + for result in errors: + print(f"Content: {result.text.strip()}") + print("-" * 40) + +if __name__ == "__main__": + demonstrate_grep_search() +``` diff --git a/examples/grep_search_example.py b/examples/grep_search_example.py new file mode 100644 index 0000000..71723ab --- /dev/null +++ b/examples/grep_search_example.py @@ -0,0 +1,35 @@ +""" +Grep Search Example + +Shows how to use grep-based text search instead of semantic search. +Useful when you need exact text matches rather than meaning-based results. +""" + +from leann import LeannSearcher + +# Load your index +searcher = LeannSearcher("my-documents.leann") + +# Regular semantic search +print("=== Semantic Search ===") +results = searcher.search("machine learning algorithms", top_k=3) +for result in results: + print(f"Score: {result.score:.3f}") + print(f"Text: {result.text[:80]}...") + print() + +# Grep-based search for exact text matches +print("=== Grep Search ===") +results = searcher.search("def train_model", top_k=3, use_grep=True) +for result in results: + print(f"Score: {result.score}") + print(f"Text: {result.text[:80]}...") + print() + +# Find specific error messages +error_results = searcher.search("FileNotFoundError", use_grep=True) +print(f"Found {len(error_results)} files mentioning FileNotFoundError") + +# Search for function definitions +func_results = searcher.search("class SearchResult", use_grep=True, top_k=5) +print(f"Found {len(func_results)} class definitions") diff --git a/llms.txt b/llms.txt new file mode 100644 index 0000000..e470008 --- /dev/null +++ b/llms.txt @@ -0,0 +1,28 @@ +# llms.txt β€” LEANN MCP and Agent Integration +product: LEANN +homepage: https://github.com/yichuan-w/LEANN +contact: https://github.com/yichuan-w/LEANN/issues + +# Installation +install: uv tool install leann-core --with 
leann + +# MCP Server Entry Point +mcp.server: leann_mcp +mcp.protocol_version: 2024-11-05 + +# Tools +mcp.tools: leann_list, leann_search + +mcp.tool.leann_list.description: List available LEANN indexes +mcp.tool.leann_list.input: {} + +mcp.tool.leann_search.description: Semantic search across a named LEANN index +mcp.tool.leann_search.input.index_name: string, required +mcp.tool.leann_search.input.query: string, required +mcp.tool.leann_search.input.top_k: integer, optional, default=5, min=1, max=20 +mcp.tool.leann_search.input.complexity: integer, optional, default=32, min=16, max=128 + +# Notes +note: Build indexes with `leann build --docs ` before searching. +example.add: claude mcp add --scope user leann-server -- leann_mcp +example.verify: claude mcp list | cat diff --git a/packages/astchunk-leann b/packages/astchunk-leann new file mode 160000 index 0000000..a453701 --- /dev/null +++ b/packages/astchunk-leann @@ -0,0 +1 @@ +Subproject commit a4537018a329ba96f187b1d97c15abd1a04b8093 diff --git a/packages/leann-backend-diskann/pyproject.toml b/packages/leann-backend-diskann/pyproject.toml index a98396a..07be0ac 100644 --- a/packages/leann-backend-diskann/pyproject.toml +++ b/packages/leann-backend-diskann/pyproject.toml @@ -4,8 +4,8 @@ build-backend = "scikit_build_core.build" [project] name = "leann-backend-diskann" -version = "0.3.2" -dependencies = ["leann-core==0.3.2", "numpy", "protobuf>=3.19.0"] +version = "0.3.3" +dependencies = ["leann-core==0.3.3", "numpy", "protobuf>=3.19.0"] [tool.scikit-build] # Key: simplified CMake path diff --git a/packages/leann-backend-diskann/third_party/DiskANN b/packages/leann-backend-diskann/third_party/DiskANN index c593831..19f9603 160000 --- a/packages/leann-backend-diskann/third_party/DiskANN +++ b/packages/leann-backend-diskann/third_party/DiskANN @@ -1 +1 @@ -Subproject commit c593831474afb26bf167b077c2f0956ddbc54603 +Subproject commit 19f9603c728f51ff4a37df78805a3bb183e9870d diff --git 
a/packages/leann-backend-hnsw/CMakeLists.txt b/packages/leann-backend-hnsw/CMakeLists.txt index 12e19ef..87d4592 100644 --- a/packages/leann-backend-hnsw/CMakeLists.txt +++ b/packages/leann-backend-hnsw/CMakeLists.txt @@ -49,9 +49,28 @@ set(BUILD_TESTING OFF CACHE BOOL "" FORCE) set(FAISS_ENABLE_C_API OFF CACHE BOOL "" FORCE) set(FAISS_OPT_LEVEL "generic" CACHE STRING "" FORCE) -# Disable additional SIMD versions to speed up compilation +# Disable x86-specific SIMD optimizations (important for ARM64 compatibility) set(FAISS_ENABLE_AVX2 OFF CACHE BOOL "" FORCE) set(FAISS_ENABLE_AVX512 OFF CACHE BOOL "" FORCE) +set(FAISS_ENABLE_SSE4_1 OFF CACHE BOOL "" FORCE) + +# ARM64-specific configuration +if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64") + message(STATUS "Configuring Faiss for ARM64 architecture") + + if(CMAKE_SYSTEM_NAME STREQUAL "Linux") + # Use SVE optimization level for ARM64 Linux (as seen in Faiss conda build) + set(FAISS_OPT_LEVEL "sve" CACHE STRING "" FORCE) + message(STATUS "Setting FAISS_OPT_LEVEL to 'sve' for ARM64 Linux") + else() + # Use generic optimization for other ARM64 platforms (like macOS) + set(FAISS_OPT_LEVEL "generic" CACHE STRING "" FORCE) + message(STATUS "Setting FAISS_OPT_LEVEL to 'generic' for ARM64 ${CMAKE_SYSTEM_NAME}") + endif() + + # ARM64 compatibility: Faiss submodule has been modified to fix x86 header inclusion + message(STATUS "Using ARM64-compatible Faiss submodule") +endif() # Additional optimization options from INSTALL.md set(CMAKE_BUILD_TYPE "Release" CACHE STRING "" FORCE) diff --git a/packages/leann-backend-hnsw/pyproject.toml b/packages/leann-backend-hnsw/pyproject.toml index 0543bb3..3456ac8 100644 --- a/packages/leann-backend-hnsw/pyproject.toml +++ b/packages/leann-backend-hnsw/pyproject.toml @@ -6,10 +6,10 @@ build-backend = "scikit_build_core.build" [project] name = "leann-backend-hnsw" -version = "0.3.2" +version = "0.3.3" description = "Custom-built HNSW (Faiss) backend for the Leann toolkit." 
dependencies = [ - "leann-core==0.3.2", + "leann-core==0.3.3", "numpy", "pyzmq>=23.0.0", "msgpack>=1.0.0", diff --git a/packages/leann-backend-hnsw/third_party/faiss b/packages/leann-backend-hnsw/third_party/faiss index 4a2c0d6..ed96ff7 160000 --- a/packages/leann-backend-hnsw/third_party/faiss +++ b/packages/leann-backend-hnsw/third_party/faiss @@ -1 +1 @@ -Subproject commit 4a2c0d67d37a6f27c9a1cd695a3d703dcce73bad +Subproject commit ed96ff7dbaea0562b994f8ce7823af41884b1010 diff --git a/packages/leann-core/pyproject.toml b/packages/leann-core/pyproject.toml index c47aa90..82a65d9 100644 --- a/packages/leann-core/pyproject.toml +++ b/packages/leann-core/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "leann-core" -version = "0.3.2" +version = "0.3.3" description = "Core API and plugin system for LEANN" readme = "README.md" requires-python = ">=3.9" diff --git a/packages/leann-core/src/leann/api.py b/packages/leann-core/src/leann/api.py index 49f61a6..d808e66 100644 --- a/packages/leann-core/src/leann/api.py +++ b/packages/leann-core/src/leann/api.py @@ -6,6 +6,8 @@ with the correct, original embedding logic from the user's reference code. 
import json import logging import pickle +import re +import subprocess import time import warnings from dataclasses import dataclass, field @@ -675,6 +677,7 @@ class LeannSearcher: expected_zmq_port: int = 5557, metadata_filters: Optional[dict[str, dict[str, Union[str, int, float, bool, list]]]] = None, batch_size: int = 0, + use_grep: bool = False, **kwargs, ) -> list[SearchResult]: """ @@ -701,6 +704,10 @@ class LeannSearcher: Returns: List of SearchResult objects with text, metadata, and similarity scores """ + # Handle grep search + if use_grep: + return self._grep_search(query, top_k) + logger.info("πŸ” LeannSearcher.search() called:") logger.info(f" Query: '{query}'") logger.info(f" Top_k: {top_k}") @@ -817,9 +824,96 @@ class LeannSearcher: logger.info(f" {GREEN}βœ“ Final enriched results: {len(enriched_results)} passages{RESET}") return enriched_results + def _find_jsonl_file(self) -> Optional[str]: + """Find the .jsonl file containing raw passages for grep search""" + index_path = Path(self.meta_path_str).parent + potential_files = [ + index_path / "documents.leann.passages.jsonl", + index_path.parent / "documents.leann.passages.jsonl", + ] + + for file_path in potential_files: + if file_path.exists(): + return str(file_path) + return None + + def _grep_search(self, query: str, top_k: int = 5) -> list[SearchResult]: + """Perform grep-based search on raw passages""" + jsonl_file = self._find_jsonl_file() + if not jsonl_file: + raise FileNotFoundError("No .jsonl passages file found for grep search") + + try: + cmd = ["grep", "-i", "-n", query, jsonl_file] + result = subprocess.run(cmd, capture_output=True, text=True, check=False) + + if result.returncode == 1: + return [] + elif result.returncode != 0: + raise RuntimeError(f"Grep failed: {result.stderr}") + + matches = [] + for line in result.stdout.strip().split("\n"): + if not line: + continue + parts = line.split(":", 1) + if len(parts) != 2: + continue + + try: + data = json.loads(parts[1]) + text = 
data.get("text", "") + score = text.lower().count(query.lower()) + + matches.append( + SearchResult( + id=data.get("id", parts[0]), + text=text, + metadata=data.get("metadata", {}), + score=float(score), + ) + ) + except json.JSONDecodeError: + continue + + matches.sort(key=lambda x: x.score, reverse=True) + return matches[:top_k] + + except FileNotFoundError: + raise RuntimeError( + "grep command not found. Please install grep or use semantic search." + ) + + def _python_regex_search(self, query: str, top_k: int = 5) -> list[SearchResult]: + """Fallback regex search""" + jsonl_file = self._find_jsonl_file() + if not jsonl_file: + raise FileNotFoundError("No .jsonl file found") + + pattern = re.compile(re.escape(query), re.IGNORECASE) + matches = [] + + with open(jsonl_file, encoding="utf-8") as f: + for line_num, line in enumerate(f, 1): + if pattern.search(line): + try: + data = json.loads(line.strip()) + matches.append( + SearchResult( + id=data.get("id", str(line_num)), + text=data.get("text", ""), + metadata=data.get("metadata", {}), + score=float(len(pattern.findall(data.get("text", "")))), + ) + ) + except json.JSONDecodeError: + continue + + matches.sort(key=lambda x: x.score, reverse=True) + return matches[:top_k] + def cleanup(self): """Explicitly cleanup embedding server resources. - This method should be called after you're done using the searcher, especially in test environments or batch processing scenarios. 
""" @@ -875,6 +969,7 @@ class LeannChat: expected_zmq_port: int = 5557, metadata_filters: Optional[dict[str, dict[str, Union[str, int, float, bool, list]]]] = None, batch_size: int = 0, + use_grep: bool = False, **search_kwargs, ): if llm_kwargs is None: diff --git a/packages/leann-core/src/leann/cli.py b/packages/leann-core/src/leann/cli.py index 5a2611a..caad276 100644 --- a/packages/leann-core/src/leann/cli.py +++ b/packages/leann-core/src/leann/cli.py @@ -322,9 +322,17 @@ Examples: return basic_matches - def _should_exclude_file(self, relative_path: Path, gitignore_matches) -> bool: - """Check if a file should be excluded using gitignore parser.""" - return gitignore_matches(str(relative_path)) + def _should_exclude_file(self, file_path: Path, gitignore_matches) -> bool: + """Check if a file should be excluded using gitignore parser. + + Always match against absolute, posix-style paths for consistency with + gitignore_parser expectations. + """ + try: + absolute_path = file_path.resolve() + except Exception: + absolute_path = Path(str(file_path)) + return gitignore_matches(absolute_path.as_posix()) def _is_git_submodule(self, path: Path) -> bool: """Check if a path is a git submodule.""" @@ -396,7 +404,9 @@ Examples: print(f" {current_path}") print(" " + "─" * 45) - current_indexes = self._discover_indexes_in_project(current_path) + current_indexes = self._discover_indexes_in_project( + current_path, exclude_dirs=other_projects + ) if current_indexes: for idx in current_indexes: total_indexes += 1 @@ -435,9 +445,14 @@ Examples: print(" leann build my-docs --docs ./documents") else: # Count only projects that have at least one discoverable index - projects_count = sum( - 1 for p in valid_projects if len(self._discover_indexes_in_project(p)) > 0 - ) + projects_count = 0 + for p in valid_projects: + if p == current_path: + discovered = self._discover_indexes_in_project(p, exclude_dirs=other_projects) + else: + discovered = self._discover_indexes_in_project(p) + if 
len(discovered) > 0: + projects_count += 1 print(f"πŸ“Š Total: {total_indexes} indexes across {projects_count} projects") if current_indexes_count > 0: @@ -454,9 +469,22 @@ Examples: print("\nπŸ’‘ Create your first index:") print(" leann build my-docs --docs ./documents") - def _discover_indexes_in_project(self, project_path: Path): - """Discover all indexes in a project directory (both CLI and apps formats)""" + def _discover_indexes_in_project( + self, project_path: Path, exclude_dirs: Optional[list[Path]] = None + ): + """Discover all indexes in a project directory (both CLI and apps formats) + + exclude_dirs: when provided, skip any APP-format index files that are + located under these directories. This prevents duplicates when the + current project is a parent directory of other registered projects. + """ indexes = [] + exclude_dirs = exclude_dirs or [] + # normalize to resolved paths once for comparison + try: + exclude_dirs_resolved = [p.resolve() for p in exclude_dirs] + except Exception: + exclude_dirs_resolved = exclude_dirs # 1. 
CLI format: .leann/indexes/index_name/ cli_indexes_dir = project_path / ".leann" / "indexes" @@ -495,6 +523,17 @@ Examples: continue except Exception: pass + # Skip meta files that live under excluded directories + try: + meta_parent_resolved = meta_file.parent.resolve() + if any( + meta_parent_resolved.is_relative_to(ex_dir) + for ex_dir in exclude_dirs_resolved + ): + continue + except Exception: + # best effort; if resolve or comparison fails, do not exclude + pass # Use the parent directory name as the app index display name display_name = meta_file.parent.name # Extract file base used to store files @@ -1022,7 +1061,8 @@ Examples: # Try to use better PDF parsers first, but only if PDFs are requested documents = [] - docs_path = Path(docs_dir) + # Use resolved absolute paths to avoid mismatches (symlinks, relative vs absolute) + docs_path = Path(docs_dir).resolve() # Check if we should process PDFs should_process_pdfs = custom_file_types is None or ".pdf" in custom_file_types @@ -1031,10 +1071,15 @@ Examples: for file_path in docs_path.rglob("*.pdf"): # Check if file matches any exclude pattern try: + # Ensure both paths are resolved before computing relativity + file_path_resolved = file_path.resolve() + # Determine directory scope using the non-resolved path to avoid + # misclassifying symlinked entries as outside the docs directory relative_path = file_path.relative_to(docs_path) if not include_hidden and _path_has_hidden_segment(relative_path): continue - if self._should_exclude_file(relative_path, gitignore_matches): + # Use absolute path for gitignore matching + if self._should_exclude_file(file_path_resolved, gitignore_matches): continue except ValueError: # Skip files that can't be made relative to docs_path @@ -1077,10 +1122,11 @@ Examples: ) -> bool: """Return True if file should be included (not excluded)""" try: - docs_path_obj = Path(docs_dir) - file_path_obj = Path(file_path) - relative_path = file_path_obj.relative_to(docs_path_obj) - return not 
self._should_exclude_file(relative_path, gitignore_matches) + docs_path_obj = Path(docs_dir).resolve() + file_path_obj = Path(file_path).resolve() + # Use absolute path for gitignore matching + _ = file_path_obj.relative_to(docs_path_obj) # validate scope + return not self._should_exclude_file(file_path_obj, gitignore_matches) except (ValueError, OSError): return True # Include files that can't be processed diff --git a/packages/leann-mcp/README.md b/packages/leann-mcp/README.md index cf783d4..356065d 100644 --- a/packages/leann-mcp/README.md +++ b/packages/leann-mcp/README.md @@ -2,6 +2,8 @@ Transform your development workflow with intelligent code assistance using LEANN's semantic search directly in Claude Code. +For agent-facing discovery details, see `llms.txt` in the repository root. + ## Prerequisites Install LEANN globally for MCP integration (with default backend): diff --git a/packages/leann/pyproject.toml b/packages/leann/pyproject.toml index 52d0bc9..41c54d0 100644 --- a/packages/leann/pyproject.toml +++ b/packages/leann/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "leann" -version = "0.3.2" +version = "0.3.3" description = "LEANN - The smallest vector index in the world. RAG Everything with LEANN!" 
readme = "README.md" requires-python = ">=3.9" diff --git a/pyproject.toml b/pyproject.toml index 35e5613..a0d83bf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -100,6 +100,7 @@ wechat-exporter = "wechat_exporter.main:main" leann-core = { path = "packages/leann-core", editable = true } leann-backend-diskann = { path = "packages/leann-backend-diskann", editable = true } leann-backend-hnsw = { path = "packages/leann-backend-hnsw", editable = true } +astchunk = { path = "packages/astchunk-leann", editable = true } [tool.ruff] target-version = "py39" diff --git a/uv.lock b/uv.lock index d01612b..28c5824 100644 --- a/uv.lock +++ b/uv.lock @@ -1,5 +1,5 @@ version = 1 -revision = 3 +revision = 2 requires-python = ">=3.9" resolution-markers = [ "python_full_version >= '3.12'", @@ -201,7 +201,7 @@ wheels = [ [[package]] name = "astchunk" version = "0.1.0" -source = { registry = "https://pypi.org/simple" } +source = { editable = "packages/astchunk-leann" } dependencies = [ { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.10.*'" }, @@ -214,10 +214,31 @@ dependencies = [ { name = "tree-sitter-python" }, { name = "tree-sitter-typescript" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/db/2a/7a35e2fac7d550265ae2ee40651425083b37555f921d1a1b77c3f525e0df/astchunk-0.1.0.tar.gz", hash = "sha256:f4dff0ef8b3b3bcfeac363384db1e153f74d4c825dc2e35864abfab027713be4", size = 18093, upload-time = "2025-06-19T04:37:25.34Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/be/84/5433ab0e933b572750cb16fd7edf3d6c7902b069461a22ec670042752a4d/astchunk-0.1.0-py3-none-any.whl", hash = "sha256:33ada9fc3620807fdda5846fa1948af463f281a60e0d43d4f3782b6dbb416d24", size = 15396, upload-time = "2025-06-19T04:37:23.87Z" }, + +[package.metadata] +requires-dist = [ + { name 
= "black", marker = "extra == 'dev'", specifier = ">=22.0.0" }, + { name = "flake8", marker = "extra == 'dev'", specifier = ">=5.0.0" }, + { name = "isort", marker = "extra == 'dev'", specifier = ">=5.10.0" }, + { name = "mypy", marker = "extra == 'dev'", specifier = ">=1.0.0" }, + { name = "myst-parser", marker = "extra == 'docs'", specifier = ">=0.18.0" }, + { name = "numpy", specifier = ">=1.20.0" }, + { name = "pre-commit", marker = "extra == 'dev'", specifier = ">=2.20.0" }, + { name = "pyrsistent", specifier = ">=0.18.0" }, + { name = "pytest", marker = "extra == 'dev'", specifier = ">=7.0.0" }, + { name = "pytest", marker = "extra == 'test'", specifier = ">=7.0.0" }, + { name = "pytest-cov", marker = "extra == 'dev'", specifier = ">=4.0.0" }, + { name = "pytest-cov", marker = "extra == 'test'", specifier = ">=4.0.0" }, + { name = "pytest-xdist", marker = "extra == 'test'", specifier = ">=2.5.0" }, + { name = "sphinx", marker = "extra == 'docs'", specifier = ">=5.0.0" }, + { name = "sphinx-rtd-theme", marker = "extra == 'docs'", specifier = ">=1.0.0" }, + { name = "tree-sitter", specifier = ">=0.20.0" }, + { name = "tree-sitter-c-sharp", specifier = ">=0.20.0" }, + { name = "tree-sitter-java", specifier = ">=0.20.0" }, + { name = "tree-sitter-python", specifier = ">=0.20.0" }, + { name = "tree-sitter-typescript", specifier = ">=0.20.0" }, ] +provides-extras = ["dev", "docs", "test"] [[package]] name = "asttokens" @@ -2117,7 +2138,7 @@ wheels = [ [[package]] name = "leann-backend-diskann" -version = "0.3.2" +version = "0.3.3" source = { editable = "packages/leann-backend-diskann" } dependencies = [ { name = "leann-core" }, @@ -2129,14 +2150,14 @@ dependencies = [ [package.metadata] requires-dist = [ - { name = "leann-core", specifier = "==0.3.2" }, + { name = "leann-core", specifier = "==0.3.3" }, { name = "numpy" }, { name = "protobuf", specifier = ">=3.19.0" }, ] [[package]] name = "leann-backend-hnsw" -version = "0.3.2" +version = "0.3.3" source = { 
editable = "packages/leann-backend-hnsw" } dependencies = [ { name = "leann-core" }, @@ -2149,7 +2170,7 @@ dependencies = [ [package.metadata] requires-dist = [ - { name = "leann-core", specifier = "==0.3.2" }, + { name = "leann-core", specifier = "==0.3.3" }, { name = "msgpack", specifier = ">=1.0.0" }, { name = "numpy" }, { name = "pyzmq", specifier = ">=23.0.0" }, @@ -2157,7 +2178,7 @@ requires-dist = [ [[package]] name = "leann-core" -version = "0.3.2" +version = "0.3.3" source = { editable = "packages/leann-core" } dependencies = [ { name = "accelerate" }, @@ -2298,7 +2319,7 @@ test = [ [package.metadata] requires-dist = [ - { name = "astchunk", specifier = ">=0.1.0" }, + { name = "astchunk", editable = "packages/astchunk-leann" }, { name = "beautifulsoup4", marker = "extra == 'documents'", specifier = ">=4.13.0" }, { name = "black", marker = "extra == 'dev'", specifier = ">=23.0" }, { name = "boto3" },