diff --git a/.github/workflows/build-reusable.yml b/.github/workflows/build-reusable.yml
index f0b7b24..d0bfb16 100644
--- a/.github/workflows/build-reusable.yml
+++ b/.github/workflows/build-reusable.yml
@@ -54,6 +54,17 @@ jobs:
python: '3.12'
- os: ubuntu-22.04
python: '3.13'
+ # ARM64 Linux builds
+ - os: ubuntu-24.04-arm
+ python: '3.9'
+ - os: ubuntu-24.04-arm
+ python: '3.10'
+ - os: ubuntu-24.04-arm
+ python: '3.11'
+ - os: ubuntu-24.04-arm
+ python: '3.12'
+ - os: ubuntu-24.04-arm
+ python: '3.13'
- os: macos-14
python: '3.9'
- os: macos-14
@@ -108,13 +119,46 @@ jobs:
pkg-config libabsl-dev libaio-dev libprotobuf-dev \
patchelf
- # Install Intel MKL for DiskANN
- wget -q https://registrationcenter-download.intel.com/akdlm/IRC_NAS/79153e0f-74d7-45af-b8c2-258941adf58a/intel-onemkl-2025.0.0.940.sh
- sudo sh intel-onemkl-2025.0.0.940.sh -a --components intel.oneapi.lin.mkl.devel --action install --eula accept -s
- source /opt/intel/oneapi/setvars.sh
- echo "MKLROOT=/opt/intel/oneapi/mkl/latest" >> $GITHUB_ENV
- echo "LD_LIBRARY_PATH=/opt/intel/oneapi/compiler/latest/linux/compiler/lib/intel64_lin" >> $GITHUB_ENV
- echo "LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/intel/oneapi/mkl/latest/lib/intel64" >> $GITHUB_ENV
+ # Debug: Show system information
+ echo "🔍 System Information:"
+ echo "Architecture: $(uname -m)"
+ echo "OS: $(uname -a)"
+ echo "CPU info: $(lscpu | head -5)"
+
+ # Install the math library that matches the runner architecture:
+ # Intel MKL on x86_64, OpenBLAS on aarch64.
+ ARCH=$(uname -m)
+ echo "🔍 Setting up math library for architecture: $ARCH"
+
+ if [[ "$ARCH" == "x86_64" ]]; then
+   # Install Intel MKL for DiskANN on x86_64
+   echo "📦 Installing Intel MKL for x86_64..."
+   wget -q https://registrationcenter-download.intel.com/akdlm/IRC_NAS/79153e0f-74d7-45af-b8c2-258941adf58a/intel-onemkl-2025.0.0.940.sh
+   sudo sh intel-onemkl-2025.0.0.940.sh -a --components intel.oneapi.lin.mkl.devel --action install --eula accept -s
+   source /opt/intel/oneapi/setvars.sh
+   echo "MKLROOT=/opt/intel/oneapi/mkl/latest" >> "$GITHUB_ENV"
+   # A NAME=value line appended to $GITHUB_ENV replaces any earlier line for the
+   # same NAME and does not change this shell's environment. Writing
+   # LD_LIBRARY_PATH twice therefore kept only the second entry and dropped the
+   # compiler directory. Emit one entry: whatever setvars.sh exported (if
+   # anything), followed by both oneAPI directories.
+   ONEAPI_LIB_DIRS="/opt/intel/oneapi/compiler/latest/linux/compiler/lib/intel64_lin:/opt/intel/oneapi/mkl/latest/lib/intel64"
+   echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH:+${LD_LIBRARY_PATH}:}${ONEAPI_LIB_DIRS}" >> "$GITHUB_ENV"
+   echo "✅ Intel MKL installed for x86_64"
+
+   # Debug: Check MKL installation
+   echo "🔍 MKL Installation Check:"
+   ls -la /opt/intel/oneapi/mkl/latest/ || echo "MKL directory not found"
+   ls -la /opt/intel/oneapi/mkl/latest/lib/ || echo "MKL lib directory not found"
+
+ elif [[ "$ARCH" == "aarch64" ]]; then
+   # Use OpenBLAS for ARM64 (MKL installer not compatible with ARM64)
+   echo "📦 Installing OpenBLAS for ARM64..."
+   sudo apt-get install -y libopenblas-dev liblapack-dev liblapacke-dev
+   echo "✅ OpenBLAS installed for ARM64"
+
+   # Debug: Check OpenBLAS installation
+   echo "🔍 OpenBLAS Installation Check:"
+   dpkg -l | grep openblas || echo "OpenBLAS package not found"
+   ls -la /usr/lib/aarch64-linux-gnu/openblas/ || echo "OpenBLAS directory not found"
+ fi
+
+ # Debug: Show final library paths.
+ # This prints the value in this step's shell; later steps see the value
+ # written to $GITHUB_ENV above.
+ echo "🔍 Final LD_LIBRARY_PATH: $LD_LIBRARY_PATH"
- name: Install system dependencies (macOS)
if: runner.os == 'macOS'
diff --git a/.gitignore b/.gitignore
index ab892ee..9b106b1 100755
--- a/.gitignore
+++ b/.gitignore
@@ -22,6 +22,7 @@ demo/experiment_results/**/*.json
*.sh
*.txt
!CMakeLists.txt
+!llms.txt
latency_breakdown*.json
experiment_results/eval_results/diskann/*.json
aws/
diff --git a/.gitmodules b/.gitmodules
index c1cd540..aa2e98e 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -14,3 +14,7 @@
[submodule "packages/leann-backend-hnsw/third_party/libzmq"]
path = packages/leann-backend-hnsw/third_party/libzmq
url = https://github.com/zeromq/libzmq.git
+[submodule "packages/astchunk-leann"]
+ # HTTPS rather than SSH so the submodule can be fetched without GitHub SSH keys
+ # (anonymous clones, CI), consistent with the other submodules in this file.
+ path = packages/astchunk-leann
+ url = https://github.com/yichuan-w/astchunk-leann.git
+ branch = main
diff --git a/README.md b/README.md
index 6b35801..90c0b33 100755
--- a/README.md
+++ b/README.md
@@ -8,6 +8,8 @@
+
+
@@ -654,6 +656,19 @@ results = searcher.search(
📚 **[Complete Metadata filtering guide →](docs/metadata_filtering.md)**
+### 🔍 Grep Search
+
+For exact text matching instead of semantic search, use the `use_grep` parameter:
+
+```python
+# Exact text search
+results = searcher.search("banana-crocodile", use_grep=True, top_k=1)
+```
+
+**Use cases**: Finding specific code patterns, error messages, function names, or exact phrases where semantic similarity isn't needed.
+
+📚 **[Complete grep search guide →](docs/grep_search.md)**
+
## 🏗️ Architecture & How It Works
diff --git a/assets/wechat_user_group.JPG b/assets/wechat_user_group.JPG
new file mode 100644
index 0000000..ab6236e
Binary files /dev/null and b/assets/wechat_user_group.JPG differ
diff --git a/docs/ast_chunking_guide.md b/docs/ast_chunking_guide.md
index dd5be37..34d7ccb 100644
--- a/docs/ast_chunking_guide.md
+++ b/docs/ast_chunking_guide.md
@@ -26,6 +26,21 @@ leann build my-code-index --docs ./src --use-ast-chunking
uv pip install -e "."
```
+#### For normal users (PyPI install)
+- Use `pip install leann` or `uv pip install leann`.
+- `astchunk` is pulled automatically from PyPI as a dependency; no extra steps.
+
+#### For developers (from source, editable)
+```bash
+git clone https://github.com/yichuan-w/LEANN.git leann
+cd leann
+git submodule update --init --recursive
+uv sync
+```
+- This repo vendors `astchunk` as a git submodule at `packages/astchunk-leann` (our fork).
+- `[tool.uv.sources]` maps the `astchunk` package to that path in editable mode.
+- You can edit code under `packages/astchunk-leann` and Python will use your changes immediately (no separate `pip install astchunk` needed).
+
## Best Practices
### When to Use AST Chunking
diff --git a/docs/grep_search.md b/docs/grep_search.md
new file mode 100644
index 0000000..4fe002f
--- /dev/null
+++ b/docs/grep_search.md
@@ -0,0 +1,149 @@
+# LEANN Grep Search Usage Guide
+
+## Overview
+
+LEANN's grep search functionality provides exact text matching for finding specific code patterns, error messages, function names, or exact phrases in your indexed documents.
+
+## Basic Usage
+
+### Simple Grep Search
+
+```python
+from leann.api import LeannSearcher
+
+searcher = LeannSearcher("your_index_path")
+
+# Exact text search
+results = searcher.search("def authenticate_user", use_grep=True, top_k=5)
+
+for result in results:
+ print(f"Score: {result.score}")
+ print(f"Text: {result.text[:100]}...")
+ print("-" * 40)
+```
+
+### Comparison: Semantic vs Grep Search
+
+```python
+# Semantic search - finds conceptually similar content
+semantic_results = searcher.search("machine learning algorithms", top_k=3)
+
+# Grep search - finds exact text matches
+grep_results = searcher.search("def train_model", use_grep=True, top_k=3)
+```
+
+## When to Use Grep Search
+
+### Use Cases
+
+- **Code Search**: Finding specific function definitions, class names, or variable references
+- **Error Debugging**: Locating exact error messages or stack traces
+- **Documentation**: Finding specific API endpoints or exact terminology
+
+### Examples
+
+```python
+# Find function definitions
+functions = searcher.search("def __init__", use_grep=True)
+
+# Find import statements
+imports = searcher.search("from sklearn import", use_grep=True)
+
+# Find specific error types
+errors = searcher.search("FileNotFoundError", use_grep=True)
+
+# Find TODO comments
+todos = searcher.search("TODO:", use_grep=True)
+
+# Find configuration entries
+configs = searcher.search("server_port=", use_grep=True)
+```
+
+## Technical Details
+
+### How It Works
+
+1. **File Location**: Grep search operates on the raw text stored in `.jsonl` files
+2. **Command Execution**: Uses the system `grep` command with case-insensitive search
+3. **Result Processing**: Parses JSON lines and extracts text and metadata
+4. **Scoring**: Simple frequency-based scoring based on query term occurrences
+
+### Search Process
+
+```
+Query: "def train_model"
+    ↓
+grep -i -n "def train_model" documents.leann.passages.jsonl
+    ↓
+Parse matching JSON lines
+    ↓
+Calculate scores based on term frequency
+    ↓
+Return top_k results
+```
+
+### Scoring Algorithm
+
+```python
+# Term frequency in document
+score = text.lower().count(query.lower())
+```
+
+Results are ranked by score (highest first), with higher scores indicating more occurrences of the search term.
+
+## Error Handling
+
+### Common Issues
+
+#### Grep Command Not Found
+```
+RuntimeError: grep command not found. Please install grep or use semantic search.
+```
+
+**Solution**: Install grep on your system:
+- **Ubuntu/Debian**: `sudo apt-get install grep`
+- **macOS**: grep is pre-installed
+- **Windows**: Use WSL or install grep via Git Bash/MSYS2
+
+#### No Results Found
+```python
+# Check if your query exists in the raw data
+results = searcher.search("your_query", use_grep=True)
+if not results:
+ print("No exact matches found. Try:")
+ print("1. Check spelling and case")
+ print("2. Use partial terms")
+ print("3. Switch to semantic search")
+```
+
+## Complete Example
+
+```python
+#!/usr/bin/env python3
+"""
+Grep Search Example
+Demonstrates grep search for exact text matching.
+"""
+
+from leann.api import LeannSearcher
+
+def demonstrate_grep_search():
+ # Initialize searcher
+ searcher = LeannSearcher("my_index")
+
+ print("=== Function Search ===")
+ functions = searcher.search("def __init__", use_grep=True, top_k=5)
+ for i, result in enumerate(functions, 1):
+ print(f"{i}. Score: {result.score}")
+ print(f" Preview: {result.text[:60]}...")
+ print()
+
+ print("=== Error Search ===")
+ errors = searcher.search("FileNotFoundError", use_grep=True, top_k=3)
+ for result in errors:
+ print(f"Content: {result.text.strip()}")
+ print("-" * 40)
+
+if __name__ == "__main__":
+ demonstrate_grep_search()
+```
diff --git a/examples/grep_search_example.py b/examples/grep_search_example.py
new file mode 100644
index 0000000..71723ab
--- /dev/null
+++ b/examples/grep_search_example.py
@@ -0,0 +1,35 @@
+"""
+Grep Search Example
+
+Shows how to use grep-based text search instead of semantic search.
+Useful when you need exact text matches rather than meaning-based results.
+"""
+
+from leann import LeannSearcher
+
+# Load your index
+searcher = LeannSearcher("my-documents.leann")
+
+# Regular semantic search
+print("=== Semantic Search ===")
+results = searcher.search("machine learning algorithms", top_k=3)
+for result in results:
+ print(f"Score: {result.score:.3f}")
+ print(f"Text: {result.text[:80]}...")
+ print()
+
+# Grep-based search for exact text matches
+print("=== Grep Search ===")
+results = searcher.search("def train_model", top_k=3, use_grep=True)
+for result in results:
+ print(f"Score: {result.score}")
+ print(f"Text: {result.text[:80]}...")
+ print()
+
+# Find specific error messages
+error_results = searcher.search("FileNotFoundError", use_grep=True)
+print(f"Found {len(error_results)} files mentioning FileNotFoundError")
+
+# Search for class definitions
+func_results = searcher.search("class SearchResult", use_grep=True, top_k=5)
+print(f"Found {len(func_results)} class definitions")
diff --git a/llms.txt b/llms.txt
new file mode 100644
index 0000000..e470008
--- /dev/null
+++ b/llms.txt
@@ -0,0 +1,28 @@
+# llms.txt — LEANN MCP and Agent Integration
+product: LEANN
+homepage: https://github.com/yichuan-w/LEANN
+contact: https://github.com/yichuan-w/LEANN/issues
+
+# Installation
+install: uv tool install leann-core --with leann
+
+# MCP Server Entry Point
+mcp.server: leann_mcp
+mcp.protocol_version: 2024-11-05
+
+# Tools
+mcp.tools: leann_list, leann_search
+
+mcp.tool.leann_list.description: List available LEANN indexes
+mcp.tool.leann_list.input: {}
+
+mcp.tool.leann_search.description: Semantic search across a named LEANN index
+mcp.tool.leann_search.input.index_name: string, required
+mcp.tool.leann_search.input.query: string, required
+mcp.tool.leann_search.input.top_k: integer, optional, default=5, min=1, max=20
+mcp.tool.leann_search.input.complexity: integer, optional, default=32, min=16, max=128
+
+# Notes
+note: Build indexes with `leann build <index-name> --docs <path>` before searching.
+example.add: claude mcp add --scope user leann-server -- leann_mcp
+example.verify: claude mcp list | cat
diff --git a/packages/astchunk-leann b/packages/astchunk-leann
new file mode 160000
index 0000000..a453701
--- /dev/null
+++ b/packages/astchunk-leann
@@ -0,0 +1 @@
+Subproject commit a4537018a329ba96f187b1d97c15abd1a04b8093
diff --git a/packages/leann-backend-diskann/pyproject.toml b/packages/leann-backend-diskann/pyproject.toml
index a98396a..07be0ac 100644
--- a/packages/leann-backend-diskann/pyproject.toml
+++ b/packages/leann-backend-diskann/pyproject.toml
@@ -4,8 +4,8 @@ build-backend = "scikit_build_core.build"
[project]
name = "leann-backend-diskann"
-version = "0.3.2"
-dependencies = ["leann-core==0.3.2", "numpy", "protobuf>=3.19.0"]
+version = "0.3.3"
+dependencies = ["leann-core==0.3.3", "numpy", "protobuf>=3.19.0"]
[tool.scikit-build]
# Key: simplified CMake path
diff --git a/packages/leann-backend-diskann/third_party/DiskANN b/packages/leann-backend-diskann/third_party/DiskANN
index c593831..19f9603 160000
--- a/packages/leann-backend-diskann/third_party/DiskANN
+++ b/packages/leann-backend-diskann/third_party/DiskANN
@@ -1 +1 @@
-Subproject commit c593831474afb26bf167b077c2f0956ddbc54603
+Subproject commit 19f9603c728f51ff4a37df78805a3bb183e9870d
diff --git a/packages/leann-backend-hnsw/CMakeLists.txt b/packages/leann-backend-hnsw/CMakeLists.txt
index 12e19ef..87d4592 100644
--- a/packages/leann-backend-hnsw/CMakeLists.txt
+++ b/packages/leann-backend-hnsw/CMakeLists.txt
@@ -49,9 +49,28 @@ set(BUILD_TESTING OFF CACHE BOOL "" FORCE)
set(FAISS_ENABLE_C_API OFF CACHE BOOL "" FORCE)
set(FAISS_OPT_LEVEL "generic" CACHE STRING "" FORCE)
-# Disable additional SIMD versions to speed up compilation
+# Disable x86-specific SIMD optimizations (important for ARM64 compatibility)
set(FAISS_ENABLE_AVX2 OFF CACHE BOOL "" FORCE)
set(FAISS_ENABLE_AVX512 OFF CACHE BOOL "" FORCE)
+set(FAISS_ENABLE_SSE4_1 OFF CACHE BOOL "" FORCE)
+
+# ARM64-specific configuration
+# Chooses FAISS_OPT_LEVEL when the target processor name matches aarch64/arm64:
+# "sve" on Linux, "generic" on every other system. All values are forced into
+# the cache, so they override anything passed on the command line.
+# NOTE(review): "sve" is applied to every aarch64 Linux build. Confirm the
+# resulting binaries still load on ARM64 CPUs without SVE support.
+if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64")
+ message(STATUS "Configuring Faiss for ARM64 architecture")
+
+ if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
+ # Use SVE optimization level for ARM64 Linux (as seen in Faiss conda build)
+ set(FAISS_OPT_LEVEL "sve" CACHE STRING "" FORCE)
+ message(STATUS "Setting FAISS_OPT_LEVEL to 'sve' for ARM64 Linux")
+ else()
+ # Use generic optimization for other ARM64 platforms (like macOS).
+ # This repeats the "generic" value already forced earlier in this file.
+ set(FAISS_OPT_LEVEL "generic" CACHE STRING "" FORCE)
+ message(STATUS "Setting FAISS_OPT_LEVEL to 'generic' for ARM64 ${CMAKE_SYSTEM_NAME}")
+ endif()
+
+ # ARM64 compatibility: Faiss submodule has been modified to fix x86 header inclusion
+ message(STATUS "Using ARM64-compatible Faiss submodule")
+endif()
# Additional optimization options from INSTALL.md
set(CMAKE_BUILD_TYPE "Release" CACHE STRING "" FORCE)
diff --git a/packages/leann-backend-hnsw/pyproject.toml b/packages/leann-backend-hnsw/pyproject.toml
index 0543bb3..3456ac8 100644
--- a/packages/leann-backend-hnsw/pyproject.toml
+++ b/packages/leann-backend-hnsw/pyproject.toml
@@ -6,10 +6,10 @@ build-backend = "scikit_build_core.build"
[project]
name = "leann-backend-hnsw"
-version = "0.3.2"
+version = "0.3.3"
description = "Custom-built HNSW (Faiss) backend for the Leann toolkit."
dependencies = [
- "leann-core==0.3.2",
+ "leann-core==0.3.3",
"numpy",
"pyzmq>=23.0.0",
"msgpack>=1.0.0",
diff --git a/packages/leann-backend-hnsw/third_party/faiss b/packages/leann-backend-hnsw/third_party/faiss
index 4a2c0d6..ed96ff7 160000
--- a/packages/leann-backend-hnsw/third_party/faiss
+++ b/packages/leann-backend-hnsw/third_party/faiss
@@ -1 +1 @@
-Subproject commit 4a2c0d67d37a6f27c9a1cd695a3d703dcce73bad
+Subproject commit ed96ff7dbaea0562b994f8ce7823af41884b1010
diff --git a/packages/leann-core/pyproject.toml b/packages/leann-core/pyproject.toml
index c47aa90..82a65d9 100644
--- a/packages/leann-core/pyproject.toml
+++ b/packages/leann-core/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
[project]
name = "leann-core"
-version = "0.3.2"
+version = "0.3.3"
description = "Core API and plugin system for LEANN"
readme = "README.md"
requires-python = ">=3.9"
diff --git a/packages/leann-core/src/leann/api.py b/packages/leann-core/src/leann/api.py
index 49f61a6..d808e66 100644
--- a/packages/leann-core/src/leann/api.py
+++ b/packages/leann-core/src/leann/api.py
@@ -6,6 +6,8 @@ with the correct, original embedding logic from the user's reference code.
import json
import logging
import pickle
+import re
+import subprocess
import time
import warnings
from dataclasses import dataclass, field
@@ -675,6 +677,7 @@ class LeannSearcher:
expected_zmq_port: int = 5557,
metadata_filters: Optional[dict[str, dict[str, Union[str, int, float, bool, list]]]] = None,
batch_size: int = 0,
+ use_grep: bool = False,
**kwargs,
) -> list[SearchResult]:
"""
@@ -701,6 +704,10 @@ class LeannSearcher:
Returns:
List of SearchResult objects with text, metadata, and similarity scores
"""
+ # Handle grep search
+ if use_grep:
+ return self._grep_search(query, top_k)
+
logger.info("π LeannSearcher.search() called:")
logger.info(f" Query: '{query}'")
logger.info(f" Top_k: {top_k}")
@@ -817,9 +824,96 @@ class LeannSearcher:
logger.info(f" {GREEN}β Final enriched results: {len(enriched_results)} passages{RESET}")
return enriched_results
+    def _find_jsonl_file(self) -> Optional[str]:
+        """Locate the raw-passages ``.jsonl`` file used by grep search.
+
+        Candidates are probed in order and the first one that exists wins:
+
+        1. ``<index>.passages.jsonl`` next to the meta file, where ``<index>`` is
+           the meta file name without its ``.meta.json`` suffix, so indexes that
+           are not named ``documents.leann`` are found too.
+        2. ``documents.leann.passages.jsonl`` in the meta file's directory.
+        3. ``documents.leann.passages.jsonl`` one directory above it.
+
+        Returns:
+            The path of the first existing candidate as a string, or ``None``
+            when none of them exists.
+        """
+        meta_path = Path(self.meta_path_str)
+        index_dir = meta_path.parent
+
+        candidates = []
+        meta_suffix = ".meta.json"
+        # presumably passages are stored as "<index>.passages.jsonl" beside
+        # "<index>.meta.json" — verify against the index builder. If the file
+        # is absent, the legacy hard-coded names below are still tried.
+        if meta_path.name.endswith(meta_suffix):
+            index_name = meta_path.name[: -len(meta_suffix)]
+            candidates.append(index_dir / f"{index_name}.passages.jsonl")
+        candidates.append(index_dir / "documents.leann.passages.jsonl")
+        candidates.append(index_dir.parent / "documents.leann.passages.jsonl")
+
+        for candidate in candidates:
+            if candidate.exists():
+                return str(candidate)
+        return None
+
+    def _grep_search(self, query: str, top_k: int = 5) -> list[SearchResult]:
+        """Search the raw passages file for literal, case-insensitive matches.
+
+        Runs the system ``grep`` over the file returned by ``_find_jsonl_file``,
+        parses every matching JSON line, and scores each hit by how many times
+        the lower-cased query occurs in the lower-cased ``text`` field.
+
+        Args:
+            query: Text to look for. It is matched as a fixed string, not as a
+                regular expression.
+            top_k: Maximum number of results to return.
+
+        Returns:
+            Up to ``top_k`` results ordered by descending score; an empty list
+            when grep finds no matching line.
+
+        Raises:
+            FileNotFoundError: If no passages ``.jsonl`` file can be located.
+            RuntimeError: If ``grep`` is not installed or exits with an error.
+        """
+        jsonl_file = self._find_jsonl_file()
+        if not jsonl_file:
+            raise FileNotFoundError("No .jsonl passages file found for grep search")
+
+        # -F matches the query literally, in line with the literal count used
+        # for scoring; without it "arr[0]" is a regex that matches "arr0" and
+        # "items[" makes grep fail. "-e" and "--" stop a query or path that
+        # starts with "-" from being parsed as a grep option.
+        cmd = ["grep", "-i", "-n", "-F", "-e", query, "--", jsonl_file]
+        try:
+            result = subprocess.run(cmd, capture_output=True, text=True, check=False)
+        except FileNotFoundError as exc:
+            # Raised by subprocess when the grep executable itself is missing.
+            raise RuntimeError(
+                "grep command not found. Please install grep or use semantic search."
+            ) from exc
+
+        # grep exit status: 0 = matches found, 1 = no match, anything else = error.
+        if result.returncode == 1:
+            return []
+        if result.returncode != 0:
+            raise RuntimeError(f"Grep failed: {result.stderr}")
+
+        needle = query.lower()
+        matches = []
+        for line in result.stdout.strip().split("\n"):
+            if not line:
+                continue
+            # With -n every output line is "<line number>:<original line>".
+            parts = line.split(":", 1)
+            if len(parts) != 2:
+                continue
+
+            try:
+                data = json.loads(parts[1])
+            except json.JSONDecodeError:
+                continue
+
+            # grep sees the whole serialized JSON line, so a hit can come from
+            # a field other than "text"; such a line keeps a score of 0.0.
+            text = data.get("text", "")
+            matches.append(
+                SearchResult(
+                    id=data.get("id", parts[0]),
+                    text=text,
+                    metadata=data.get("metadata", {}),
+                    score=float(text.lower().count(needle)),
+                )
+            )
+
+        matches.sort(key=lambda x: x.score, reverse=True)
+        return matches[:top_k]
+
+    def _python_regex_search(self, query: str, top_k: int = 5) -> list[SearchResult]:
+        """Pure-Python literal search over the raw passages file.
+
+        The query is escaped and matched case-insensitively against every line
+        of the file returned by ``_find_jsonl_file``. Matching lines that parse
+        as JSON become results, scored by the number of matches inside their
+        ``text`` field; lines that are not valid JSON are skipped.
+
+        Args:
+            query: Text to look for (matched literally).
+            top_k: Maximum number of results to return.
+
+        Returns:
+            Up to ``top_k`` results ordered by descending score.
+
+        Raises:
+            FileNotFoundError: If no passages ``.jsonl`` file can be located.
+        """
+        passages_path = self._find_jsonl_file()
+        if not passages_path:
+            raise FileNotFoundError("No .jsonl file found")
+
+        literal = re.compile(re.escape(query), re.IGNORECASE)
+        hits = []
+
+        with open(passages_path, encoding="utf-8") as handle:
+            for line_no, raw_line in enumerate(handle, start=1):
+                # Cheap pre-filter on the raw line before paying for JSON parsing.
+                if literal.search(raw_line) is None:
+                    continue
+                try:
+                    record = json.loads(raw_line.strip())
+                except json.JSONDecodeError:
+                    continue
+                hits.append(
+                    SearchResult(
+                        id=record.get("id", str(line_no)),
+                        text=record.get("text", ""),
+                        metadata=record.get("metadata", {}),
+                        score=float(len(literal.findall(record.get("text", "")))),
+                    )
+                )
+
+        ranked = sorted(hits, key=lambda hit: hit.score, reverse=True)
+        return ranked[:top_k]
+
def cleanup(self):
"""Explicitly cleanup embedding server resources.
-
This method should be called after you're done using the searcher,
especially in test environments or batch processing scenarios.
"""
@@ -875,6 +969,7 @@ class LeannChat:
expected_zmq_port: int = 5557,
metadata_filters: Optional[dict[str, dict[str, Union[str, int, float, bool, list]]]] = None,
batch_size: int = 0,
+ use_grep: bool = False,
**search_kwargs,
):
if llm_kwargs is None:
diff --git a/packages/leann-core/src/leann/cli.py b/packages/leann-core/src/leann/cli.py
index 5a2611a..caad276 100644
--- a/packages/leann-core/src/leann/cli.py
+++ b/packages/leann-core/src/leann/cli.py
@@ -322,9 +322,17 @@ Examples:
return basic_matches
- def _should_exclude_file(self, relative_path: Path, gitignore_matches) -> bool:
- """Check if a file should be excluded using gitignore parser."""
- return gitignore_matches(str(relative_path))
+    def _should_exclude_file(self, file_path: Path, gitignore_matches) -> bool:
+        """Tell whether ``file_path`` is excluded by the gitignore matcher.
+
+        The path is resolved and handed to ``gitignore_matches`` in POSIX form,
+        so the matcher always receives the same kind of path string. If
+        resolving fails for any reason, the path is used as given.
+
+        Args:
+            file_path: Path of the file to check.
+            gitignore_matches: Callable that takes a path string and returns
+                whether that path is ignored.
+
+        Returns:
+            Whatever ``gitignore_matches`` returns for the POSIX-style path.
+        """
+        try:
+            candidate = file_path.resolve()
+        except Exception:
+            # Best effort: fall back to the unresolved path.
+            candidate = Path(str(file_path))
+        posix_path = candidate.as_posix()
+        return gitignore_matches(posix_path)
def _is_git_submodule(self, path: Path) -> bool:
"""Check if a path is a git submodule."""
@@ -396,7 +404,9 @@ Examples:
print(f" {current_path}")
print(" " + "β" * 45)
- current_indexes = self._discover_indexes_in_project(current_path)
+ current_indexes = self._discover_indexes_in_project(
+ current_path, exclude_dirs=other_projects
+ )
if current_indexes:
for idx in current_indexes:
total_indexes += 1
@@ -435,9 +445,14 @@ Examples:
print(" leann build my-docs --docs ./documents")
else:
# Count only projects that have at least one discoverable index
- projects_count = sum(
- 1 for p in valid_projects if len(self._discover_indexes_in_project(p)) > 0
- )
+ projects_count = 0
+ for p in valid_projects:
+ if p == current_path:
+ discovered = self._discover_indexes_in_project(p, exclude_dirs=other_projects)
+ else:
+ discovered = self._discover_indexes_in_project(p)
+ if len(discovered) > 0:
+ projects_count += 1
print(f"π Total: {total_indexes} indexes across {projects_count} projects")
if current_indexes_count > 0:
@@ -454,9 +469,22 @@ Examples:
print("\nπ‘ Create your first index:")
print(" leann build my-docs --docs ./documents")
- def _discover_indexes_in_project(self, project_path: Path):
- """Discover all indexes in a project directory (both CLI and apps formats)"""
+ def _discover_indexes_in_project(
+ self, project_path: Path, exclude_dirs: Optional[list[Path]] = None
+ ):
+ """Discover all indexes in a project directory (both CLI and apps formats)
+
+ exclude_dirs: when provided, skip any APP-format index files that are
+ located under these directories. This prevents duplicates when the
+ current project is a parent directory of other registered projects.
+ """
indexes = []
+ exclude_dirs = exclude_dirs or []
+ # normalize to resolved paths once for comparison
+ try:
+ exclude_dirs_resolved = [p.resolve() for p in exclude_dirs]
+ except Exception:
+ exclude_dirs_resolved = exclude_dirs
# 1. CLI format: .leann/indexes/index_name/
cli_indexes_dir = project_path / ".leann" / "indexes"
@@ -495,6 +523,17 @@ Examples:
continue
except Exception:
pass
+ # Skip meta files that live under excluded directories
+ try:
+ meta_parent_resolved = meta_file.parent.resolve()
+ if any(
+ meta_parent_resolved.is_relative_to(ex_dir)
+ for ex_dir in exclude_dirs_resolved
+ ):
+ continue
+ except Exception:
+ # best effort; if resolve or comparison fails, do not exclude
+ pass
# Use the parent directory name as the app index display name
display_name = meta_file.parent.name
# Extract file base used to store files
@@ -1022,7 +1061,8 @@ Examples:
# Try to use better PDF parsers first, but only if PDFs are requested
documents = []
- docs_path = Path(docs_dir)
+ # Use resolved absolute paths to avoid mismatches (symlinks, relative vs absolute)
+ docs_path = Path(docs_dir).resolve()
# Check if we should process PDFs
should_process_pdfs = custom_file_types is None or ".pdf" in custom_file_types
@@ -1031,10 +1071,15 @@ Examples:
for file_path in docs_path.rglob("*.pdf"):
# Check if file matches any exclude pattern
try:
+ # Ensure both paths are resolved before computing relativity
+ file_path_resolved = file_path.resolve()
+ # Determine directory scope using the non-resolved path to avoid
+ # misclassifying symlinked entries as outside the docs directory
relative_path = file_path.relative_to(docs_path)
if not include_hidden and _path_has_hidden_segment(relative_path):
continue
- if self._should_exclude_file(relative_path, gitignore_matches):
+ # Use absolute path for gitignore matching
+ if self._should_exclude_file(file_path_resolved, gitignore_matches):
continue
except ValueError:
# Skip files that can't be made relative to docs_path
@@ -1077,10 +1122,11 @@ Examples:
) -> bool:
"""Return True if file should be included (not excluded)"""
try:
- docs_path_obj = Path(docs_dir)
- file_path_obj = Path(file_path)
- relative_path = file_path_obj.relative_to(docs_path_obj)
- return not self._should_exclude_file(relative_path, gitignore_matches)
+ docs_path_obj = Path(docs_dir).resolve()
+ file_path_obj = Path(file_path).resolve()
+ # Use absolute path for gitignore matching
+ _ = file_path_obj.relative_to(docs_path_obj) # validate scope
+ return not self._should_exclude_file(file_path_obj, gitignore_matches)
except (ValueError, OSError):
return True # Include files that can't be processed
diff --git a/packages/leann-mcp/README.md b/packages/leann-mcp/README.md
index cf783d4..356065d 100644
--- a/packages/leann-mcp/README.md
+++ b/packages/leann-mcp/README.md
@@ -2,6 +2,8 @@
Transform your development workflow with intelligent code assistance using LEANN's semantic search directly in Claude Code.
+For agent-facing discovery details, see `llms.txt` in the repository root.
+
## Prerequisites
Install LEANN globally for MCP integration (with default backend):
diff --git a/packages/leann/pyproject.toml b/packages/leann/pyproject.toml
index 52d0bc9..41c54d0 100644
--- a/packages/leann/pyproject.toml
+++ b/packages/leann/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
[project]
name = "leann"
-version = "0.3.2"
+version = "0.3.3"
description = "LEANN - The smallest vector index in the world. RAG Everything with LEANN!"
readme = "README.md"
requires-python = ">=3.9"
diff --git a/pyproject.toml b/pyproject.toml
index 35e5613..a0d83bf 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -100,6 +100,7 @@ wechat-exporter = "wechat_exporter.main:main"
leann-core = { path = "packages/leann-core", editable = true }
leann-backend-diskann = { path = "packages/leann-backend-diskann", editable = true }
leann-backend-hnsw = { path = "packages/leann-backend-hnsw", editable = true }
+astchunk = { path = "packages/astchunk-leann", editable = true }
[tool.ruff]
target-version = "py39"
diff --git a/uv.lock b/uv.lock
index d01612b..28c5824 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1,5 +1,5 @@
version = 1
-revision = 3
+revision = 2
requires-python = ">=3.9"
resolution-markers = [
"python_full_version >= '3.12'",
@@ -201,7 +201,7 @@ wheels = [
[[package]]
name = "astchunk"
version = "0.1.0"
-source = { registry = "https://pypi.org/simple" }
+source = { editable = "packages/astchunk-leann" }
dependencies = [
{ name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" },
{ name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.10.*'" },
@@ -214,10 +214,31 @@ dependencies = [
{ name = "tree-sitter-python" },
{ name = "tree-sitter-typescript" },
]
-sdist = { url = "https://files.pythonhosted.org/packages/db/2a/7a35e2fac7d550265ae2ee40651425083b37555f921d1a1b77c3f525e0df/astchunk-0.1.0.tar.gz", hash = "sha256:f4dff0ef8b3b3bcfeac363384db1e153f74d4c825dc2e35864abfab027713be4", size = 18093, upload-time = "2025-06-19T04:37:25.34Z" }
-wheels = [
- { url = "https://files.pythonhosted.org/packages/be/84/5433ab0e933b572750cb16fd7edf3d6c7902b069461a22ec670042752a4d/astchunk-0.1.0-py3-none-any.whl", hash = "sha256:33ada9fc3620807fdda5846fa1948af463f281a60e0d43d4f3782b6dbb416d24", size = 15396, upload-time = "2025-06-19T04:37:23.87Z" },
+
+[package.metadata]
+requires-dist = [
+ { name = "black", marker = "extra == 'dev'", specifier = ">=22.0.0" },
+ { name = "flake8", marker = "extra == 'dev'", specifier = ">=5.0.0" },
+ { name = "isort", marker = "extra == 'dev'", specifier = ">=5.10.0" },
+ { name = "mypy", marker = "extra == 'dev'", specifier = ">=1.0.0" },
+ { name = "myst-parser", marker = "extra == 'docs'", specifier = ">=0.18.0" },
+ { name = "numpy", specifier = ">=1.20.0" },
+ { name = "pre-commit", marker = "extra == 'dev'", specifier = ">=2.20.0" },
+ { name = "pyrsistent", specifier = ">=0.18.0" },
+ { name = "pytest", marker = "extra == 'dev'", specifier = ">=7.0.0" },
+ { name = "pytest", marker = "extra == 'test'", specifier = ">=7.0.0" },
+ { name = "pytest-cov", marker = "extra == 'dev'", specifier = ">=4.0.0" },
+ { name = "pytest-cov", marker = "extra == 'test'", specifier = ">=4.0.0" },
+ { name = "pytest-xdist", marker = "extra == 'test'", specifier = ">=2.5.0" },
+ { name = "sphinx", marker = "extra == 'docs'", specifier = ">=5.0.0" },
+ { name = "sphinx-rtd-theme", marker = "extra == 'docs'", specifier = ">=1.0.0" },
+ { name = "tree-sitter", specifier = ">=0.20.0" },
+ { name = "tree-sitter-c-sharp", specifier = ">=0.20.0" },
+ { name = "tree-sitter-java", specifier = ">=0.20.0" },
+ { name = "tree-sitter-python", specifier = ">=0.20.0" },
+ { name = "tree-sitter-typescript", specifier = ">=0.20.0" },
]
+provides-extras = ["dev", "docs", "test"]
[[package]]
name = "asttokens"
@@ -2117,7 +2138,7 @@ wheels = [
[[package]]
name = "leann-backend-diskann"
-version = "0.3.2"
+version = "0.3.3"
source = { editable = "packages/leann-backend-diskann" }
dependencies = [
{ name = "leann-core" },
@@ -2129,14 +2150,14 @@ dependencies = [
[package.metadata]
requires-dist = [
- { name = "leann-core", specifier = "==0.3.2" },
+ { name = "leann-core", specifier = "==0.3.3" },
{ name = "numpy" },
{ name = "protobuf", specifier = ">=3.19.0" },
]
[[package]]
name = "leann-backend-hnsw"
-version = "0.3.2"
+version = "0.3.3"
source = { editable = "packages/leann-backend-hnsw" }
dependencies = [
{ name = "leann-core" },
@@ -2149,7 +2170,7 @@ dependencies = [
[package.metadata]
requires-dist = [
- { name = "leann-core", specifier = "==0.3.2" },
+ { name = "leann-core", specifier = "==0.3.3" },
{ name = "msgpack", specifier = ">=1.0.0" },
{ name = "numpy" },
{ name = "pyzmq", specifier = ">=23.0.0" },
@@ -2157,7 +2178,7 @@ requires-dist = [
[[package]]
name = "leann-core"
-version = "0.3.2"
+version = "0.3.3"
source = { editable = "packages/leann-core" }
dependencies = [
{ name = "accelerate" },
@@ -2298,7 +2319,7 @@ test = [
[package.metadata]
requires-dist = [
- { name = "astchunk", specifier = ">=0.1.0" },
+ { name = "astchunk", editable = "packages/astchunk-leann" },
{ name = "beautifulsoup4", marker = "extra == 'documents'", specifier = ">=4.13.0" },
{ name = "black", marker = "extra == 'dev'", specifier = ">=23.0" },
{ name = "boto3" },