Compare commits: feature/cl...feat/multi (26 commits)

| SHA1 |
|---|
| 2c3824e7b6 |
| b2390ccc14 |
| e8fca2c84a |
| 790ae14f69 |
| ac363072e6 |
| 93465af46c |
| 792ece67dc |
| 239e35e2e6 |
| 2fac0c6fbf |
| 9801aa581b |
| 5e97916608 |
| 8b9c2be8c9 |
| 3ff5aac8e0 |
| 67fef60466 |
| b6ab6f1993 |
| 9f2e82a838 |
| 0b2b799d5a |
| 0f790fbbd9 |
| 387ae21eba |
| 3cc329c3e7 |
| 5567302316 |
| 075d4bd167 |
| e4bcc76f88 |
| 710e83b1fd |
| c96d653072 |
| 8b22d2b5d3 |
.github/workflows/build-reusable.yml (vendored, 85 changes)
@@ -54,16 +54,26 @@ jobs:
           python: '3.12'
         - os: ubuntu-22.04
           python: '3.13'
-        - os: macos-latest
+        - os: macos-14
           python: '3.9'
-        - os: macos-latest
+        - os: macos-14
           python: '3.10'
-        - os: macos-latest
+        - os: macos-14
           python: '3.11'
-        - os: macos-latest
+        - os: macos-14
           python: '3.12'
-        - os: macos-latest
+        - os: macos-14
           python: '3.13'
+        - os: macos-13
+          python: '3.9'
+        - os: macos-13
+          python: '3.10'
+        - os: macos-13
+          python: '3.11'
+        - os: macos-13
+          python: '3.12'
+        # Note: macos-13 + Python 3.13 excluded due to PyTorch compatibility
+        # (PyTorch 2.5+ supports Python 3.13 but not Intel Mac x86_64)
     runs-on: ${{ matrix.os }}

     steps:
@@ -109,48 +119,59 @@ jobs:
             uv pip install --system delocate
           fi

+      - name: Set macOS environment variables
+        if: runner.os == 'macOS'
+        run: |
+          # Use brew --prefix to automatically detect Homebrew installation path
+          HOMEBREW_PREFIX=$(brew --prefix)
+          echo "HOMEBREW_PREFIX=${HOMEBREW_PREFIX}" >> $GITHUB_ENV
+          echo "OpenMP_ROOT=${HOMEBREW_PREFIX}/opt/libomp" >> $GITHUB_ENV
+
+          # Set CMAKE_PREFIX_PATH to let CMake find all packages automatically
+          echo "CMAKE_PREFIX_PATH=${HOMEBREW_PREFIX}" >> $GITHUB_ENV
+
+          # Set compiler flags for OpenMP (required for both backends)
+          echo "LDFLAGS=-L${HOMEBREW_PREFIX}/opt/libomp/lib" >> $GITHUB_ENV
+          echo "CPPFLAGS=-I${HOMEBREW_PREFIX}/opt/libomp/include" >> $GITHUB_ENV
+
       - name: Build packages
         run: |
           # Build core (platform independent)
-          if [[ "${{ matrix.os }}" == ubuntu-* ]]; then
-            cd packages/leann-core
-            uv build
-            cd ../..
-          fi
+          cd packages/leann-core
+          uv build
+          cd ../..

           # Build HNSW backend
           cd packages/leann-backend-hnsw
-          if [ "${{ matrix.os }}" == "macos-latest" ]; then
-            # Use system clang instead of homebrew LLVM for better compatibility
+          if [[ "${{ matrix.os }}" == macos-* ]]; then
+            # Use system clang for better compatibility
             export CC=clang
             export CXX=clang++
             export MACOSX_DEPLOYMENT_TARGET=11.0
-            uv build --wheel --python python
+            uv build --wheel --python ${{ matrix.python }} --find-links ${GITHUB_WORKSPACE}/packages/leann-core/dist
           else
-            uv build --wheel --python python
+            uv build --wheel --python ${{ matrix.python }} --find-links ${GITHUB_WORKSPACE}/packages/leann-core/dist
           fi
           cd ../..

           # Build DiskANN backend
           cd packages/leann-backend-diskann
-          if [ "${{ matrix.os }}" == "macos-latest" ]; then
-            # Use system clang instead of homebrew LLVM for better compatibility
+          if [[ "${{ matrix.os }}" == macos-* ]]; then
+            # Use system clang for better compatibility
             export CC=clang
             export CXX=clang++
             # DiskANN requires macOS 13.3+ for sgesdd_ LAPACK function
             export MACOSX_DEPLOYMENT_TARGET=13.3
-            uv build --wheel --python python
+            uv build --wheel --python ${{ matrix.python }} --find-links ${GITHUB_WORKSPACE}/packages/leann-core/dist
           else
-            uv build --wheel --python python
+            uv build --wheel --python ${{ matrix.python }} --find-links ${GITHUB_WORKSPACE}/packages/leann-core/dist
           fi
           cd ../..

           # Build meta package (platform independent)
-          if [[ "${{ matrix.os }}" == ubuntu-* ]]; then
-            cd packages/leann
-            uv build
-            cd ../..
-          fi
+          cd packages/leann
+          uv build
+          cd ../..

       - name: Repair wheels (Linux)
         if: runner.os == 'Linux'
@@ -199,20 +220,18 @@ jobs:
           echo "📦 Built packages:"
           find packages/*/dist -name "*.whl" -o -name "*.tar.gz" | sort

       - name: Install built packages for testing
         run: |
-          # Create a virtual environment
-          uv venv
+          # Create a virtual environment with the correct Python version
+          uv venv --python ${{ matrix.python }}
           source .venv/bin/activate || source .venv/Scripts/activate

-          # Install the built wheels
-          # Use --find-links to let uv choose the correct wheel for the platform
-          if [[ "${{ matrix.os }}" == ubuntu-* ]]; then
-            uv pip install leann-core --find-links packages/leann-core/dist
-            uv pip install leann --find-links packages/leann/dist
-          fi
-          uv pip install leann-backend-hnsw --find-links packages/leann-backend-hnsw/dist
-          uv pip install leann-backend-diskann --find-links packages/leann-backend-diskann/dist
+          # Install packages using --find-links to prioritize local builds
+          uv pip install --find-links packages/leann-core/dist --find-links packages/leann-backend-hnsw/dist --find-links packages/leann-backend-diskann/dist packages/leann-core/dist/*.whl || uv pip install --find-links packages/leann-core/dist packages/leann-core/dist/*.tar.gz
+          uv pip install --find-links packages/leann-core/dist packages/leann-backend-hnsw/dist/*.whl
+          uv pip install --find-links packages/leann-core/dist packages/leann-backend-diskann/dist/*.whl
+          uv pip install packages/leann/dist/*.whl || uv pip install packages/leann/dist/*.tar.gz

           # Install test dependencies using extras
           uv pip install -e ".[test]"
README.md (49 changes)
@@ -3,9 +3,11 @@
 </p>

 <p align="center">
-  <img src="https://img.shields.io/badge/Python-3.9%2B-blue.svg" alt="Python 3.9+">
+  <img src="https://img.shields.io/badge/Python-3.9%20%7C%203.10%20%7C%203.11%20%7C%203.12%20%7C%203.13-blue.svg" alt="Python Versions">
+  <img src="https://github.com/yichuan-w/LEANN/actions/workflows/build-and-publish.yml/badge.svg" alt="CI Status">
+  <img src="https://img.shields.io/badge/Platform-Ubuntu%20%7C%20macOS%20(ARM64%2FIntel)-lightgrey" alt="Platform">
   <img src="https://img.shields.io/badge/License-MIT-green.svg" alt="MIT License">
-  <img src="https://img.shields.io/badge/Platform-Linux%20%7C%20macOS-lightgrey" alt="Platform">
+  <img src="https://img.shields.io/badge/MCP-Native%20Integration-blue" alt="MCP Integration">
 </p>

 <h2 align="center" tabindex="-1" class="heading-element" dir="auto">
@@ -16,7 +18,10 @@ LEANN is an innovative vector database that democratizes personal AI. Transform

 LEANN achieves this through *graph-based selective recomputation* with *high-degree preserving pruning*, computing embeddings on-demand instead of storing them all. [Illustration Fig →](#️-architecture--how-it-works) | [Paper →](https://arxiv.org/abs/2506.08276)

-**Ready to RAG Everything?** Transform your laptop into a personal AI assistant that can search your **[file system](#-personal-data-manager-process-any-documents-pdf-txt-md)**, **[emails](#-your-personal-email-secretary-rag-on-apple-mail)**, **[browser history](#-time-machine-for-the-web-rag-your-entire-browser-history)**, **[chat history](#-wechat-detective-unlock-your-golden-memories)**, or external knowledge bases (i.e., 60M documents) - all on your laptop, with zero cloud costs and complete privacy.
+**Ready to RAG Everything?** Transform your laptop into a personal AI assistant that can semantically search your **[file system](#-personal-data-manager-process-any-documents-pdf-txt-md)**, **[emails](#-your-personal-email-secretary-rag-on-apple-mail)**, **[browser history](#-time-machine-for-the-web-rag-your-entire-browser-history)**, **[chat history](#-wechat-detective-unlock-your-golden-memories)**, **[codebase](#-claude-code-integration-transform-your-development-workflow)**\*, or external knowledge bases (e.g., 60M documents) - all on your laptop, with zero cloud costs and complete privacy.

+\* Claude Code only supports basic `grep`-style keyword search. **LEANN** is a drop-in **semantic search MCP service fully compatible with Claude Code**, unlocking intelligent retrieval without changing your workflow. 🔥 Check out [the easy setup →](packages/leann-mcp/README.md)

@@ -26,7 +31,7 @@ LEANN achieves this through *graph-based selective recomputation* with *high-deg
   <img src="assets/effects.png" alt="LEANN vs Traditional Vector DB Storage Comparison" width="70%">
 </p>

-> **The numbers speak for themselves:** Index 60 million Wikipedia chunks in just 6GB instead of 201GB. From emails to browser history, everything fits on your laptop. [See detailed benchmarks for different applications below ↓](#storage-comparison)
+> **The numbers speak for themselves:** Index 60 million text chunks in just 6GB instead of 201GB. From emails to browser history, everything fits on your laptop. [See detailed benchmarks for different applications below ↓](#storage-comparison)


 🔒 **Privacy:** Your data never leaves your laptop. No OpenAI, no cloud, no "terms of service".
@@ -185,8 +190,8 @@ All RAG examples share these common parameters. **Interactive mode** is availabl
 --force-rebuild          # Force rebuild index even if it exists

 # Embedding Parameters
---embedding-model MODEL  # e.g., facebook/contriever, text-embedding-3-small or mlx-community/multilingual-e5-base-mlx
+--embedding-model MODEL  # e.g., facebook/contriever, text-embedding-3-small, nomic-embed-text, or mlx-community/Qwen3-Embedding-0.6B-8bit
---embedding-mode MODE    # sentence-transformers, openai, or mlx
+--embedding-mode MODE    # sentence-transformers, openai, mlx, or ollama

 # LLM Parameters (Text generation models)
 --llm TYPE               # LLM backend: openai, ollama, or hf (default: openai)
@@ -219,7 +224,7 @@ Ask questions directly about your personal PDFs, documents, and any directory co
   <img src="videos/paper_clear.gif" alt="LEANN Document Search Demo" width="600">
 </p>

-The example below asks a question about summarizing our paper (uses default data in `data/`, which is a directory with diverse data sources: two papers, Pride and Prejudice, and a README in Chinese) and this is the **easiest example** to run here:
+The example below asks a question about summarizing our paper (it uses the default data in `data/`, a directory with diverse data sources: two papers, Pride and Prejudice, and a technical report about LLMs at Huawei, in Chinese), and it is the **easiest example** to run:

 ```bash
 source .venv/bin/activate  # Don't forget to activate the virtual environment
@@ -414,7 +419,26 @@ Once the index is built, you can ask questions like:

 </details>

+### 🚀 Claude Code Integration: Transform Your Development Workflow!
+
+**The future of code assistance is here.** Transform your development workflow with LEANN's native MCP integration for Claude Code. Index your entire codebase and get intelligent code assistance directly in your IDE.
+
+**Key features:**
+- 🔍 **Semantic code search** across your entire project
+- 📚 **Context-aware assistance** for debugging and development
+- 🚀 **Zero-config setup** with automatic language detection
+
+```bash
+# Install LEANN globally for MCP integration
+uv tool install leann-core
+
+# Setup is automatic - just start using Claude Code!
+```
+
+Try our fully agentic pipeline with auto query rewriting, semantic search planning, and more:
+
+
+
+**Ready to supercharge your coding?** [Complete Setup Guide →](packages/leann-mcp/README.md)
+
 ## 🖥️ Command Line Interface

@@ -428,7 +452,7 @@ source .venv/bin/activate
 leann --help
 ```

-**To make it globally available (recommended for daily use):**
+**To make it globally available:**
 ```bash
 # Install the LEANN CLI globally using uv tool
 uv tool install leann
@@ -437,13 +461,15 @@ uv tool install leann
 leann --help
 ```

+> **Note**: Global installation is required for Claude Code integration. The `leann_mcp` server depends on the globally available `leann` command.
+
 ### Usage Examples

 ```bash
-# Build an index from documents
-leann build my-docs --docs ./documents
+# Build from a specific directory; my-docs is the index name (you can also build from multiple directories or files)
+leann build my-docs --docs ./your_documents

 # Search your documents
 leann search my-docs "machine learning concepts"
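For readers who prefer the Python API over the CLI, the same build-and-search flow is available programmatically. A rough sketch, assuming the project-local `.leann/indexes/<name>/documents.leann` layout introduced in the `cli.py` changes below; the `LeannSearcher` constructor and `search` signature are assumptions and are not shown in this diff:

```python
from leann.api import LeannSearcher

# Index path layout mirrors `leann build my-docs --docs ./your_documents`
searcher = LeannSearcher(".leann/indexes/my-docs/documents.leann")

# Assumed signature: query text plus a result count
results = searcher.search("machine learning concepts", top_k=5)
for result in results:
    print(result)
```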
@@ -581,8 +607,9 @@ We welcome more contributors! Feel free to open issues or submit PRs.

 This work is done at [**Berkeley Sky Computing Lab**](https://sky.cs.berkeley.edu/).

----
+## Star History

+[](https://www.star-history.com/#yichuan-w/LEANN&Date)
 <p align="center">
   <strong>⭐ Star us on GitHub if Leann is useful for your research or applications!</strong>
 </p>
@@ -75,7 +75,7 @@ class BaseRAGExample(ABC):
             "--embedding-mode",
             type=str,
             default="sentence-transformers",
-            choices=["sentence-transformers", "openai", "mlx"],
+            choices=["sentence-transformers", "openai", "mlx", "ollama"],
             help="Embedding backend mode (default: sentence-transformers)",
         )

@@ -85,7 +85,7 @@ class BaseRAGExample(ABC):
             "--llm",
             type=str,
             default="openai",
-            choices=["openai", "ollama", "hf"],
+            choices=["openai", "ollama", "hf", "simulated"],
             help="LLM backend to use (default: openai)",
         )
         llm_group.add_argument(
BIN  assets/claude_code_leann.png  (new file, 73 KiB; binary not shown)
BIN  assets/mcp_leann.png  (new file, 224 KiB; binary not shown)
data/huawei_pangu.md (new file, 82 lines)
@@ -0,0 +1,82 @@
|
|||||||
|
# 盘古之殇:华为诺亚盘古大模型研发历程的心酸与黑暗
|
||||||
|
|
||||||
|
各位好,
|
||||||
|
|
||||||
|
我是一名盘古大模型团队,华为诺亚方舟实验室的员工。
|
||||||
|
|
||||||
|
首先为自证身份,列举一些细节:
|
||||||
|
|
||||||
|
1. 现诺亚主任,前算法应用部部长,后改名为小模型实验室的主任王云鹤。前诺亚主任:姚骏(大家称姚老师)。几个实验室主任:唐睿明(明哥,明队,已离职),尚利峰,张维(维哥),郝建业(郝老师),刘武龙(称呼为武龙所)等。其他骨干成员和专家陆续有很多人离职。
|
||||||
|
2. 我们隶属于“四野”这个组织。四野下属有许多纵队,基础语言大模型是四纵。王云鹤的小模型是十六纵队。我们参加过苏州的集结,有各种月份的时间节点。在苏州攻关会颁发任务令,需要在节点前达成目标。苏州集结会把各地的人员都集中在苏州研究所,平常住宾馆,比如在甪直的酒店,与家人孩子天各一方。
|
||||||
|
3. 在苏州集结的时候周六默认上班,非常辛苦,不过周六有下午茶,有一次还有小龙虾。在苏州研究所的工位搬迁过一次,从一栋楼换到了另一栋。苏州研究所楼栋都是欧式装修,门口有大坡,里面景色很不错。去苏州集结一般至少要去一周,甚至更久,多的人甚至一两个月都回不了家。
|
||||||
|
4. 诺亚曾经传说是研究型的,但是来了之后因为在四野做大模型项目,项目成员完全变成了交付型的,且充满了例会,评审,汇报。很多时候做实验都要申请。团队需要对接终端小艺,华为云,ICT等诸多业务线,交付压力不小。
|
||||||
|
5. 诺亚研发的盘古模型早期内部代号叫做“盘古智子”,一开始只有内部需要申请试用的网页版,到后续迫于压力在welink上接入和公测开放。
|
||||||
|
|
||||||
|
这些天发生关于质疑盘古大模型抄袭千问的事情闹的沸沸扬扬。作为一个盘古团队的成员,我最近夜夜辗转反侧,难以入眠。盘古的品牌受到如此大的影响,一方面,我自私的为我的职业发展担忧,也为自己过去的努力工作感到不值。另一方面,由于有人开始揭露这些事情我内心又感到大快人心。在多少个日日夜夜,我们对内部某些人一次次靠着造假而又获得了无数利益的行为咬牙切齿而又无能为力。这种压抑和羞辱也逐渐消磨了我对华为的感情,让我在这里的时日逐渐浑浑噩噩,迷茫无措,时常怀疑自己的人生和自我价值。
|
||||||
|
|
||||||
|
我承认我是一个懦弱的人,作为一个小小的打工人,我不仅不敢和王云鹤等内部手眼通天的人做对,更不敢和华为这样的庞然大物做对。我很怕失去我的工作,毕竟我也有家人和孩子,所以我打心眼里很佩服揭露者。但是,看到内部还在试图洗地掩盖事实,蒙蔽公众的时候,我实在不能容忍了。我也希望勇敢一次,顺从自己本心。就算自损八百,我也希望能伤敌一千。我决定把我在这里的所见所闻(部分来自于同事口述)公布出来,关于盘古大模型的“传奇故事”:
|
||||||
|
|
||||||
|
华为确实主要在昇腾卡上训练大模型(小模型实验室有不少英伟达的卡,他们之前也会用来训练,后面转移到昇腾)。曾经我被华为“打造世界第二选择”的决心而折服,我本身也曾经对华为有深厚的感情。我们陪着昇腾一步步摸爬滚打,从充满bug到现在能训出模型,付出了巨大的心血和代价。
|
||||||
|
|
||||||
|
最初我们的算力非常有限,在910A上训练模型。那会只支持fp16,训练的稳定性远不如bf16。盘古的moe开始很早,23年就主要是训练38Bmoe模型和后续的71B dense模型。71B的dense模型通过扩增变成了第一代的135Bdense模型,后面主力模型也逐渐在910B上训练。
|
||||||
|
|
||||||
|
71B和135B模型都有一个巨大的硬伤就是tokenizer。当时使用的tokenizer编码效率极低,每个单个的符号,数字,空格,乃至汉字都会占用一个token。可想而知这会非常浪费算力,且使得模型的效果很差。这时候小模型实验室正好有个自己训的词表。姚老师当时怀疑是不是模型的tokenizer不好(虽然事后来看,他的怀疑是无疑正确的),于是就决定,让71B和135B换tokenizer,因为小模型实验室曾经尝试过。团队缝合了两个tokenizer,开始了tokenizer的更换。71B模型的更换失败了,而135B因为采用了更精细的embedding初始化策略,续训了至少1T的数据后词表总算更换成功,但可想而知,效果并不会变好。
|
||||||
|
|
||||||
|
于此同期,阿里和智谱等国内其他公司在GPU上训练,且已经摸索出了正确的方法,盘古和竞品的差距越来越大。内部一个230B从头训练的dense模型又因为各种原因训练失败,导致项目的状况几乎陷入绝境。面临几个节点的压力以及内部对盘古的强烈质疑时,团队的士气低迷到了极点。团队在算力极其有限的时候,做出了很多努力和挣扎。比如,团队偶然发现当时的38B moe并没有预期moe的效果。于是去掉了moe参数,还原为了13B的dense模型。由于38B的moe源自很早的pangu alpha 13B,架构相对落后,团队进行了一系列的操作,比如切换绝对位置编码到rope,去掉bias,切换为rmsnorm。同时鉴于tokenizer的一些失败和换词表的经验,这个模型的词表也更换为了王云鹤的小模型实验室7B模型所使用的词表。后面这个13B模型进行了扩增续训,变成了第二代38B dense模型(在几个月内这个模型都是主要的盘古中档位模型),曾经具有一定的竞争力。但是,由于更大的135B模型架构落后,且更换词表模型损伤巨大(后续分析发现当时更换的缝合词表有更严重的bug),续训后也与千问等当时国内领先模型存在很大差距。这时由于内部的质疑声和领导的压力也越来越大。团队的状态几乎陷入了绝境。
|
||||||
|
|
||||||
|
在这种情况下,王云鹤和他的小模型实验室出手了。他们声称是从旧的135B参数继承改造而来,通过训练短短的几百B数据,各项指标平均提升了十个点左右。实际上,这就是他们套壳应用到大模型的第一次杰作。华为的外行领导内行,使得领导完全对于这种扯淡的事情没有概念,他们只会觉得肯定是有什么算法创新。经过内部的分析,他们实际上是使用Qwen 1.5 110B续训而来,通过加层,扩增ffn维度,添加盘古pi论文的一些机制得来,凑够了大概135B的参数。实际上,旧的135B有107层,而这个模型只有82层,各种配置也都不一样。新的来路不明的135B训练完很多参数的分布也和Qwen 110B几乎一模一样。连模型代码的类名当时都是Qwen,甚至懒得改名。后续这个模型就是所谓的135B V2。而这个模型当时也提供给了很多下游,甚至包括外部客户。
|
||||||
|
|
||||||
|
这件事对于我们这些认真诚实做事的同事们带来了巨大的冲击,内部很多人其实都知道这件事,甚至包括终端和华为云。我们都戏称以后别叫盘古模型了,叫千古吧。当时团队成员就想向bcg举报了,毕竟这已经是重大的业务造假了。但是后面据说被领导拦了下来,因为更高级别的领导(比如姚老师,以及可能熊总和查老)其实后面也知道了,但是并不管,因为通过套壳拿出好的结果,对他们也是有利的。这件事使得当时团队几位最强的同事开始心灰意冷,离职跑路也逐渐成为挂在嘴边的事。
|
||||||
|
|
||||||
|
此时,盘古似乎迎来了转机。由于前面所述的这些盘古模型基本都是续训和改造而来,当时诺亚完全没有掌握从头训练的技术,何况还是在昇腾的NPU上进行训练。在当时团队的核心成员的极力争取下,盘古开始了第三代模型的训练,付出了巨大的努力后,在数据架构和训练算法方面都与业界逐渐接轨,而这其中的艰辛和小模型实验室的人一点关系都没有。
|
||||||
|
|
||||||
|
一开始团队成员毫无信心,只从一个13B的模型开始训练,但是后面发现效果还不错,于是这个模型后续再次进行了一次参数扩增,变成了第三代的38B,代号38B V3。想必很多产品线的兄弟都对这个模型很熟悉。当时这个模型的tokenizer是基于llama的词表进行扩展的(也是业界常见的做法)。而当时王云鹤的实验室做出来了另一个词表(也就是后续pangu系列的词表)。当时两个词表还被迫进行了一次赛马,最终没有明显的好坏结论。于是,领导当即决定,应该统一词表,使用王云鹤他们的。于是,在后续从头训练的135B V3(也就是对外的Pangu Ultra),便是采用了这个tokenizer。这也解释了很多使用我们模型的兄弟的疑惑,为什么当时同为V3代的两个不同档位的模型,会使用不同的tokenizer。
|
||||||
|
|
||||||
|
|
||||||
|
我们打心眼里觉得,135B V3是我们四纵团队当时的骄傲。这是第一个真正意义上的,华为全栈自研,正经从头训练的千亿级别的模型,且效果与24年同期竞品可比的。写到这里我已经热泪盈眶,太不容易了。当时为了稳定训练,团队做了大量实验对比,并且多次在模型梯度出现异常的时候进行及时回退重启。这个模型真正做到了后面技术报告所说的训练全程没有一个loss spike。我们克服了不知道多少困难,我们做到了,我们愿用生命和荣誉保证这个模型训练的真实性。多少个凌晨,我们为了它的训练而不眠。在被内部心声骂的一文不值的时候,我们有多么不甘,有多少的委屈,我们挺住了。
|
||||||
|
|
||||||
|
我们这帮人是真的在为打磨国产算力底座燃烧自己的青春啊……客居他乡,我们放弃了家庭,放弃了假期,放弃了健康,放弃了娱乐,抛头颅洒热血,其中的艰辛与困苦,寥寥数笔不足以概括其万一。在各种动员大会上,当时口号中喊出的盘古必胜,华为必胜,我们心里是真的深深被感动。
|
||||||
|
|
||||||
|
然而,我们的所有辛苦的成果,经常被小模型实验室轻飘飘的拿走了。数据,直接要走。代码,直接要走,还要求我们配合适配到能一键运行。我们当时戏称小模型实验室为点鼠标实验室。我们付出辛苦,他们取得荣耀。果然应了那句话,你在负重前行是因为有人替你岁月静好。在这种情况下,越来越多的战友再也坚持不下去了,选择了离开。看到身边那些优秀的同事一个个离职,我的内心又感叹又难过。在这种作战一样的环境下,我们比起同事来说更像是战友。他们在技术上也有无数值得我学习的地方,堪称良师。看到他们去了诸如字节Seed,Deepseek,月之暗面,腾讯和快手等等很多出色的团队,我打心眼里为他们高兴和祝福,脱离了这个辛苦却肮脏的地方。我至今还对一位离职同事的话记忆犹新,ta说:“来这里是我技术生涯中的耻辱,在这里再呆每一天都是浪费生命”。话虽难听却让我无言以对。我担心我自己技术方面的积累不足,以及没法适应互联网公司高淘汰的环境,让我多次想离职的心始终没有迈出这一步。
|
||||||
|
|
||||||
|
盘古除了dense模型,后续也启动了moe的探索。一开始训练的是一个224B的moe模型。而与之平行的,小模型实验室也开启了第二次主要的套壳行动(次要的插曲可能还包括一些别的模型,比如math模型),即这次流传甚广的pangu pro moe 72B。这个模型内部自称是从小模型实验室的7B扩增上来的(就算如此,这也与技术报告不符,何况是套壳qwen 2.5的14b续训)。还记得他们训了没几天,内部的评测就立刻追上了当时的38B V3。AI系统实验室很多兄弟因为需要适配模型,都知道他们的套壳行动,只是迫于各种原因,无法伸张正义。实际上,对于后续训了很久很久的这个模型,Honestagi能够分析出这个量级的相似性我已经很诧异了,因为这个模型为了续训洗参数,所付出的算力甚至早就足够从头训一个同档位的模型了。听同事说他们为了洗掉千问的水印,采取了不少办法,甚至包括故意训了脏数据。这也为学术界研究模型血缘提供了一个前所未有的特殊模范吧。以后新的血缘方法提出可以拿出来溜溜。
|
||||||
|
|
||||||
|
24年底和25年初,在Deepseek v3和r1发布之后,由于其惊艳的技术水平,团队受到了巨大的冲击,也受到了更大的质疑。于是为了紧跟潮流,盘古模仿Deepseek的模型尺寸,开启了718B moe的训练。这个时候,小模型实验室再次出手了。他们选择了套壳Deepseekv3续训。他们通过冻住Deepseek加载的参数,进行训练。连任务加载ckpt的目录都是deepseekv3,改都不改,何其嚣张?与之相反,一些有真正技术信仰的同事,在从头训练另一个718B的moe。但其中出现了各种各样的问题。但是很显然,这个模型怎么可能比直接套壳的好呢?如果不是团队leader坚持,早就被叫停了。
|
||||||
|
|
||||||
|
华为的流程管理之繁重,严重拖累了大模型的研发节奏,例如版本管理,模型血缘,各种流程化,各种可追溯。讽刺的是,小模型实验室的模型似乎从来不受这些流程的约束,想套壳就套壳,想续训就续训,算力源源不断的伸手拿走。这种强烈到近乎魔幻的对比,说明了当前流程管理的情况:只许州官放火,不许百姓点灯。何其可笑?何其可悲?何其可恶?何其可耻!
|
||||||
|
|
||||||
|
HonestAGI的事情出来后,内部让大家不停的研讨分析,如何公关和“回应”。诚然,这个原文的分析也许不够有力,给了王云鹤与小模型实验室他们狡辩和颠倒黑白的机会。为此,这两天我内心感到作呕,时时怀疑自己的人生意义以及苍天无眼。我不奉陪了,我要离职了,同时我也在申请从盘古部分技术报告的作者名单中移除。曾经在这些技术报告上署名是我一生都无法抹除的污点。当时我没想到,他们竟然猖狂到敢开源。我没想到,他们敢如此愚弄世人,大肆宣发。当时,我也许是存了侥幸心理,没有拒绝署名。我相信很多扎实做事的战友,也只是被迫上了贼船,或者不知情。但这件事已经无法挽回,我希望我的余生能够坚持扎实做真正有意义的事,为我当时的软弱和不坚定赎罪。
|
||||||
|
|
||||||
|
深夜写到这里,我已经泪流满面,泣不成声。还记得一些出色的同事离职时,我苦笑问他们要不要发个长长的心声惯例帖,揭露一下现状。对方说:不了,浪费时间,而且我也怕揭露出来你们过的更糟。我当时一下黯然神伤,因为曾经共同为了理想奋斗过的战友已经彻底对华为彻底灰心了。当时大家调侃,我们用着当年共产党的小米加步枪,组织却有着堪比当年国民党的作风。
|
||||||
|
|
||||||
|
曾几何时,我为我们用着小米加步枪打败洋枪洋炮而自豪。
|
||||||
|
|
||||||
|
现在,我累了,我想投降。
|
||||||
|
|
||||||
|
其实时至今日,我还是真心希望华为能认真吸取教训,能做好盘古,把盘古做到世界一流,把昇腾变成英伟达的水平。内部的劣币驱逐良币,使得诺亚乃至华为在短时间内急剧流失了大量出色的大模型人才。相信他们也正在如Deepseek等各个团队闪耀着,施展着他们的抱负才华,为中美在AI的激烈竞赛中奉献力量。我时常感叹,华为不是没有人才,而是根本不知道怎么留住人才。如果给这些人合适的环境,合适的资源,更少的枷锁,更少的政治斗争,盘古何愁不成?
|
||||||
|
|
||||||
|
最后:我以生命,人格和荣誉发誓,我写的以上所有内容均为真实(至少在我有限的认知范围内)。我没有那么高的技术水平以及机会去做详尽扎实的分析,也不敢直接用内部记录举证,怕因为信息安全抓到。但是我相信我很多曾经的战友,会为我作证。在华为内部的兄弟,包括我们曾经服务过的产品线兄弟们,相信本文的无数细节能和你们的印象对照,印证我的说法。你们可能也曾经被蒙骗,但这些残酷的真相不会被尘封。我们奋战过的痕迹,也不应该被扭曲和埋葬。
|
||||||
|
|
||||||
|
写了这么多,某些人肯定想把我找出来,抹杀掉。公司搞不好也想让我噤声乃至追责。如果真的这样,我,乃至我的家人的人身乃至生命安全可能都会受到威胁。为了自我保护,我近期每天会跟大家报平安。
|
||||||
|
|
||||||
|
如果我消失了,就当是我为了真理和理想,为了华为乃至中国能够更好地发展算力和AI而牺牲了吧,我愿埋葬于那片曾经奋斗过的地方。
|
||||||
|
|
||||||
|
诺亚,再见
|
||||||
|
|
||||||
|
2025年7月6日凌晨 写于深圳
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
各位好,
|
||||||
|
|
||||||
|
感谢大家的关心与祝福。我目前暂时安全,但公司应该在进行排查与某些名单收集,后续情况未知。
|
||||||
|
|
||||||
|
我补充一些细节,以免某些人继续颠倒黑白。
|
||||||
|
|
||||||
|
关于135B V2,小模型实验室在迅速地完成套壳并拿完所有套壳带来的好处后(比如任务令表彰和及时激励),因为不想继续支撑下游应用和模型迭代,又把这个烫手山芋甩给了四纵。确实技高一筹,直接把四纵的兄弟们拉下水。同事提供过去一个老旧的模型,最终拿回了一个当时一个魔改的先进的千问。做大模型的人,自己做的模型就像自己孩子一样熟悉,不要把别人都当傻子。就像自家儿子出门一趟,回来个别人家孩子。
|
||||||
|
|
||||||
|
盘古report的署名是不符合学术规范的。例如,135B V3有不少有技术贡献的人,因为作者名额数量限制,劳动成果没有得到应有的回报,团队内曾经有不小的意见。这个模型当时是大家智慧和汗水的结晶,甚至是团队当时的精神支柱,支撑着不少兄弟们继续留在诺亚。所谓的名额限制,以及挂名了一些毫无技术贡献的人(如一些小模型实验室的人),让兄弟们何其心寒。
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
暂时平安。另外,支持我勇于说出真相的战友们 https://github.com/HW-whistleblower/True-Story-of-Pangu/issues/317
|
||||||
@@ -49,14 +49,25 @@ Based on our experience developing LEANN, embedding models fall into three categ
 - **Cons**: Slower inference, longer index build times
 - **Use when**: Quality is paramount and you have sufficient compute resources. **Highly recommended** for production use

-### Quick Start: OpenAI Embeddings (Fastest Setup)
+### Quick Start: Cloud and Local Embedding Options
+
+**OpenAI Embeddings (Fastest Setup)**
 For immediate testing without local model downloads:
 ```bash
 # Set OpenAI embeddings (requires OPENAI_API_KEY)
 --embedding-mode openai --embedding-model text-embedding-3-small
 ```

+**Ollama Embeddings (Privacy-Focused)**
+For local embeddings with complete privacy:
+```bash
+# First, pull an embedding model
+ollama pull nomic-embed-text
+
+# Use Ollama embeddings
+--embedding-mode ollama --embedding-model nomic-embed-text
+```
+
 <details>
 <summary><strong>Cloud vs Local Trade-offs</strong></summary>

|
|||||||
|
|
||||||
3. **Use MLX on Apple Silicon** (optional optimization):
|
3. **Use MLX on Apple Silicon** (optional optimization):
|
||||||
```bash
|
```bash
|
||||||
--embedding-mode mlx --embedding-model mlx-community/multilingual-e5-base-mlx
|
--embedding-mode mlx --embedding-model mlx-community/Qwen3-Embedding-0.6B-8bit
|
||||||
```
|
```
|
||||||
|
MLX might not be the best choice, as we tested and found that it only offers 1.3x acceleration compared to HF, so maybe using ollama is a better choice for embedding generation
|
||||||
|
|
||||||
|
4. **Use Ollama**
|
||||||
|
```bash
|
||||||
|
--embedding-mode ollama --embedding-model nomic-embed-text
|
||||||
|
```
|
||||||
|
To discover additional embedding models in ollama, check out https://ollama.com/search?c=embedding or read more about embedding models at https://ollama.com/blog/embedding-models, please do check the model size that works best for you
|
||||||
### If Search Quality is Poor
|
### If Search Quality is Poor
|
||||||
|
|
||||||
1. **Increase retrieval count**:
|
1. **Increase retrieval count**:
|
||||||
|
|||||||
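Before pointing `--embedding-mode ollama` at a model, it can help to confirm that the local Ollama server actually serves embeddings for it. A small sketch using the official `ollama` Python client, which is not a dependency of this repository and is only an assumption here:

```python
import ollama  # pip install ollama; talks to the local server on http://localhost:11434

# Request a single embedding for a test prompt with the model pulled above
resp = ollama.embeddings(model="nomic-embed-text", prompt="hello leann")
print(len(resp["embedding"]))  # embedding dimensionality, e.g. 768 for nomic-embed-text
```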
@@ -1,8 +0,0 @@
-# packages/leann-backend-diskann/CMakeLists.txt (simplified version)
-
-cmake_minimum_required(VERSION 3.20)
-project(leann_backend_diskann_wrapper)
-
-# Tell CMake to directly enter the DiskANN submodule and execute its own CMakeLists.txt
-# DiskANN will handle everything itself, including compiling Python bindings
-add_subdirectory(src/third_party/DiskANN)
@@ -4,7 +4,7 @@ import os
 import struct
 import sys
 from pathlib import Path
-from typing import Any, Literal
+from typing import Any, Literal, Optional

 import numpy as np
 import psutil
@@ -259,7 +259,7 @@ class DiskannSearcher(BaseSearcher):
        prune_ratio: float = 0.0,
        recompute_embeddings: bool = False,
        pruning_strategy: Literal["global", "local", "proportional"] = "global",
-        zmq_port: int | None = None,
+        zmq_port: Optional[int] = None,
        batch_recompute: bool = False,
        dedup_node_dis: bool = False,
        **kwargs,
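The `Optional[...]` rewrites in this hunk (and the matching ones in the HNSW backend and `api.py` below) exist because PEP 604 unions such as `int | None` are only valid as runtime annotations on Python 3.10+, while `leann-core` declares `requires-python = ">=3.9"`. A minimal sketch of the failure mode and the 3.9-safe spelling (the `connect` function is a made-up placeholder):

```python
from typing import Optional

# On Python 3.9 the PEP 604 form fails at import time, because the annotation
# is evaluated eagerly and `int | None` applies `|` to plain types:
#     def connect(zmq_port: int | None = None): ...
#     TypeError: unsupported operand type(s) for |: 'type' and 'NoneType'

# The spelling used throughout this change set works on 3.9:
def connect(zmq_port: Optional[int] = None) -> None:
    """Placeholder showing the 3.9-compatible annotation style."""

# `from __future__ import annotations` would also defer evaluation, but explicit
# Optional keeps the annotations introspectable at runtime.
```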
@@ -10,6 +10,7 @@ import sys
 import threading
 import time
 from pathlib import Path
+from typing import Optional

 import numpy as np
 import zmq
@@ -32,7 +33,7 @@ if not logger.handlers:


 def create_diskann_embedding_server(
-    passages_file: str | None = None,
+    passages_file: Optional[str] = None,
     zmq_port: int = 5555,
     model_name: str = "sentence-transformers/all-mpnet-base-v2",
     embedding_mode: str = "sentence-transformers",
@@ -261,7 +262,7 @@ if __name__ == "__main__":
         "--embedding-mode",
         type=str,
         default="sentence-transformers",
-        choices=["sentence-transformers", "openai", "mlx"],
+        choices=["sentence-transformers", "openai", "mlx", "ollama"],
         help="Embedding backend mode",
     )
     parser.add_argument(
@@ -4,8 +4,8 @@ build-backend = "scikit_build_core.build"

 [project]
 name = "leann-backend-diskann"
-version = "0.2.1"
+version = "0.2.7"
-dependencies = ["leann-core==0.2.1", "numpy", "protobuf>=3.19.0"]
+dependencies = ["leann-core==0.2.7", "numpy", "protobuf>=3.19.0"]

 [tool.scikit-build]
 # Key: simplified CMake path
@@ -17,3 +17,5 @@ editable.mode = "redirect"
 cmake.build-type = "Release"
 build.verbose = true
 build.tool-args = ["-j8"]
+# Let CMake find packages via Homebrew prefix
+cmake.define = {CMAKE_PREFIX_PATH = {env = "CMAKE_PREFIX_PATH"}, OpenMP_ROOT = {env = "OpenMP_ROOT"}}
Submodule packages/leann-backend-diskann/third_party/DiskANN updated: af2a26481e...04048bb302
@@ -5,11 +5,20 @@ set(CMAKE_CXX_COMPILER_WORKS 1)

 # Set OpenMP path for macOS
 if(APPLE)
-  set(OpenMP_C_FLAGS "-Xpreprocessor -fopenmp -I/opt/homebrew/opt/libomp/include")
-  set(OpenMP_CXX_FLAGS "-Xpreprocessor -fopenmp -I/opt/homebrew/opt/libomp/include")
+  # Detect Homebrew installation path (Apple Silicon vs Intel)
+  if(EXISTS "/opt/homebrew/opt/libomp")
+    set(HOMEBREW_PREFIX "/opt/homebrew")
+  elseif(EXISTS "/usr/local/opt/libomp")
+    set(HOMEBREW_PREFIX "/usr/local")
+  else()
+    message(FATAL_ERROR "Could not find libomp installation. Please install with: brew install libomp")
+  endif()
+
+  set(OpenMP_C_FLAGS "-Xpreprocessor -fopenmp -I${HOMEBREW_PREFIX}/opt/libomp/include")
+  set(OpenMP_CXX_FLAGS "-Xpreprocessor -fopenmp -I${HOMEBREW_PREFIX}/opt/libomp/include")
   set(OpenMP_C_LIB_NAMES "omp")
   set(OpenMP_CXX_LIB_NAMES "omp")
-  set(OpenMP_omp_LIBRARY "/opt/homebrew/opt/libomp/lib/libomp.dylib")
+  set(OpenMP_omp_LIBRARY "${HOMEBREW_PREFIX}/opt/libomp/lib/libomp.dylib")

   # Force use of system libc++ to avoid version mismatch
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -stdlib=libc++")
@@ -2,7 +2,7 @@ import logging
|
|||||||
import os
|
import os
|
||||||
import shutil
|
import shutil
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any, Literal
|
from typing import Any, Literal, Optional
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from leann.interface import (
|
from leann.interface import (
|
||||||
@@ -152,7 +152,7 @@ class HNSWSearcher(BaseSearcher):
|
|||||||
self,
|
self,
|
||||||
query: np.ndarray,
|
query: np.ndarray,
|
||||||
top_k: int,
|
top_k: int,
|
||||||
zmq_port: int | None = None,
|
zmq_port: Optional[int] = None,
|
||||||
complexity: int = 64,
|
complexity: int = 64,
|
||||||
beam_width: int = 1,
|
beam_width: int = 1,
|
||||||
prune_ratio: float = 0.0,
|
prune_ratio: float = 0.0,
|
||||||
|
|||||||
@@ -10,6 +10,7 @@ import sys
|
|||||||
import threading
|
import threading
|
||||||
import time
|
import time
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
from typing import Union
|
||||||
|
|
||||||
import msgpack
|
import msgpack
|
||||||
import numpy as np
|
import numpy as np
|
||||||
@@ -33,7 +34,7 @@ if not logger.handlers:
|
|||||||
|
|
||||||
|
|
||||||
def create_hnsw_embedding_server(
|
def create_hnsw_embedding_server(
|
||||||
passages_file: str | None = None,
|
passages_file: Union[str, None] = None,
|
||||||
zmq_port: int = 5555,
|
zmq_port: int = 5555,
|
||||||
model_name: str = "sentence-transformers/all-mpnet-base-v2",
|
model_name: str = "sentence-transformers/all-mpnet-base-v2",
|
||||||
distance_metric: str = "mips",
|
distance_metric: str = "mips",
|
||||||
@@ -295,7 +296,7 @@ if __name__ == "__main__":
|
|||||||
"--embedding-mode",
|
"--embedding-mode",
|
||||||
type=str,
|
type=str,
|
||||||
default="sentence-transformers",
|
default="sentence-transformers",
|
||||||
choices=["sentence-transformers", "openai", "mlx"],
|
choices=["sentence-transformers", "openai", "mlx", "ollama"],
|
||||||
help="Embedding backend mode",
|
help="Embedding backend mode",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -6,10 +6,10 @@ build-backend = "scikit_build_core.build"

 [project]
 name = "leann-backend-hnsw"
-version = "0.2.1"
+version = "0.2.7"
 description = "Custom-built HNSW (Faiss) backend for the Leann toolkit."
 dependencies = [
-    "leann-core==0.2.1",
+    "leann-core==0.2.7",
     "numpy",
     "pyzmq>=23.0.0",
     "msgpack>=1.0.0",
@@ -22,6 +22,8 @@ cmake.build-type = "Release"
 build.verbose = true
 build.tool-args = ["-j8"]

-# CMake definitions to optimize compilation
+# CMake definitions to optimize compilation and find Homebrew packages
 [tool.scikit-build.cmake.define]
 CMAKE_BUILD_PARALLEL_LEVEL = "8"
+CMAKE_PREFIX_PATH = {env = "CMAKE_PREFIX_PATH"}
+OpenMP_ROOT = {env = "OpenMP_ROOT"}
Submodule packages/leann-backend-hnsw/third_party/faiss updated: ff22e2c86b...4a2c0d67d3
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

 [project]
 name = "leann-core"
-version = "0.2.1"
+version = "0.2.7"
 description = "Core API and plugin system for LEANN"
 readme = "README.md"
 requires-python = ">=3.9"
@@ -31,8 +31,10 @@ dependencies = [
     "PyPDF2>=3.0.0",
     "pymupdf>=1.23.0",
     "pdfplumber>=0.10.0",
-    "mlx>=0.26.3; sys_platform == 'darwin'",
-    "mlx-lm>=0.26.0; sys_platform == 'darwin'",
+    "nbconvert>=7.0.0",  # For .ipynb file support
+    "gitignore-parser>=0.1.12",  # For proper .gitignore handling
+    "mlx>=0.26.3; sys_platform == 'darwin' and platform_machine == 'arm64'",
+    "mlx-lm>=0.26.0; sys_platform == 'darwin' and platform_machine == 'arm64'",
 ]

 [project.optional-dependencies]
@@ -44,6 +46,7 @@ colab = [

 [project.scripts]
 leann = "leann.cli:main"
+leann_mcp = "leann.mcp:main"

 [tool.setuptools.packages.find]
 where = ["src"]
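The new environment markers on the MLX dependencies are what keep `mlx`/`mlx-lm` off Intel Macs while still installing them on Apple Silicon. A small sketch using the `packaging` library (an assumption here; it is not listed in this diff) showing how the marker evaluates:

```python
from packaging.markers import Marker

marker = Marker("sys_platform == 'darwin' and platform_machine == 'arm64'")

# Apple Silicon macOS: the marker is true, so mlx and mlx-lm are installed
print(marker.evaluate({"sys_platform": "darwin", "platform_machine": "arm64"}))   # True

# Intel macOS: the marker is false, so the MLX packages are skipped
print(marker.evaluate({"sys_platform": "darwin", "platform_machine": "x86_64"}))  # False
```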
@@ -10,7 +10,7 @@ import time
|
|||||||
import warnings
|
import warnings
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any, Literal
|
from typing import Any, Literal, Optional
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
@@ -33,7 +33,7 @@ def compute_embeddings(
|
|||||||
model_name: str,
|
model_name: str,
|
||||||
mode: str = "sentence-transformers",
|
mode: str = "sentence-transformers",
|
||||||
use_server: bool = True,
|
use_server: bool = True,
|
||||||
port: int | None = None,
|
port: Optional[int] = None,
|
||||||
is_build=False,
|
is_build=False,
|
||||||
) -> np.ndarray:
|
) -> np.ndarray:
|
||||||
"""
|
"""
|
||||||
@@ -157,12 +157,12 @@ class LeannBuilder:
|
|||||||
self,
|
self,
|
||||||
backend_name: str,
|
backend_name: str,
|
||||||
embedding_model: str = "facebook/contriever",
|
embedding_model: str = "facebook/contriever",
|
||||||
dimensions: int | None = None,
|
dimensions: Optional[int] = None,
|
||||||
embedding_mode: str = "sentence-transformers",
|
embedding_mode: str = "sentence-transformers",
|
||||||
**backend_kwargs,
|
**backend_kwargs,
|
||||||
):
|
):
|
||||||
self.backend_name = backend_name
|
self.backend_name = backend_name
|
||||||
backend_factory: LeannBackendFactoryInterface | None = BACKEND_REGISTRY.get(backend_name)
|
backend_factory: Optional[LeannBackendFactoryInterface] = BACKEND_REGISTRY.get(backend_name)
|
||||||
if backend_factory is None:
|
if backend_factory is None:
|
||||||
raise ValueError(f"Backend '{backend_name}' not found or not registered.")
|
raise ValueError(f"Backend '{backend_name}' not found or not registered.")
|
||||||
self.backend_factory = backend_factory
|
self.backend_factory = backend_factory
|
||||||
@@ -242,7 +242,7 @@ class LeannBuilder:
|
|||||||
self.backend_kwargs = backend_kwargs
|
self.backend_kwargs = backend_kwargs
|
||||||
self.chunks: list[dict[str, Any]] = []
|
self.chunks: list[dict[str, Any]] = []
|
||||||
|
|
||||||
def add_text(self, text: str, metadata: dict[str, Any] | None = None):
|
def add_text(self, text: str, metadata: Optional[dict[str, Any]] = None):
|
||||||
if metadata is None:
|
if metadata is None:
|
||||||
metadata = {}
|
metadata = {}
|
||||||
passage_id = metadata.get("id", str(len(self.chunks)))
|
passage_id = metadata.get("id", str(len(self.chunks)))
|
||||||
@@ -554,7 +554,7 @@ class LeannSearcher:
|
|||||||
if "labels" in results and "distances" in results:
|
if "labels" in results and "distances" in results:
|
||||||
logger.info(f" Processing {len(results['labels'][0])} passage IDs:")
|
logger.info(f" Processing {len(results['labels'][0])} passage IDs:")
|
||||||
for i, (string_id, dist) in enumerate(
|
for i, (string_id, dist) in enumerate(
|
||||||
zip(results["labels"][0], results["distances"][0], strict=False)
|
zip(results["labels"][0], results["distances"][0])
|
||||||
):
|
):
|
||||||
try:
|
try:
|
||||||
passage_data = self.passage_manager.get_passage(string_id)
|
passage_data = self.passage_manager.get_passage(string_id)
|
||||||
@@ -592,7 +592,7 @@ class LeannChat:
|
|||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
index_path: str,
|
index_path: str,
|
||||||
llm_config: dict[str, Any] | None = None,
|
llm_config: Optional[dict[str, Any]] = None,
|
||||||
enable_warmup: bool = False,
|
enable_warmup: bool = False,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
):
|
):
|
||||||
@@ -608,7 +608,7 @@ class LeannChat:
|
|||||||
prune_ratio: float = 0.0,
|
prune_ratio: float = 0.0,
|
||||||
recompute_embeddings: bool = True,
|
recompute_embeddings: bool = True,
|
||||||
pruning_strategy: Literal["global", "local", "proportional"] = "global",
|
pruning_strategy: Literal["global", "local", "proportional"] = "global",
|
||||||
llm_kwargs: dict[str, Any] | None = None,
|
llm_kwargs: Optional[dict[str, Any]] = None,
|
||||||
expected_zmq_port: int = 5557,
|
expected_zmq_port: int = 5557,
|
||||||
**search_kwargs,
|
**search_kwargs,
|
||||||
):
|
):
|
||||||
|
|||||||
@@ -8,7 +8,7 @@ import difflib
|
|||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from typing import Any
|
from typing import Any, Optional
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
@@ -17,12 +17,12 @@ logging.basicConfig(level=logging.INFO)
|
|||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
def check_ollama_models() -> list[str]:
|
def check_ollama_models(host: str) -> list[str]:
|
||||||
"""Check available Ollama models and return a list"""
|
"""Check available Ollama models and return a list"""
|
||||||
try:
|
try:
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
response = requests.get("http://localhost:11434/api/tags", timeout=5)
|
response = requests.get(f"{host}/api/tags", timeout=5)
|
||||||
if response.status_code == 200:
|
if response.status_code == 200:
|
||||||
data = response.json()
|
data = response.json()
|
||||||
return [model["name"] for model in data.get("models", [])]
|
return [model["name"] for model in data.get("models", [])]
|
||||||
@@ -309,10 +309,12 @@ def search_hf_models(query: str, limit: int = 10) -> list[str]:
|
|||||||
return search_hf_models_fuzzy(query, limit)
|
return search_hf_models_fuzzy(query, limit)
|
||||||
|
|
||||||
|
|
||||||
def validate_model_and_suggest(model_name: str, llm_type: str) -> str | None:
|
def validate_model_and_suggest(
|
||||||
|
model_name: str, llm_type: str, host: str = "http://localhost:11434"
|
||||||
|
) -> Optional[str]:
|
||||||
"""Validate model name and provide suggestions if invalid"""
|
"""Validate model name and provide suggestions if invalid"""
|
||||||
if llm_type == "ollama":
|
if llm_type == "ollama":
|
||||||
available_models = check_ollama_models()
|
available_models = check_ollama_models(host)
|
||||||
if available_models and model_name not in available_models:
|
if available_models and model_name not in available_models:
|
||||||
error_msg = f"Model '{model_name}' not found in your local Ollama installation."
|
error_msg = f"Model '{model_name}' not found in your local Ollama installation."
|
||||||
|
|
||||||
@@ -469,7 +471,7 @@ class OllamaChat(LLMInterface):
|
|||||||
requests.get(host)
|
requests.get(host)
|
||||||
|
|
||||||
# Pre-check model availability with helpful suggestions
|
# Pre-check model availability with helpful suggestions
|
||||||
model_error = validate_model_and_suggest(model, "ollama")
|
model_error = validate_model_and_suggest(model, "ollama", host)
|
||||||
if model_error:
|
if model_error:
|
||||||
raise ValueError(model_error)
|
raise ValueError(model_error)
|
||||||
|
|
||||||
@@ -683,7 +685,7 @@ class HFChat(LLMInterface):
|
|||||||
class OpenAIChat(LLMInterface):
|
class OpenAIChat(LLMInterface):
|
||||||
"""LLM interface for OpenAI models."""
|
"""LLM interface for OpenAI models."""
|
||||||
|
|
||||||
def __init__(self, model: str = "gpt-4o", api_key: str | None = None):
|
def __init__(self, model: str = "gpt-4o", api_key: Optional[str] = None):
|
||||||
self.model = model
|
self.model = model
|
||||||
self.api_key = api_key or os.getenv("OPENAI_API_KEY")
|
self.api_key = api_key or os.getenv("OPENAI_API_KEY")
|
||||||
|
|
||||||
@@ -759,7 +761,7 @@ class SimulatedChat(LLMInterface):
|
|||||||
return "This is a simulated answer from the LLM based on the retrieved context."
|
return "This is a simulated answer from the LLM based on the retrieved context."
|
||||||
|
|
||||||
|
|
||||||
def get_llm(llm_config: dict[str, Any] | None = None) -> LLMInterface:
|
def get_llm(llm_config: Optional[dict[str, Any]] = None) -> LLMInterface:
|
||||||
"""
|
"""
|
||||||
Factory function to get an LLM interface based on configuration.
|
Factory function to get an LLM interface based on configuration.
|
||||||
|
|
||||||
|
|||||||
@@ -1,9 +1,11 @@
|
|||||||
import argparse
|
import argparse
|
||||||
import asyncio
|
import asyncio
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
from typing import Union
|
||||||
|
|
||||||
from llama_index.core import SimpleDirectoryReader
|
from llama_index.core import SimpleDirectoryReader
|
||||||
from llama_index.core.node_parser import SentenceSplitter
|
from llama_index.core.node_parser import SentenceSplitter
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
from .api import LeannBuilder, LeannChat, LeannSearcher
|
from .api import LeannBuilder, LeannChat, LeannSearcher
|
||||||
|
|
||||||
@@ -41,13 +43,23 @@ def extract_pdf_text_with_pdfplumber(file_path: str) -> str:
|
|||||||
|
|
||||||
class LeannCLI:
|
class LeannCLI:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.indexes_dir = Path.home() / ".leann" / "indexes"
|
# Always use project-local .leann directory (like .git)
|
||||||
|
self.indexes_dir = Path.cwd() / ".leann" / "indexes"
|
||||||
self.indexes_dir.mkdir(parents=True, exist_ok=True)
|
self.indexes_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
# Default parser for documents
|
||||||
self.node_parser = SentenceSplitter(
|
self.node_parser = SentenceSplitter(
|
||||||
chunk_size=256, chunk_overlap=128, separator=" ", paragraph_separator="\n\n"
|
chunk_size=256, chunk_overlap=128, separator=" ", paragraph_separator="\n\n"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Code-optimized parser
|
||||||
|
self.code_parser = SentenceSplitter(
|
||||||
|
chunk_size=512, # Larger chunks for code context
|
||||||
|
chunk_overlap=50, # Less overlap to preserve function boundaries
|
||||||
|
separator="\n", # Split by lines for code
|
||||||
|
paragraph_separator="\n\n", # Preserve logical code blocks
|
||||||
|
)
|
||||||
|
|
||||||
def get_index_path(self, index_name: str) -> str:
|
def get_index_path(self, index_name: str) -> str:
|
||||||
index_dir = self.indexes_dir / index_name
|
index_dir = self.indexes_dir / index_name
|
||||||
return str(index_dir / "documents.leann")
|
return str(index_dir / "documents.leann")
|
||||||
@@ -64,10 +76,14 @@ class LeannCLI:
|
|||||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||||
epilog="""
|
epilog="""
|
||||||
Examples:
|
Examples:
|
||||||
leann build my-docs --docs ./documents # Build index named my-docs
|
leann build my-docs --docs ./documents # Build index from directory
|
||||||
leann search my-docs "query" # Search in my-docs index
|
leann build my-code --docs ./src ./tests ./config # Build index from multiple directories
|
||||||
leann ask my-docs "question" # Ask my-docs index
|
leann build my-files --docs ./file1.py ./file2.txt ./docs/ # Build index from files and directories
|
||||||
leann list # List all stored indexes
|
leann build my-mixed --docs ./readme.md ./src/ ./config.json # Build index from mixed files/dirs
|
||||||
|
leann build my-ppts --docs ./ --file-types .pptx,.pdf # Index only PowerPoint and PDF files
|
||||||
|
leann search my-docs "query" # Search in my-docs index
|
||||||
|
leann ask my-docs "question" # Ask my-docs index
|
||||||
|
leann list # List all stored indexes
|
||||||
""",
|
""",
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -75,18 +91,38 @@ Examples:
|
|||||||
|
|
||||||
# Build command
|
# Build command
|
||||||
build_parser = subparsers.add_parser("build", help="Build document index")
|
build_parser = subparsers.add_parser("build", help="Build document index")
|
||||||
build_parser.add_argument("index_name", help="Index name")
|
build_parser.add_argument(
|
||||||
build_parser.add_argument("--docs", type=str, required=True, help="Documents directory")
|
"index_name", nargs="?", help="Index name (default: current directory name)"
|
||||||
|
)
|
||||||
|
build_parser.add_argument(
|
||||||
|
"--docs",
|
||||||
|
type=str,
|
||||||
|
nargs="+",
|
||||||
|
default=["."],
|
||||||
|
help="Documents directories and/or files (default: current directory)",
|
||||||
|
)
|
||||||
build_parser.add_argument(
|
build_parser.add_argument(
|
||||||
"--backend", type=str, default="hnsw", choices=["hnsw", "diskann"]
|
"--backend", type=str, default="hnsw", choices=["hnsw", "diskann"]
|
||||||
)
|
)
|
||||||
build_parser.add_argument("--embedding-model", type=str, default="facebook/contriever")
|
build_parser.add_argument("--embedding-model", type=str, default="facebook/contriever")
|
||||||
|
build_parser.add_argument(
|
||||||
|
"--embedding-mode",
|
||||||
|
type=str,
|
||||||
|
default="sentence-transformers",
|
||||||
|
choices=["sentence-transformers", "openai", "mlx", "ollama"],
|
||||||
|
help="Embedding backend mode (default: sentence-transformers)",
|
||||||
|
)
|
||||||
build_parser.add_argument("--force", "-f", action="store_true", help="Force rebuild")
|
build_parser.add_argument("--force", "-f", action="store_true", help="Force rebuild")
|
||||||
build_parser.add_argument("--graph-degree", type=int, default=32)
|
build_parser.add_argument("--graph-degree", type=int, default=32)
|
||||||
build_parser.add_argument("--complexity", type=int, default=64)
|
build_parser.add_argument("--complexity", type=int, default=64)
|
||||||
build_parser.add_argument("--num-threads", type=int, default=1)
|
build_parser.add_argument("--num-threads", type=int, default=1)
|
||||||
build_parser.add_argument("--compact", action="store_true", default=True)
|
build_parser.add_argument("--compact", action="store_true", default=True)
|
||||||
build_parser.add_argument("--recompute", action="store_true", default=True)
|
build_parser.add_argument("--recompute", action="store_true", default=True)
|
||||||
|
build_parser.add_argument(
|
||||||
|
"--file-types",
|
||||||
|
type=str,
|
||||||
|
help="Comma-separated list of file extensions to include (e.g., '.txt,.pdf,.pptx'). If not specified, uses default supported types.",
|
||||||
|
)
|
||||||
|
|
||||||
# Search command
|
# Search command
|
||||||
search_parser = subparsers.add_parser("search", help="Search documents")
|
search_parser = subparsers.add_parser("search", help="Search documents")
|
||||||
@@ -96,7 +132,12 @@ Examples:
|
|||||||
search_parser.add_argument("--complexity", type=int, default=64)
|
search_parser.add_argument("--complexity", type=int, default=64)
|
||||||
search_parser.add_argument("--beam-width", type=int, default=1)
|
search_parser.add_argument("--beam-width", type=int, default=1)
|
||||||
search_parser.add_argument("--prune-ratio", type=float, default=0.0)
|
search_parser.add_argument("--prune-ratio", type=float, default=0.0)
|
||||||
search_parser.add_argument("--recompute-embeddings", action="store_true")
|
search_parser.add_argument(
|
||||||
|
"--recompute-embeddings",
|
||||||
|
action="store_true",
|
||||||
|
default=True,
|
||||||
|
help="Recompute embeddings (default: True)",
|
||||||
|
)
|
||||||
search_parser.add_argument(
|
search_parser.add_argument(
|
||||||
"--pruning-strategy",
|
"--pruning-strategy",
|
||||||
choices=["global", "local", "proportional"],
|
choices=["global", "local", "proportional"],
|
||||||
@@ -119,7 +160,12 @@ Examples:
|
|||||||
ask_parser.add_argument("--complexity", type=int, default=32)
|
ask_parser.add_argument("--complexity", type=int, default=32)
|
||||||
ask_parser.add_argument("--beam-width", type=int, default=1)
|
ask_parser.add_argument("--beam-width", type=int, default=1)
|
||||||
ask_parser.add_argument("--prune-ratio", type=float, default=0.0)
|
ask_parser.add_argument("--prune-ratio", type=float, default=0.0)
|
||||||
ask_parser.add_argument("--recompute-embeddings", action="store_true")
|
ask_parser.add_argument(
|
||||||
|
"--recompute-embeddings",
|
||||||
|
action="store_true",
|
||||||
|
default=True,
|
||||||
|
help="Recompute embeddings (default: True)",
|
||||||
|
)
|
||||||
ask_parser.add_argument(
|
ask_parser.add_argument(
|
||||||
"--pruning-strategy",
|
"--pruning-strategy",
|
||||||
choices=["global", "local", "proportional"],
|
choices=["global", "local", "proportional"],
|
||||||
@@ -138,82 +184,473 @@ Examples:

        return parser

+   def register_project_dir(self):
+       """Register current project directory in global registry"""
+       global_registry = Path.home() / ".leann" / "projects.json"
+       global_registry.parent.mkdir(exist_ok=True)
+
+       current_dir = str(Path.cwd())
+
+       # Load existing registry
+       projects = []
+       if global_registry.exists():
+           try:
+               import json
+
+               with open(global_registry) as f:
+                   projects = json.load(f)
+           except Exception:
+               projects = []
+
+       # Add current directory if not already present
+       if current_dir not in projects:
+           projects.append(current_dir)
+
+       # Save registry
+       import json
+
+       with open(global_registry, "w") as f:
+           json.dump(projects, f, indent=2)
+
+   def _build_gitignore_parser(self, docs_dir: str):
+       """Build gitignore parser using gitignore-parser library."""
+       from gitignore_parser import parse_gitignore
+
+       # Try to parse the root .gitignore
+       gitignore_path = Path(docs_dir) / ".gitignore"
+
+       if gitignore_path.exists():
+           try:
+               # gitignore-parser automatically handles all subdirectory .gitignore files!
+               matches = parse_gitignore(str(gitignore_path))
+               print(f"📋 Loaded .gitignore from {docs_dir} (includes all subdirectories)")
+               return matches
+           except Exception as e:
+               print(f"Warning: Could not parse .gitignore: {e}")
+       else:
+           print("📋 No .gitignore found")
+
+       # Fallback: basic pattern matching for essential files
+       essential_patterns = {".git", ".DS_Store", "__pycache__", "node_modules", ".venv", "venv"}
+
+       def basic_matches(file_path):
+           path_parts = Path(file_path).parts
+           return any(part in essential_patterns for part in path_parts)
+
+       return basic_matches
+
+   def _should_exclude_file(self, relative_path: Path, gitignore_matches) -> bool:
+       """Check if a file should be excluded using gitignore parser."""
+       return gitignore_matches(str(relative_path))
+
+   def _is_git_submodule(self, path: Path) -> bool:
+       """Check if a path is a git submodule."""
+       try:
+           # Find the git repo root
+           current_dir = Path.cwd()
+           while current_dir != current_dir.parent:
+               if (current_dir / ".git").exists():
+                   gitmodules_path = current_dir / ".gitmodules"
+                   if gitmodules_path.exists():
+                       # Read .gitmodules to check if this path is a submodule
+                       gitmodules_content = gitmodules_path.read_text()
+                       # Convert path to relative to git root
+                       try:
+                           relative_path = path.resolve().relative_to(current_dir)
+                           # Check if this path appears in .gitmodules
+                           return f"path = {relative_path}" in gitmodules_content
+                       except ValueError:
+                           # Path is not under git root
+                           return False
+                   break
+               current_dir = current_dir.parent
+           return False
+       except Exception:
+           # If anything goes wrong, assume it's not a submodule
+           return False
+
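The `_build_gitignore_parser` helper above leans on the gitignore-parser package; a minimal standalone sketch of the same call (the paths are placeholders):

```python
from gitignore_parser import parse_gitignore

# parse_gitignore returns a callable matcher; per the comment in the diff above,
# subdirectory .gitignore files under the root are honored as well.
matches = parse_gitignore("/path/to/project/.gitignore")
print(matches("/path/to/project/node_modules/lib.js"))  # True if a pattern ignores it
print(matches("/path/to/project/src/main.py"))          # False otherwise
```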
    def list_indexes(self):
        print("Stored LEANN indexes:")

-       if not self.indexes_dir.exists():
-           print("No indexes found. Use 'leann build <name> --docs <dir>' to create one.")
+       # Get all project directories with .leann
+       global_registry = Path.home() / ".leann" / "projects.json"
+       all_projects = []
+
+       if global_registry.exists():
+           try:
+               import json
+
+               with open(global_registry) as f:
+                   all_projects = json.load(f)
+           except Exception:
+               pass
+
+       # Filter to only existing directories with .leann
+       valid_projects = []
+       for project_dir in all_projects:
+           project_path = Path(project_dir)
+           if project_path.exists() and (project_path / ".leann" / "indexes").exists():
+               valid_projects.append(project_path)
+
+       # Add current project if it has .leann but not in registry
+       current_path = Path.cwd()
+       if (current_path / ".leann" / "indexes").exists() and current_path not in valid_projects:
+           valid_projects.append(current_path)
+
+       if not valid_projects:
+           print(
+               "No indexes found. Use 'leann build <name> --docs <dir> [<dir2> ...]' to create one."
+           )
            return

-       index_dirs = [d for d in self.indexes_dir.iterdir() if d.is_dir()]
-
-       if not index_dirs:
-           print("No indexes found. Use 'leann build <name> --docs <dir>' to create one.")
-           return
-
-       print(f"Found {len(index_dirs)} indexes:")
-       for i, index_dir in enumerate(index_dirs, 1):
-           index_name = index_dir.name
-           status = "✓" if self.index_exists(index_name) else "✗"
-           print(f" {i}. {index_name} [{status}]")
-           if self.index_exists(index_name):
-               index_dir / "documents.leann.meta.json"
-               size_mb = sum(f.stat().st_size for f in index_dir.iterdir() if f.is_file()) / (
-                   1024 * 1024
-               )
-               print(f" Size: {size_mb:.1f} MB")
-
-       if index_dirs:
-           example_name = index_dirs[0].name
-           print("\nUsage:")
-           print(f' leann search {example_name} "your query"')
-           print(f" leann ask {example_name} --interactive")
-
-   def load_documents(self, docs_dir: str):
-       print(f"Loading documents from {docs_dir}...")
-
-       # Try to use better PDF parsers first
-       documents = []
-       docs_path = Path(docs_dir)
-
-       for file_path in docs_path.rglob("*.pdf"):
-           print(f"Processing PDF: {file_path}")
-
-           # Try PyMuPDF first (best quality)
-           text = extract_pdf_text_with_pymupdf(str(file_path))
-           if text is None:
-               # Try pdfplumber
-               text = extract_pdf_text_with_pdfplumber(str(file_path))
-
-           if text:
-               # Create a simple document structure
-               from llama_index.core import Document
-
-               doc = Document(text=text, metadata={"source": str(file_path)})
-               documents.append(doc)
-           else:
-               # Fallback to default reader
-               print(f"Using default reader for {file_path}")
-               default_docs = SimpleDirectoryReader(
-                   str(file_path.parent),
-                   filename_as_id=True,
-                   required_exts=[file_path.suffix],
-               ).load_data()
-               documents.extend(default_docs)
-
-       # Load other file types with default reader
-       other_docs = SimpleDirectoryReader(
-           docs_dir,
-           recursive=True,
-           encoding="utf-8",
-           required_exts=[".txt", ".md", ".docx"],
-       ).load_data(show_progress=True)
-       documents.extend(other_docs)
+       total_indexes = 0
+       current_dir = Path.cwd()
+
+       for project_path in valid_projects:
+           indexes_dir = project_path / ".leann" / "indexes"
+           if not indexes_dir.exists():
+               continue
+
+           index_dirs = [d for d in indexes_dir.iterdir() if d.is_dir()]
+           if not index_dirs:
+               continue
+
+           # Show project header
+           if project_path == current_dir:
+               print(f"\n📁 Current project ({project_path}):")
+           else:
+               print(f"\n📂 {project_path}:")
+
+           for index_dir in index_dirs:
+               total_indexes += 1
+               index_name = index_dir.name
+               meta_file = index_dir / "documents.leann.meta.json"
+               status = "✓" if meta_file.exists() else "✗"
+
+               print(f" {total_indexes}. {index_name} [{status}]")
+               if status == "✓":
+                   size_mb = sum(f.stat().st_size for f in index_dir.iterdir() if f.is_file()) / (
+                       1024 * 1024
+                   )
+                   print(f" Size: {size_mb:.1f} MB")
+
+       if total_indexes > 0:
+           print(f"\nTotal: {total_indexes} indexes across {len(valid_projects)} projects")
+           print("\nUsage (current project only):")
+
+           # Show example from current project
+           current_indexes_dir = current_dir / ".leann" / "indexes"
+           if current_indexes_dir.exists():
+               current_index_dirs = [d for d in current_indexes_dir.iterdir() if d.is_dir()]
+               if current_index_dirs:
+                   example_name = current_index_dirs[0].name
+                   print(f' leann search {example_name} "your query"')
+                   print(f" leann ask {example_name} --interactive")
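The registry consulted above is just a JSON list of project roots under `~/.leann/projects.json`; a small sketch of reading it (the example paths are illustrative):

```python
import json
from pathlib import Path

registry = Path.home() / ".leann" / "projects.json"
projects = json.loads(registry.read_text()) if registry.exists() else []
print(projects)  # e.g. ["/home/alice/work/my-repo", "/home/alice/notes"]
```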
+   def load_documents(
+       self, docs_paths: Union[str, list], custom_file_types: Union[str, None] = None
+   ):
+       # Handle both single path (string) and multiple paths (list) for backward compatibility
+       if isinstance(docs_paths, str):
+           docs_paths = [docs_paths]
+
+       # Separate files and directories
+       files = []
+       directories = []
+       for path in docs_paths:
+           path_obj = Path(path)
+           if path_obj.is_file():
+               files.append(str(path_obj))
+           elif path_obj.is_dir():
+               # Check if this is a git submodule - if so, skip it
+               if self._is_git_submodule(path_obj):
+                   print(f"⚠️ Skipping git submodule: {path}")
+                   continue
+               directories.append(str(path_obj))
+           else:
+               print(f"⚠️ Warning: Path '{path}' does not exist, skipping...")
+               continue
+
+       # Print summary of what we're processing
+       total_items = len(files) + len(directories)
+       items_desc = []
+       if files:
+           items_desc.append(f"{len(files)} file{'s' if len(files) > 1 else ''}")
+       if directories:
+           items_desc.append(
+               f"{len(directories)} director{'ies' if len(directories) > 1 else 'y'}"
+           )
+
+       print(f"Loading documents from {' and '.join(items_desc)} ({total_items} total):")
+       if files:
+           print(f" 📄 Files: {', '.join([Path(f).name for f in files])}")
+       if directories:
+           print(f" 📁 Directories: {', '.join(directories)}")
+
+       if custom_file_types:
+           print(f"Using custom file types: {custom_file_types}")
+
+       all_documents = []
+
+       # First, process individual files if any
+       if files:
+           print(f"\n🔄 Processing {len(files)} individual file{'s' if len(files) > 1 else ''}...")
+
+           # Load individual files using SimpleDirectoryReader with input_files
+           # Note: We skip gitignore filtering for explicitly specified files
+           try:
+               # Group files by their parent directory for efficient loading
+               from collections import defaultdict
+
+               files_by_dir = defaultdict(list)
+               for file_path in files:
+                   parent_dir = str(Path(file_path).parent)
+                   files_by_dir[parent_dir].append(file_path)
+
+               # Load files from each parent directory
+               for parent_dir, file_list in files_by_dir.items():
+                   print(
+                       f" Loading {len(file_list)} file{'s' if len(file_list) > 1 else ''} from {parent_dir}"
+                   )
+                   try:
+                       file_docs = SimpleDirectoryReader(
+                           parent_dir,
+                           input_files=file_list,
+                           filename_as_id=True,
+                       ).load_data()
+                       all_documents.extend(file_docs)
+                       print(
+                           f" ✅ Loaded {len(file_docs)} document{'s' if len(file_docs) > 1 else ''}"
+                       )
+                   except Exception as e:
+                       print(f" ❌ Warning: Could not load files from {parent_dir}: {e}")
+
+           except Exception as e:
+               print(f"❌ Error processing individual files: {e}")
+
+       # Define file extensions to process
+       if custom_file_types:
+           # Parse custom file types from comma-separated string
+           code_extensions = [ext.strip() for ext in custom_file_types.split(",") if ext.strip()]
+           # Ensure extensions start with a dot
+           code_extensions = [ext if ext.startswith(".") else f".{ext}" for ext in code_extensions]
+       else:
+           # Use default supported file types
+           code_extensions = [
+               # Original document types
+               ".txt", ".md", ".docx", ".pptx",
+               # Code files for Claude Code integration
+               ".py", ".js", ".ts", ".jsx", ".tsx", ".java", ".cpp", ".c", ".h", ".hpp",
+               ".cs", ".go", ".rs", ".rb", ".php", ".swift", ".kt", ".scala", ".r", ".sql",
+               ".sh", ".bash", ".zsh", ".fish", ".ps1", ".bat",
+               # Config and markup files
+               ".json", ".yaml", ".yml", ".xml", ".toml", ".ini", ".cfg", ".conf",
+               ".html", ".css", ".scss", ".less", ".vue", ".svelte",
+               # Data science
+               ".ipynb", ".R", ".py", ".jl",
+           ]
+
+       # Process each directory
+       if directories:
+           print(
+               f"\n🔄 Processing {len(directories)} director{'ies' if len(directories) > 1 else 'y'}..."
+           )
+
+           for docs_dir in directories:
+               print(f"Processing directory: {docs_dir}")
+               # Build gitignore parser for each directory
+               gitignore_matches = self._build_gitignore_parser(docs_dir)
+
+               # Try to use better PDF parsers first, but only if PDFs are requested
+               documents = []
+               docs_path = Path(docs_dir)
+
+               # Check if we should process PDFs
+               should_process_pdfs = custom_file_types is None or ".pdf" in custom_file_types
+
+               if should_process_pdfs:
+                   for file_path in docs_path.rglob("*.pdf"):
+                       # Check if file matches any exclude pattern
+                       try:
+                           relative_path = file_path.relative_to(docs_path)
+                           if self._should_exclude_file(relative_path, gitignore_matches):
+                               continue
+                       except ValueError:
+                           # Skip files that can't be made relative to docs_path
+                           print(f"⚠️ Skipping file outside directory scope: {file_path}")
+                           continue
+
+                       print(f"Processing PDF: {file_path}")
+
+                       # Try PyMuPDF first (best quality)
+                       text = extract_pdf_text_with_pymupdf(str(file_path))
+                       if text is None:
+                           # Try pdfplumber
+                           text = extract_pdf_text_with_pdfplumber(str(file_path))
+
+                       if text:
+                           # Create a simple document structure
+                           from llama_index.core import Document
+
+                           doc = Document(text=text, metadata={"source": str(file_path)})
+                           documents.append(doc)
+                       else:
+                           # Fallback to default reader
+                           print(f"Using default reader for {file_path}")
+                           try:
+                               default_docs = SimpleDirectoryReader(
+                                   str(file_path.parent),
+                                   filename_as_id=True,
+                                   required_exts=[file_path.suffix],
+                               ).load_data()
+                               documents.extend(default_docs)
+                           except Exception as e:
+                               print(f"Warning: Could not process {file_path}: {e}")
+
+               # Load other file types with default reader
+               try:
+                   # Create a custom file filter function using our PathSpec
+                   def file_filter(
+                       file_path: str, docs_dir=docs_dir, gitignore_matches=gitignore_matches
+                   ) -> bool:
+                       """Return True if file should be included (not excluded)"""
+                       try:
+                           docs_path_obj = Path(docs_dir)
+                           file_path_obj = Path(file_path)
+                           relative_path = file_path_obj.relative_to(docs_path_obj)
+                           return not self._should_exclude_file(relative_path, gitignore_matches)
+                       except (ValueError, OSError):
+                           return True  # Include files that can't be processed
+
+                   other_docs = SimpleDirectoryReader(
+                       docs_dir,
+                       recursive=True,
+                       encoding="utf-8",
+                       required_exts=code_extensions,
+                       file_extractor={},  # Use default extractors
+                       filename_as_id=True,
+                   ).load_data(show_progress=True)
+
+                   # Filter documents after loading based on gitignore rules
+                   filtered_docs = []
+                   for doc in other_docs:
+                       file_path = doc.metadata.get("file_path", "")
+                       if file_filter(file_path):
+                           filtered_docs.append(doc)
+
+                   documents.extend(filtered_docs)
+               except ValueError as e:
+                   if "No files found" in str(e):
+                       print(f"No additional files found for other supported types in {docs_dir}.")
+                   else:
+                       raise e
+
+               all_documents.extend(documents)
+               print(f"Loaded {len(documents)} documents from {docs_dir}")
+
+       documents = all_documents
+
        all_texts = []
-       for doc in documents:
-           nodes = self.node_parser.get_nodes_from_documents([doc])
+
+       # Define code file extensions for intelligent chunking
+       code_file_exts = {
+           ".py", ".js", ".ts", ".jsx", ".tsx", ".java", ".cpp", ".c", ".h", ".hpp",
+           ".cs", ".go", ".rs", ".rb", ".php", ".swift", ".kt", ".scala", ".r", ".sql",
+           ".sh", ".bash", ".zsh", ".fish", ".ps1", ".bat", ".json", ".yaml", ".yml",
+           ".xml", ".toml", ".ini", ".cfg", ".conf", ".html", ".css", ".scss", ".less",
+           ".vue", ".svelte", ".ipynb", ".R", ".jl",
+       }
+
+       print("start chunking documents")
+       # Add progress bar for document chunking
+       for doc in tqdm(documents, desc="Chunking documents", unit="doc"):
+           # Check if this is a code file based on source path
+           source_path = doc.metadata.get("source", "")
+           is_code_file = any(source_path.endswith(ext) for ext in code_file_exts)
+
+           # Use appropriate parser based on file type
+           parser = self.code_parser if is_code_file else self.node_parser
+           nodes = parser.get_nodes_from_documents([doc])
+
            for node in nodes:
                all_texts.append(node.get_content())
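For reference, a condensed sketch of the two llama_index loading modes `load_documents` uses above; the paths are placeholders:

```python
from llama_index.core import SimpleDirectoryReader

# Explicitly listed files: pass input_files (gitignore filtering is skipped for these).
file_docs = SimpleDirectoryReader(
    "./docs", input_files=["./docs/intro.md"], filename_as_id=True
).load_data()

# A whole directory: recurse and restrict to a set of extensions.
dir_docs = SimpleDirectoryReader(
    "./src", recursive=True, required_exts=[".py", ".md"], filename_as_id=True
).load_data(show_progress=True)
print(len(file_docs), len(dir_docs))
```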
@@ -221,16 +658,36 @@ Examples:
        return all_texts

    async def build_index(self, args):
-       docs_dir = args.docs
-       index_name = args.index_name
+       docs_paths = args.docs
+       # Use current directory name if index_name not provided
+       if args.index_name:
+           index_name = args.index_name
+       else:
+           index_name = Path.cwd().name
+           print(f"Using current directory name as index: '{index_name}'")
+
        index_dir = self.indexes_dir / index_name
        index_path = self.get_index_path(index_name)

+       # Display all paths being indexed with file/directory distinction
+       files = [p for p in docs_paths if Path(p).is_file()]
+       directories = [p for p in docs_paths if Path(p).is_dir()]
+
+       print(f"📂 Indexing {len(docs_paths)} path{'s' if len(docs_paths) > 1 else ''}:")
+       if files:
+           print(f" 📄 Files ({len(files)}):")
+           for i, file_path in enumerate(files, 1):
+               print(f" {i}. {Path(file_path).resolve()}")
+       if directories:
+           print(f" 📁 Directories ({len(directories)}):")
+           for i, dir_path in enumerate(directories, 1):
+               print(f" {i}. {Path(dir_path).resolve()}")
+
        if index_dir.exists() and not args.force:
            print(f"Index '{index_name}' already exists. Use --force to rebuild.")
            return

-       all_texts = self.load_documents(docs_dir)
+       all_texts = self.load_documents(docs_paths, args.file_types)
        if not all_texts:
            print("No documents found")
            return
@@ -242,6 +699,7 @@ Examples:
        builder = LeannBuilder(
            backend_name=args.backend,
            embedding_model=args.embedding_model,
+           embedding_mode=args.embedding_mode,
            graph_degree=args.graph_degree,
            complexity=args.complexity,
            is_compact=args.compact,
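A hedged sketch of the builder call these keyword arguments feed into. Only the keyword names above and the `build_index(index_path)` call in the next hunk come from this diff; the import path, the chunk-adding step, and the concrete values are assumptions:

```python
from leann.api import LeannBuilder  # import path is an assumption

builder = LeannBuilder(
    backend_name="hnsw",                      # illustrative values
    embedding_model="all-MiniLM-L6-v2",
    embedding_mode="sentence-transformers",
    graph_degree=32,
    complexity=64,
    is_compact=True,
)
# ... add the chunked texts here (the adding API is not shown in this hunk) ...
builder.build_index("./.leann/indexes/demo/documents.leann")
```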
@@ -255,6 +713,9 @@ Examples:
        builder.build_index(index_path)
        print(f"Index built at {index_path}")

+       # Register this project directory in global registry
+       self.register_project_dir()
+
    async def search_documents(self, args):
        index_name = args.index_name
        query = args.query
@@ -262,7 +723,7 @@ Examples:

        if not self.index_exists(index_name):
            print(
-               f"Index '{index_name}' not found. Use 'leann build {index_name} --docs <dir>' to create it."
+               f"Index '{index_name}' not found. Use 'leann build {index_name} --docs <dir> [<dir2> ...]' to create it."
            )
            return

@@ -289,7 +750,7 @@ Examples:

        if not self.index_exists(index_name):
            print(
-               f"Index '{index_name}' not found. Use 'leann build {index_name} --docs <dir>' to create it."
+               f"Index '{index_name}' not found. Use 'leann build {index_name} --docs <dir> [<dir2> ...]' to create it."
            )
            return

@@ -35,7 +35,7 @@ def compute_embeddings(
    Args:
        texts: List of texts to compute embeddings for
        model_name: Model name
-       mode: Computation mode ('sentence-transformers', 'openai', 'mlx')
+       mode: Computation mode ('sentence-transformers', 'openai', 'mlx', 'ollama')
        is_build: Whether this is a build operation (shows progress bar)
        batch_size: Batch size for processing
        adaptive_optimization: Whether to use adaptive optimization based on batch size
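A hedged usage sketch of the dispatcher documented above, once the new mode from the next hunk lands; argument names follow the docstring, and the model name is illustrative:

```python
embeddings = compute_embeddings(
    ["LEANN keeps the index tiny", "Ollama runs the embedder locally"],
    "nomic-embed-text",
    mode="ollama",
    is_build=True,
)
print(embeddings.shape)  # (2, embedding_dim)
```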
@@ -55,6 +55,8 @@ def compute_embeddings(
        return compute_embeddings_openai(texts, model_name)
    elif mode == "mlx":
        return compute_embeddings_mlx(texts, model_name)
+   elif mode == "ollama":
+       return compute_embeddings_ollama(texts, model_name, is_build=is_build)
    else:
        raise ValueError(f"Unsupported embedding mode: {mode}")

@@ -365,3 +367,286 @@ def compute_embeddings_mlx(chunks: list[str], model_name: str, batch_size: int =
    # Stack numpy arrays
    return np.stack(all_embeddings)


+def compute_embeddings_ollama(
+    texts: list[str], model_name: str, is_build: bool = False, host: str = "http://localhost:11434"
+) -> np.ndarray:
+    """
+    Compute embeddings using Ollama API with simplified batch processing.
+
+    Uses batch size of 32 for MPS/CPU and 128 for CUDA to optimize performance.
+
+    Args:
+        texts: List of texts to compute embeddings for
+        model_name: Ollama model name (e.g., "nomic-embed-text", "mxbai-embed-large")
+        is_build: Whether this is a build operation (shows progress bar)
+        host: Ollama host URL (default: http://localhost:11434)
+
+    Returns:
+        Normalized embeddings array, shape: (len(texts), embedding_dim)
+    """
+    try:
+        import requests
+    except ImportError:
+        raise ImportError(
+            "The 'requests' library is required for Ollama embeddings. Install with: uv pip install requests"
+        )
+
+    if not texts:
+        raise ValueError("Cannot compute embeddings for empty text list")
+
+    logger.info(
+        f"Computing embeddings for {len(texts)} texts using Ollama API, model: '{model_name}'"
+    )
+
+    # Check if Ollama is running
+    try:
+        response = requests.get(f"{host}/api/version", timeout=5)
+        response.raise_for_status()
+    except requests.exceptions.ConnectionError:
+        error_msg = (
+            f"❌ Could not connect to Ollama at {host}.\n\n"
+            "Please ensure Ollama is running:\n"
+            " • macOS/Linux: ollama serve\n"
+            " • Windows: Make sure Ollama is running in the system tray\n\n"
+            "Installation: https://ollama.com/download"
+        )
+        raise RuntimeError(error_msg)
+    except Exception as e:
+        raise RuntimeError(f"Unexpected error connecting to Ollama: {e}")
+
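For reference, one round-trip to the endpoint this function wraps; it assumes a local Ollama with `nomic-embed-text` already pulled:

```python
import requests

resp = requests.post(
    "http://localhost:11434/api/embeddings",
    json={"model": "nomic-embed-text", "prompt": "hello world"},
    timeout=30,
)
resp.raise_for_status()
vector = resp.json()["embedding"]
print(len(vector))  # dimensionality is model-dependent, e.g. 768
```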
+    # Check if model exists and provide helpful suggestions
+    try:
+        response = requests.get(f"{host}/api/tags", timeout=5)
+        response.raise_for_status()
+        models = response.json()
+        model_names = [model["name"] for model in models.get("models", [])]
+
+        # Filter for embedding models (models that support embeddings)
+        embedding_models = []
+        suggested_embedding_models = [
+            "nomic-embed-text",
+            "mxbai-embed-large",
+            "bge-m3",
+            "all-minilm",
+            "snowflake-arctic-embed",
+        ]
+
+        for model in model_names:
+            # Check if it's an embedding model (by name patterns or known models)
+            base_name = model.split(":")[0]
+            if any(emb in base_name for emb in ["embed", "bge", "minilm", "e5"]):
+                embedding_models.append(model)
+
+        # Check if model exists (handle versioned names) and resolve to full name
+        resolved_model_name = None
+        for name in model_names:
+            # Exact match
+            if model_name == name:
+                resolved_model_name = name
+                break
+            # Match without version tag (use the versioned name)
+            elif model_name == name.split(":")[0]:
+                resolved_model_name = name
+                break
+
+        if not resolved_model_name:
+            error_msg = f"❌ Model '{model_name}' not found in local Ollama.\n\n"
+
+            # Suggest pulling the model
+            error_msg += "📦 To install this embedding model:\n"
+            error_msg += f" ollama pull {model_name}\n\n"
+
+            # Show available embedding models
+            if embedding_models:
+                error_msg += "✅ Available embedding models:\n"
+                for model in embedding_models[:5]:
+                    error_msg += f" • {model}\n"
+                if len(embedding_models) > 5:
+                    error_msg += f" ... and {len(embedding_models) - 5} more\n"
+            else:
+                error_msg += "💡 Popular embedding models to install:\n"
+                for model in suggested_embedding_models[:3]:
+                    error_msg += f" • ollama pull {model}\n"
+
+            error_msg += "\n📚 Browse more: https://ollama.com/library"
+            raise ValueError(error_msg)
+
+        # Use the resolved model name for all subsequent operations
+        if resolved_model_name != model_name:
+            logger.info(f"Resolved model name '{model_name}' to '{resolved_model_name}'")
+            model_name = resolved_model_name
+
+        # Verify the model supports embeddings by testing it
+        try:
+            test_response = requests.post(
+                f"{host}/api/embeddings", json={"model": model_name, "prompt": "test"}, timeout=10
+            )
+            if test_response.status_code != 200:
+                error_msg = (
+                    f"⚠️ Model '{model_name}' exists but may not support embeddings.\n\n"
+                    f"Please use an embedding model like:\n"
+                )
+                for model in suggested_embedding_models[:3]:
+                    error_msg += f" • {model}\n"
+                raise ValueError(error_msg)
+        except requests.exceptions.RequestException:
+            # If test fails, continue anyway - model might still work
+            pass
+
+    except requests.exceptions.RequestException as e:
+        logger.warning(f"Could not verify model existence: {e}")
+
+    # Determine batch size based on device availability
+    # Check for CUDA/MPS availability using torch if available
+    batch_size = 32  # Default for MPS/CPU
+    try:
+        import torch
+
+        if torch.cuda.is_available():
+            batch_size = 128  # CUDA gets larger batch size
+        elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
+            batch_size = 32  # MPS gets smaller batch size
+    except ImportError:
+        # If torch is not available, use conservative batch size
+        batch_size = 32
+
+    logger.info(f"Using batch size: {batch_size}")
+
+    def get_batch_embeddings(batch_texts):
+        """Get embeddings for a batch of texts."""
+        all_embeddings = []
+        failed_indices = []
+
+        for i, text in enumerate(batch_texts):
+            max_retries = 3
+            retry_count = 0
+
+            # Truncate very long texts to avoid API issues
+            truncated_text = text[:8000] if len(text) > 8000 else text
+            while retry_count < max_retries:
+                try:
+                    response = requests.post(
+                        f"{host}/api/embeddings",
+                        json={"model": model_name, "prompt": truncated_text},
+                        timeout=30,
+                    )
+                    response.raise_for_status()
+
+                    result = response.json()
+                    embedding = result.get("embedding")
+
+                    if embedding is None:
+                        raise ValueError(f"No embedding returned for text {i}")
+
+                    if not isinstance(embedding, list) or len(embedding) == 0:
+                        raise ValueError(f"Invalid embedding format for text {i}")
+
+                    all_embeddings.append(embedding)
+                    break
+
+                except requests.exceptions.Timeout:
+                    retry_count += 1
+                    if retry_count >= max_retries:
+                        logger.warning(f"Timeout for text {i} after {max_retries} retries")
+                        failed_indices.append(i)
+                        all_embeddings.append(None)
+                        break
+
+                except Exception as e:
+                    retry_count += 1
+                    if retry_count >= max_retries:
+                        logger.error(f"Failed to get embedding for text {i}: {e}")
+                        failed_indices.append(i)
+                        all_embeddings.append(None)
+                        break
+        return all_embeddings, failed_indices
+
+    # Process texts in batches
+    all_embeddings = []
+    all_failed_indices = []
+
+    # Setup progress bar if needed
+    show_progress = is_build or len(texts) > 10
+    try:
+        if show_progress:
+            from tqdm import tqdm
+    except ImportError:
+        show_progress = False
+
+    # Process batches
+    num_batches = (len(texts) + batch_size - 1) // batch_size
+
+    if show_progress:
+        batch_iterator = tqdm(range(num_batches), desc="Computing Ollama embeddings")
+    else:
+        batch_iterator = range(num_batches)
+
+    for batch_idx in batch_iterator:
+        start_idx = batch_idx * batch_size
+        end_idx = min(start_idx + batch_size, len(texts))
+        batch_texts = texts[start_idx:end_idx]
+
+        batch_embeddings, batch_failed = get_batch_embeddings(batch_texts)
+
+        # Adjust failed indices to global indices
+        global_failed = [start_idx + idx for idx in batch_failed]
+        all_failed_indices.extend(global_failed)
+        all_embeddings.extend(batch_embeddings)
+
+    # Handle failed embeddings
+    if all_failed_indices:
+        if len(all_failed_indices) == len(texts):
+            raise RuntimeError("Failed to compute any embeddings")
+
+        logger.warning(
+            f"Failed to compute embeddings for {len(all_failed_indices)}/{len(texts)} texts"
+        )
+
+        # Use zero embeddings as fallback for failed ones
+        valid_embedding = next((e for e in all_embeddings if e is not None), None)
+        if valid_embedding:
+            embedding_dim = len(valid_embedding)
+            for i, embedding in enumerate(all_embeddings):
+                if embedding is None:
+                    all_embeddings[i] = [0.0] * embedding_dim
+
+    # Remove None values
+    all_embeddings = [e for e in all_embeddings if e is not None]
+
+    if not all_embeddings:
+        raise RuntimeError("No valid embeddings were computed")
+
+    # Validate embedding dimensions
+    expected_dim = len(all_embeddings[0])
+    inconsistent_dims = []
+    for i, embedding in enumerate(all_embeddings):
+        if len(embedding) != expected_dim:
+            inconsistent_dims.append((i, len(embedding)))
+
+    if inconsistent_dims:
+        error_msg = f"Ollama returned inconsistent embedding dimensions. Expected {expected_dim}, but got:\n"
+        for idx, dim in inconsistent_dims[:10]:  # Show first 10 inconsistent ones
+            error_msg += f" - Text {idx}: {dim} dimensions\n"
+        if len(inconsistent_dims) > 10:
+            error_msg += f" ... and {len(inconsistent_dims) - 10} more\n"
+        error_msg += f"\nThis is likely an Ollama API bug with model '{model_name}'. Please try:\n"
+        error_msg += "1. Restart Ollama service: 'ollama serve'\n"
+        error_msg += f"2. Re-pull the model: 'ollama pull {model_name}'\n"
+        error_msg += (
+            "3. Use sentence-transformers instead: --embedding-mode sentence-transformers\n"
+        )
+        error_msg += "4. Report this issue to Ollama: https://github.com/ollama/ollama/issues"
+        raise ValueError(error_msg)
+
+    # Convert to numpy array and normalize
+    embeddings = np.array(all_embeddings, dtype=np.float32)
+
+    # Normalize embeddings (L2 normalization)
+    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
+    embeddings = embeddings / (norms + 1e-8)  # Add small epsilon to avoid division by zero
+
+    logger.info(f"Generated {len(embeddings)} embeddings, dimension: {embeddings.shape[1]}")
+
+    return embeddings
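Usage sketch for the function above (a local Ollama is assumed): the rows come back L2-normalized, so a plain dot product already gives cosine similarity.

```python
import numpy as np

embs = compute_embeddings_ollama(["query text", "document text"], "nomic-embed-text")
print(embs.shape)                                                  # (2, embedding_dim)
print(np.allclose(np.linalg.norm(embs, axis=1), 1.0, atol=1e-3))   # rows are unit length
print(float(embs[0] @ embs[1]))                                    # cosine similarity
```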
@@ -6,6 +6,7 @@ import subprocess
import sys
import time
from pathlib import Path
+from typing import Optional

import psutil

@@ -182,8 +183,8 @@ class EmbeddingServerManager:
            e.g., "leann_backend_diskann.embedding_server"
        """
        self.backend_module_name = backend_module_name
-       self.server_process: subprocess.Popen | None = None
-       self.server_port: int | None = None
+       self.server_process: Optional[subprocess.Popen] = None
+       self.server_port: Optional[int] = None
        self._atexit_registered = False

    def start_server(
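The `Optional` rewrites in this and the following hunks matter because the project now targets Python 3.9 (see the ruff target change below): PEP 604 `X | None` annotations are evaluated eagerly and the `|` operator on types only exists from 3.10 on. A minimal illustration:

```python
from typing import Optional

# On Python 3.9 the next definition raises TypeError at import time, because the
# annotation `int | None` is evaluated immediately:
#   def connect(port: int | None = None): ...
# The typing.Optional spelling works on every supported version:
def connect(port: Optional[int] = None) -> int:
    return port or 5557

print(connect(), connect(8080))  # 5557 8080
```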
@@ -1,5 +1,5 @@
from abc import ABC, abstractmethod
-from typing import Any, Literal
+from typing import Any, Literal, Union

import numpy as np

@@ -34,7 +34,9 @@ class LeannBackendSearcherInterface(ABC):
        pass

    @abstractmethod
-   def _ensure_server_running(self, passages_source_file: str, port: int | None, **kwargs) -> int:
+   def _ensure_server_running(
+       self, passages_source_file: str, port: Union[int, None], **kwargs
+   ) -> int:
        """Ensure server is running"""
        pass

@@ -48,7 +50,7 @@ class LeannBackendSearcherInterface(ABC):
        prune_ratio: float = 0.0,
        recompute_embeddings: bool = False,
        pruning_strategy: Literal["global", "local", "proportional"] = "global",
-       zmq_port: int | None = None,
+       zmq_port: Union[int, None] = None,
        **kwargs,
    ) -> dict[str, Any]:
        """Search for nearest neighbors

@@ -74,7 +76,7 @@ class LeannBackendSearcherInterface(ABC):
        self,
        query: str,
        use_server_if_available: bool = True,
-       zmq_port: int | None = None,
+       zmq_port: Union[int, None] = None,
    ) -> np.ndarray:
        """Compute embedding for a query string

176  packages/leann-core/src/leann/mcp.py  (new executable file)
@@ -0,0 +1,176 @@
#!/usr/bin/env python3

import json
import subprocess
import sys


def handle_request(request):
    if request.get("method") == "initialize":
        return {
            "jsonrpc": "2.0",
            "id": request.get("id"),
            "result": {
                "capabilities": {"tools": {}},
                "protocolVersion": "2024-11-05",
                "serverInfo": {"name": "leann-mcp", "version": "1.0.0"},
            },
        }

    elif request.get("method") == "tools/list":
        return {
            "jsonrpc": "2.0",
            "id": request.get("id"),
            "result": {
                "tools": [
                    {
                        "name": "leann_search",
                        "description": """🔍 Search code using natural language - like having a coding assistant who knows your entire codebase!

🎯 **Perfect for**:
- "How does authentication work?" → finds auth-related code
- "Error handling patterns" → locates try-catch blocks and error logic
- "Database connection setup" → finds DB initialization code
- "API endpoint definitions" → locates route handlers
- "Configuration management" → finds config files and usage

💡 **Pro tip**: Use this before making any changes to understand existing patterns and conventions.""",
                        "inputSchema": {
                            "type": "object",
                            "properties": {
                                "index_name": {
                                    "type": "string",
                                    "description": "Name of the LEANN index to search. Use 'leann_list' first to see available indexes.",
                                },
                                "query": {
                                    "type": "string",
                                    "description": "Search query - can be natural language (e.g., 'how to handle errors') or technical terms (e.g., 'async function definition')",
                                },
                                "top_k": {
                                    "type": "integer",
                                    "default": 5,
                                    "minimum": 1,
                                    "maximum": 20,
                                    "description": "Number of search results to return. Use 5-10 for focused results, 15-20 for comprehensive exploration.",
                                },
                                "complexity": {
                                    "type": "integer",
                                    "default": 32,
                                    "minimum": 16,
                                    "maximum": 128,
                                    "description": "Search complexity level. Use 16-32 for fast searches (recommended), 64+ for higher precision when needed.",
                                },
                            },
                            "required": ["index_name", "query"],
                        },
                    },
                    {
                        "name": "leann_status",
                        "description": "📊 Check the health and stats of your code indexes - like a medical checkup for your codebase knowledge!",
                        "inputSchema": {
                            "type": "object",
                            "properties": {
                                "index_name": {
                                    "type": "string",
                                    "description": "Optional: Name of specific index to check. If not provided, shows status of all indexes.",
                                }
                            },
                        },
                    },
                    {
                        "name": "leann_list",
                        "description": "📋 Show all your indexed codebases - your personal code library! Use this to see what's available for search.",
                        "inputSchema": {"type": "object", "properties": {}},
                    },
                ]
            },
        }

    elif request.get("method") == "tools/call":
        tool_name = request["params"]["name"]
        args = request["params"].get("arguments", {})

        try:
            if tool_name == "leann_search":
                # Validate required parameters
                if not args.get("index_name") or not args.get("query"):
                    return {
                        "jsonrpc": "2.0",
                        "id": request.get("id"),
                        "result": {
                            "content": [
                                {
                                    "type": "text",
                                    "text": "Error: Both index_name and query are required",
                                }
                            ]
                        },
                    }

                # Build simplified command
                cmd = [
                    "leann",
                    "search",
                    args["index_name"],
                    args["query"],
                    f"--top-k={args.get('top_k', 5)}",
                    f"--complexity={args.get('complexity', 32)}",
                ]

                result = subprocess.run(cmd, capture_output=True, text=True)

            elif tool_name == "leann_status":
                if args.get("index_name"):
                    # Check specific index status - for now, we'll use leann list and filter
                    result = subprocess.run(["leann", "list"], capture_output=True, text=True)
                    # We could enhance this to show more detailed status per index
                else:
                    # Show all indexes status
                    result = subprocess.run(["leann", "list"], capture_output=True, text=True)

            elif tool_name == "leann_list":
                result = subprocess.run(["leann", "list"], capture_output=True, text=True)

            return {
                "jsonrpc": "2.0",
                "id": request.get("id"),
                "result": {
                    "content": [
                        {
                            "type": "text",
                            "text": result.stdout
                            if result.returncode == 0
                            else f"Error: {result.stderr}",
                        }
                    ]
                },
            }

        except Exception as e:
            return {
                "jsonrpc": "2.0",
                "id": request.get("id"),
                "error": {"code": -1, "message": str(e)},
            }


def main():
    for line in sys.stdin:
        try:
            request = json.loads(line.strip())
            response = handle_request(request)
            if response:
                print(json.dumps(response))
                sys.stdout.flush()
        except Exception as e:
            error_response = {
                "jsonrpc": "2.0",
                "id": None,
                "error": {"code": -1, "message": str(e)},
            }
            print(json.dumps(error_response))
            sys.stdout.flush()


if __name__ == "__main__":
    main()
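A hedged way to exercise the server above by hand, outside Claude Code, assuming `leann_mcp` is on PATH (see the README below):

```python
import json
import subprocess

proc = subprocess.Popen(
    ["leann_mcp"], stdin=subprocess.PIPE, stdout=subprocess.PIPE, text=True
)
request = {"jsonrpc": "2.0", "id": 1, "method": "tools/list"}
proc.stdin.write(json.dumps(request) + "\n")
proc.stdin.flush()
response = json.loads(proc.stdout.readline())
print([tool["name"] for tool in response["result"]["tools"]])
# ['leann_search', 'leann_status', 'leann_list']
proc.terminate()
```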
@@ -1,7 +1,7 @@
import json
from abc import ABC, abstractmethod
from pathlib import Path
-from typing import Any, Literal
+from typing import Any, Literal, Optional

import numpy as np

@@ -169,7 +169,7 @@ class BaseSearcher(LeannBackendSearcherInterface, ABC):
        prune_ratio: float = 0.0,
        recompute_embeddings: bool = False,
        pruning_strategy: Literal["global", "local", "proportional"] = "global",
-       zmq_port: int | None = None,
+       zmq_port: Optional[int] = None,
        **kwargs,
    ) -> dict[str, Any]:
        """

127  packages/leann-mcp/README.md  (new file)
@@ -0,0 +1,127 @@
# 🔥 LEANN Claude Code Integration

Transform your development workflow with intelligent code assistance using LEANN's semantic search directly in Claude Code.

## Prerequisites

**Step 1:** First, complete the basic LEANN installation following the [📦 Installation guide](../../README.md#installation) in the root README:

```bash
uv venv
source .venv/bin/activate
uv pip install leann
```

**Step 2:** Install LEANN globally for MCP integration:

```bash
uv tool install leann-core
```

This makes the `leann` command available system-wide, which `leann_mcp` requires.

## 🚀 Quick Setup

Add the LEANN MCP server to Claude Code:

```bash
claude mcp add leann-server -- leann_mcp
```

## 🛠️ Available Tools

Once connected, you'll have access to these powerful semantic search tools in Claude Code:

- **`leann_list`** - List all available indexes across your projects
- **`leann_search`** - Perform semantic searches across code and documents
- **`leann_ask`** - Ask natural language questions and get AI-powered answers from your codebase

## 🎯 Quick Start Example

```bash
# Build an index for your project (change to your actual path)
leann build my-project --docs ./

# Start Claude Code
claude
```

## 🚀 Advanced Usage Examples

### Index Entire Git Repository
```bash
# Index all tracked files in your git repository. Note that submodules are skipped
# for now; this can easily be added back if you need it.
leann build my-repo --docs $(git ls-files) --embedding-mode sentence-transformers --embedding-model all-MiniLM-L6-v2 --backend hnsw

# Index only specific file types from git
leann build my-python-code --docs $(git ls-files "*.py") --embedding-mode sentence-transformers --embedding-model all-MiniLM-L6-v2 --backend hnsw
```

### Multiple Directories and Files
```bash
# Index multiple directories
leann build my-codebase --docs ./src ./tests ./docs ./config --embedding-mode sentence-transformers --embedding-model all-MiniLM-L6-v2 --backend hnsw

# Mix files and directories
leann build my-project --docs ./README.md ./src/ ./package.json ./docs/ --embedding-mode sentence-transformers --embedding-model all-MiniLM-L6-v2 --backend hnsw

# Specific files only
leann build my-configs --docs ./tsconfig.json ./package.json ./webpack.config.js --embedding-mode sentence-transformers --embedding-model all-MiniLM-L6-v2 --backend hnsw
```

### Advanced Git Integration
```bash
# Index recently modified files
leann build recent-changes --docs $(git diff --name-only HEAD~10..HEAD) --embedding-mode sentence-transformers --embedding-model all-MiniLM-L6-v2 --backend hnsw

# Index files matching pattern
leann build frontend --docs $(git ls-files "*.tsx" "*.ts" "*.jsx" "*.js") --embedding-mode sentence-transformers --embedding-model all-MiniLM-L6-v2 --backend hnsw

# Index documentation and config files
leann build docs-and-configs --docs $(git ls-files "*.md" "*.yml" "*.yaml" "*.json" "*.toml") --embedding-mode sentence-transformers --embedding-model all-MiniLM-L6-v2 --backend hnsw
```

**Try this in Claude Code:**
```
Help me understand this codebase. List available indexes and search for authentication patterns.
```

<p align="center">
  <img src="../../assets/claude_code_leann.png" alt="LEANN in Claude Code" width="80%">
</p>

## 🧠 How It Works

The integration consists of three key components working seamlessly together:

- **`leann`** - Core CLI tool for indexing and searching (installed globally via `uv tool install`)
- **`leann_mcp`** - MCP server that wraps `leann` commands for Claude Code integration
- **Claude Code** - Calls `leann_mcp`, which executes `leann` commands and returns intelligent results

## 📁 File Support

LEANN understands **30+ file types** including:
- **Programming**: Python, JavaScript, TypeScript, Java, Go, Rust, C++, C#
- **Data**: SQL, YAML, JSON, CSV, XML
- **Documentation**: Markdown, TXT, PDF
- **And many more!**

## 💾 Storage & Organization

- **Project indexes**: Stored in `.leann/` directory (just like `.git`)
- **Global registry**: Project tracking at `~/.leann/projects.json`
- **Multi-project support**: Switch between different codebases seamlessly
- **Portable**: Transfer indexes between machines with minimal overhead

## 🗑️ Uninstalling

To remove the LEANN MCP server from Claude Code:

```bash
claude mcp remove leann-server
```

To remove LEANN itself:

```
uv pip uninstall leann leann-backend-hnsw leann-core
```

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "leann"
-version = "0.2.1"
+version = "0.2.7"
description = "LEANN - The smallest vector index in the world. RAG Everything with LEANN!"
readme = "README.md"
requires-python = ">=3.9"

@@ -32,7 +32,7 @@ dependencies = [
    "pypdfium2>=4.30.0",
    # LlamaIndex core and readers - updated versions
    "llama-index>=0.12.44",
    "llama-index-readers-file>=0.4.0",  # Essential for PDF parsing
    # "llama-index-readers-docling", # Requires Python >= 3.10
    # "llama-index-node-parser-docling", # Requires Python >= 3.10
    "llama-index-vector-stores-faiss>=0.4.0",

@@ -40,9 +40,12 @@ dependencies = [
    # Other dependencies
    "ipykernel==6.29.5",
    "msgpack>=1.1.1",
-   "mlx>=0.26.3; sys_platform == 'darwin'",
-   "mlx-lm>=0.26.0; sys_platform == 'darwin'",
+   "mlx>=0.26.3; sys_platform == 'darwin' and platform_machine == 'arm64'",
+   "mlx-lm>=0.26.0; sys_platform == 'darwin' and platform_machine == 'arm64'",
    "psutil>=5.8.0",
+   "pathspec>=0.12.1",
+   "nbconvert>=7.16.6",
+   "gitignore-parser>=0.1.12",
]

[project.optional-dependencies]

@@ -88,7 +91,7 @@ leann-backend-diskann = { path = "packages/leann-backend-diskann", editable = tr
leann-backend-hnsw = { path = "packages/leann-backend-hnsw", editable = true }

[tool.ruff]
-target-version = "py310"
+target-version = "py39"
line-length = 100
extend-exclude = [
    "third_party",
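A quick way to sanity-check the new environment markers above; the `packaging` library used here is an assumption (it ships alongside pip but is not a declared dependency of this project):

```python
from packaging.markers import Marker

marker = Marker("sys_platform == 'darwin' and platform_machine == 'arm64'")
print(marker.evaluate())  # True only on Apple Silicon macOS, so mlx is skipped elsewhere
```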