Compare commits: add-macos1 ... clean-stat
13 Commits
| Author | SHA1 | Date |
|---|---|---|
| | 909d3cc6a8 | |
| | c994635af6 | |
| | 23b80647c5 | |
| | 50121972ee | |
| | 07e5f10204 | |
| | 58711bff7e | |
| | a69464eb16 | |
| | 46565b9249 | |
| | 3dad76126a | |
| | 18e28bda32 | |
| | 609fa62fd5 | |
| | eab13434ef | |
| | b2390ccc14 | |
51 .github/workflows/build-reusable.yml (vendored)
@@ -64,6 +64,16 @@ jobs:
            python: '3.12'
          - os: macos-14
            python: '3.13'
          - os: macos-15
            python: '3.9'
          - os: macos-15
            python: '3.10'
          - os: macos-15
            python: '3.11'
          - os: macos-15
            python: '3.12'
          - os: macos-15
            python: '3.13'
          - os: macos-13
            python: '3.9'
          - os: macos-13
@@ -147,7 +157,14 @@ jobs:
          # Use system clang for better compatibility
          export CC=clang
          export CXX=clang++
          export MACOSX_DEPLOYMENT_TARGET=11.0
          # Homebrew libraries on each macOS version require matching minimum version
          if [[ "${{ matrix.os }}" == "macos-13" ]]; then
            export MACOSX_DEPLOYMENT_TARGET=13.0
          elif [[ "${{ matrix.os }}" == "macos-14" ]]; then
            export MACOSX_DEPLOYMENT_TARGET=14.0
          elif [[ "${{ matrix.os }}" == "macos-15" ]]; then
            export MACOSX_DEPLOYMENT_TARGET=15.0
          fi
          uv build --wheel --python ${{ matrix.python }} --find-links ${GITHUB_WORKSPACE}/packages/leann-core/dist
        else
          uv build --wheel --python ${{ matrix.python }} --find-links ${GITHUB_WORKSPACE}/packages/leann-core/dist
@@ -161,7 +178,14 @@ jobs:
          export CC=clang
          export CXX=clang++
          # DiskANN requires macOS 13.3+ for sgesdd_ LAPACK function
          export MACOSX_DEPLOYMENT_TARGET=13.3
          # But Homebrew libraries on each macOS version require matching minimum version
          if [[ "${{ matrix.os }}" == "macos-13" ]]; then
            export MACOSX_DEPLOYMENT_TARGET=13.3
          elif [[ "${{ matrix.os }}" == "macos-14" ]]; then
            export MACOSX_DEPLOYMENT_TARGET=14.0
          elif [[ "${{ matrix.os }}" == "macos-15" ]]; then
            export MACOSX_DEPLOYMENT_TARGET=15.0
          fi
          uv build --wheel --python ${{ matrix.python }} --find-links ${GITHUB_WORKSPACE}/packages/leann-core/dist
        else
          uv build --wheel --python ${{ matrix.python }} --find-links ${GITHUB_WORKSPACE}/packages/leann-core/dist
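Why the per-runner targets matter: Homebrew bottles on each macOS runner are built against that OS's minimum version, so a wheel built with a lower `MACOSX_DEPLOYMENT_TARGET` fails the version check when those libraries are bundled in. A quick, illustrative way to inspect a library's recorded minimum version (path is an example; older builds may record it under `LC_VERSION_MIN_MACOSX` instead):

```bash
# Inspect the minimum macOS version recorded in a Homebrew dylib
otool -l /opt/homebrew/opt/libomp/lib/libomp.dylib | grep -A3 LC_BUILD_VERSION
# the "minos" field is the library's minimum supported macOS version
```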
@@ -197,10 +221,24 @@ jobs:
      - name: Repair wheels (macOS)
        if: runner.os == 'macOS'
        run: |
          # Determine deployment target based on runner OS
          # Must match the Homebrew libraries for each macOS version
          if [[ "${{ matrix.os }}" == "macos-13" ]]; then
            HNSW_TARGET="13.0"
            DISKANN_TARGET="13.3"
          elif [[ "${{ matrix.os }}" == "macos-14" ]]; then
            HNSW_TARGET="14.0"
            DISKANN_TARGET="14.0"
          elif [[ "${{ matrix.os }}" == "macos-15" ]]; then
            HNSW_TARGET="15.0"
            DISKANN_TARGET="15.0"
          fi

          # Repair HNSW wheel
          cd packages/leann-backend-hnsw
          if [ -d dist ]; then
            delocate-wheel -w dist_repaired -v dist/*.whl
            export MACOSX_DEPLOYMENT_TARGET=$HNSW_TARGET
            delocate-wheel -w dist_repaired -v --require-target-macos-version $HNSW_TARGET dist/*.whl
            rm -rf dist
            mv dist_repaired dist
          fi
@@ -209,7 +247,8 @@ jobs:
          # Repair DiskANN wheel
          cd packages/leann-backend-diskann
          if [ -d dist ]; then
            delocate-wheel -w dist_repaired -v dist/*.whl
            export MACOSX_DEPLOYMENT_TARGET=$DISKANN_TARGET
            delocate-wheel -w dist_repaired -v --require-target-macos-version $DISKANN_TARGET dist/*.whl
            rm -rf dist
            mv dist_repaired dist
          fi
@@ -249,8 +288,8 @@ jobs:
          # Activate virtual environment
          source .venv/bin/activate || source .venv/Scripts/activate

          # Run all tests
          pytest tests/
          # Run tests
          pytest -v tests/

      - name: Run sanity checks (optional)
        run: |
@@ -71,6 +71,8 @@ source .venv/bin/activate
uv pip install leann
```

> Low-resource? See “Low-resource setups” in the [Configuration Guide](docs/configuration-guide.md#low-resource-setups).

<details>
<summary>
<strong>🔧 Build from Source (Recommended for development)</strong>
@@ -468,7 +470,7 @@ leann --help
### Usage Examples

```bash
# build from a specific directory, and my_docs is the index name
# Build from a specific directory; my-docs is the index name (you can also build from multiple directories or files)
leann build my-docs --docs ./your_documents

# Search your documents
@@ -259,24 +259,80 @@ Every configuration choice involves trade-offs:

The key is finding the right balance for your specific use case. Start small and simple, measure performance, then scale up only where needed.

## Deep Dive: Critical Configuration Decisions
## Low-resource setups

### When to Disable Recomputation
If you don’t have a local GPU or builds/searches are too slow, use one or more of the options below.

LEANN's recomputation feature provides exact distance calculations but can be disabled for extreme QPS requirements:
### 1) Use OpenAI embeddings (no local compute)

Fastest path with zero local GPU requirements. Set your API key and use OpenAI embeddings during build and search:

```bash
--no-recompute # Disable selective recomputation
export OPENAI_API_KEY=sk-...

# Build with OpenAI embeddings
leann build my-index \
  --embedding-mode openai \
  --embedding-model text-embedding-3-small

# Search with OpenAI embeddings (recompute at query time)
leann search my-index "your query" \
  --recompute-embeddings
```

**Trade-offs**:
- **With recomputation** (default): Exact distances, best quality, higher latency, minimal storage (only stores metadata, recomputes embeddings on-demand)
- **Without recomputation**: Must store full embeddings, significantly higher memory and storage usage (10-100x more), but faster search
### 2) Run remote builds with SkyPilot (cloud GPU)

**Disable when**:
- You have abundant storage and memory
- Need extremely low latency (< 100ms)
- Running a read-heavy workload where storage cost is acceptable

Offload embedding generation and index building to a GPU VM using SkyPilot. A template is provided at `sky/leann-build.yaml`.

```bash
# One-time: install and configure SkyPilot
pip install skypilot
sky launch -c leann-gpu sky/leann-build.yaml

# Build remotely (template installs uv + leann CLI)
sky exec leann-gpu -- "leann build my-index --docs ~/leann-data --backend hnsw --complexity 64 --graph-degree 32"
```

Details: see “Running Builds on SkyPilot (Optional)” below.

### 3) Disable recomputation to trade storage for speed

If you need lower latency and have more storage/memory, disable recomputation. This stores full embeddings and avoids recomputing at search time.

```bash
# Build without recomputation (HNSW requires non-compact in this mode)
leann build my-index --no-recompute --no-compact

# Search without recomputation
leann search my-index "your query" --no-recompute
```

Trade-offs: lower query-time latency, but significantly higher storage usage.
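To make the storage trade-off concrete, here is a rough, hypothetical sizing sketch (float32 embeddings; chunk count and dimension are example values, not measurements):

```python
# Back-of-envelope storage estimate for --no-recompute (example numbers)
num_chunks = 1_000_000   # passages in the index (assumption)
dim = 768                # embedding dimension (common for sentence-transformers models)
bytes_per_float = 4      # float32

embedding_bytes = num_chunks * dim * bytes_per_float
print(f"Stored embeddings: {embedding_bytes / 1024**3:.1f} GiB")  # ~2.9 GiB
# With recomputation enabled, this component is not stored at all.
```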
## Running Builds on SkyPilot (Optional)

You can offload embedding generation and index building to a cloud GPU VM using SkyPilot, without changing any LEANN code. This is useful when your local machine lacks a GPU or you want faster throughput.

### Quick Start

1) Install SkyPilot by following their docs (`pip install skypilot`), then configure cloud credentials.

2) Use the provided SkyPilot template:

```bash
sky launch -c leann-gpu sky/leann-build.yaml
```

3) On the remote, either put your data under the mounted path or adjust `file_mounts` in `sky/leann-build.yaml`. Then run the LEANN build:

```bash
sky exec leann-gpu -- "leann build my-index --docs ~/leann-data --backend hnsw --complexity 64 --graph-degree 32"
```

Notes:
- The template installs `uv` and the `leann` CLI globally on the remote instance.
- Change the `accelerators` and `cloud` settings in `sky/leann-build.yaml` to match your budget/availability (e.g., `A10G:1`, `A100:1`, or CPU-only if you prefer).
- You can also build with `diskann` by switching `--backend diskann`.
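A couple of follow-up commands you will likely want (standard SkyPilot CLI; the cluster name matches the examples above):

```bash
# Stream build logs from the remote job
sky logs leann-gpu

# Tear the cluster down when the build is done, to stop billing
sky down leann-gpu
```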
## Further Reading
@@ -4,8 +4,8 @@ build-backend = "scikit_build_core.build"

[project]
name = "leann-backend-diskann"
version = "0.2.7"
dependencies = ["leann-core==0.2.7", "numpy", "protobuf>=3.19.0"]
version = "0.2.9"
dependencies = ["leann-core==0.2.9", "numpy", "protobuf>=3.19.0"]

[tool.scikit-build]
# Key: simplified CMake path
@@ -13,7 +13,7 @@ if(APPLE)
else()
  message(FATAL_ERROR "Could not find libomp installation. Please install with: brew install libomp")
endif()

set(OpenMP_C_FLAGS "-Xpreprocessor -fopenmp -I${HOMEBREW_PREFIX}/opt/libomp/include")
set(OpenMP_CXX_FLAGS "-Xpreprocessor -fopenmp -I${HOMEBREW_PREFIX}/opt/libomp/include")
set(OpenMP_C_LIB_NAMES "omp")
@@ -95,6 +95,8 @@ def create_hnsw_embedding_server(
        passage_sources.append(source_copy)

    passages = PassageManager(passage_sources)
    # Use index dimensions from metadata for shaping fallback responses
    embedding_dim: int = int(meta.get("dimensions", 0))
    logger.info(
        f"Loaded PassageManager with {len(passages.global_offset_map)} passages from metadata"
    )
@@ -109,6 +111,9 @@ def create_hnsw_embedding_server(
    socket.setsockopt(zmq.RCVTIMEO, 300000)
    socket.setsockopt(zmq.SNDTIMEO, 300000)

    # Track last request type for safe fallback responses on exceptions
    last_request_type = "unknown"  # one of: 'text', 'distance', 'embedding', 'unknown'
    last_request_length = 0
    while True:
        try:
            message_bytes = socket.recv()
@@ -121,6 +126,8 @@ def create_hnsw_embedding_server(
            if isinstance(request_payload, list) and len(request_payload) > 0:
                # Check if this is a direct text request (list of strings)
                if all(isinstance(item, str) for item in request_payload):
                    last_request_type = "text"
                    last_request_length = len(request_payload)
                    logger.info(
                        f"Processing direct text embedding request for {len(request_payload)} texts in {embedding_mode} mode"
                    )
@@ -145,43 +152,66 @@ def create_hnsw_embedding_server(
):
    node_ids = request_payload[0]
    query_vector = np.array(request_payload[1], dtype=np.float32)
    last_request_type = "distance"
    last_request_length = len(node_ids)

    logger.debug("Distance calculation request received")
    logger.debug(f"  Node IDs: {node_ids}")
    logger.debug(f"  Query vector dim: {len(query_vector)}")

    # Get embeddings for node IDs
    texts = []
    for nid in node_ids:
    # Get embeddings for node IDs, tolerate missing IDs
    texts: list[str] = []
    found_indices: list[int] = []
    for idx, nid in enumerate(node_ids):
        try:
            passage_data = passages.get_passage(str(nid))
            txt = passage_data["text"]
            texts.append(txt)
            txt = passage_data.get("text", "")
            if isinstance(txt, str) and len(txt) > 0:
                texts.append(txt)
                found_indices.append(idx)
            else:
                logger.error(f"Empty text for passage ID {nid}")
        except KeyError:
            logger.error(f"Passage ID {nid} not found")
            raise RuntimeError(f"FATAL: Passage with ID {nid} not found")
        except Exception as e:
            logger.error(f"Exception looking up passage ID {nid}: {e}")
            raise

    # Process embeddings
    embeddings = compute_embeddings(texts, model_name, mode=embedding_mode)
    logger.info(
        f"Computed embeddings for {len(texts)} texts, shape: {embeddings.shape}"
    # Prepare full-length response distances with safe fallbacks
    large_distance = 1e9
    response_distances = [large_distance] * len(node_ids)

    if texts:
        try:
            # Process embeddings only for found indices
            embeddings = compute_embeddings(texts, model_name, mode=embedding_mode)
            logger.info(
                f"Computed embeddings for {len(texts)} texts, shape: {embeddings.shape}"
            )

            # Calculate distances for found embeddings only
            if distance_metric == "l2":
                partial_distances = np.sum(
                    np.square(embeddings - query_vector.reshape(1, -1)), axis=1
                )
            else:  # mips or cosine
                partial_distances = -np.dot(embeddings, query_vector)

            # Place computed distances back into the full response array
            for pos, dval in zip(
                found_indices, partial_distances.flatten().tolist()
            ):
                response_distances[pos] = float(dval)
        except Exception as e:
            logger.error(
                f"Distance computation error, falling back to large distances: {e}"
            )

    # Always reply with exactly len(node_ids) distances
    response_bytes = msgpack.packb([response_distances], use_single_float=True)
    logger.debug(
        f"Sending distance response with {len(response_distances)} distances (found={len(found_indices)})"
    )

    # Calculate distances
    if distance_metric == "l2":
        distances = np.sum(
            np.square(embeddings - query_vector.reshape(1, -1)), axis=1
        )
    else:  # mips or cosine
        distances = -np.dot(embeddings, query_vector)

    response_payload = distances.flatten().tolist()
    response_bytes = msgpack.packb([response_payload], use_single_float=True)
    logger.debug(f"Sending distance response with {len(distances)} distances")

    socket.send(response_bytes)
    e2e_end = time.time()
    logger.info(f"⏱️ Distance calculation E2E time: {e2e_end - e2e_start:.6f}s")
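For reference, a minimal sketch of the wire format this server speaks (msgpack over ZeroMQ; the shapes follow the code above, all values are illustrative):

```python
import msgpack

# Distance response: one list of floats, one entry per requested node ID
payload = msgpack.packb([[0.5, 0.25, 1e9]], use_single_float=True)
distances = msgpack.unpackb(payload)[0]   # [0.5, 0.25, 1e9]

# Embedding response: [dims, data] with dims = [batch, dim], data row-major
payload = msgpack.packb([[2, 3], [0.5, 1.0, 1.5, 2.0, 2.5, 3.0]], use_single_float=True)
dims, data = msgpack.unpackb(payload)     # [2, 3] and a flat list of 6 floats
```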
@@ -201,40 +231,61 @@ def create_hnsw_embedding_server(

    node_ids = request_payload[0]
    logger.debug(f"Request for {len(node_ids)} node embeddings")
    last_request_type = "embedding"
    last_request_length = len(node_ids)

    # Look up texts by node IDs
    texts = []
    for nid in node_ids:
    # Allocate output buffer (B, D) and fill with zeros for robustness
    if embedding_dim <= 0:
        logger.error("Embedding dimension unknown; cannot serve embedding request")
        dims = [0, 0]
        data = []
    else:
        dims = [len(node_ids), embedding_dim]
        data = [0.0] * (dims[0] * dims[1])

    # Look up texts by node IDs; compute embeddings where available
    texts: list[str] = []
    found_indices: list[int] = []
    for idx, nid in enumerate(node_ids):
        try:
            passage_data = passages.get_passage(str(nid))
            txt = passage_data["text"]
            if not txt:
                raise RuntimeError(f"FATAL: Empty text for passage ID {nid}")
            texts.append(txt)
            txt = passage_data.get("text", "")
            if isinstance(txt, str) and len(txt) > 0:
                texts.append(txt)
                found_indices.append(idx)
            else:
                logger.error(f"Empty text for passage ID {nid}")
        except KeyError:
            raise RuntimeError(f"FATAL: Passage with ID {nid} not found")
            logger.error(f"Passage with ID {nid} not found")
        except Exception as e:
            logger.error(f"Exception looking up passage ID {nid}: {e}")
            raise

    # Process embeddings
    embeddings = compute_embeddings(texts, model_name, mode=embedding_mode)
    logger.info(
        f"Computed embeddings for {len(texts)} texts, shape: {embeddings.shape}"
    )
    if texts:
        try:
            # Process embeddings for found texts only
            embeddings = compute_embeddings(texts, model_name, mode=embedding_mode)
            logger.info(
                f"Computed embeddings for {len(texts)} texts, shape: {embeddings.shape}"
            )

            # Serialization and response
            if np.isnan(embeddings).any() or np.isinf(embeddings).any():
                logger.error(
                    f"NaN or Inf detected in embeddings! Requested IDs: {node_ids[:5]}..."
                )
                raise AssertionError()
            if np.isnan(embeddings).any() or np.isinf(embeddings).any():
                logger.error(
                    f"NaN or Inf detected in embeddings! Requested IDs: {node_ids[:5]}..."
                )
                dims = [0, embedding_dim]
                data = []
            else:
                # Copy computed embeddings into the correct positions
                emb_f32 = np.ascontiguousarray(embeddings, dtype=np.float32)
                flat = emb_f32.flatten().tolist()
                for j, pos in enumerate(found_indices):
                    start = pos * embedding_dim
                    end = start + embedding_dim
                    data[start:end] = flat[j * embedding_dim : (j + 1) * embedding_dim]
        except Exception as e:
            logger.error(f"Embedding computation error, returning zeros: {e}")

    hidden_contiguous_f32 = np.ascontiguousarray(embeddings, dtype=np.float32)
    response_payload = [
        list(hidden_contiguous_f32.shape),
        hidden_contiguous_f32.flatten().tolist(),
    ]
    response_payload = [dims, data]
    response_bytes = msgpack.packb(response_payload, use_single_float=True)

    socket.send(response_bytes)
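The zero-fill-then-scatter pattern above keeps the response shape stable even when some passages are missing. A standalone sketch of the same idea (all values illustrative):

```python
import numpy as np

node_ids = ["a", "b", "c"]        # requested IDs (illustrative)
found_indices = [0, 2]            # suppose "b" was missing
embedding_dim = 4
computed = np.arange(8, dtype=np.float32).reshape(2, 4)  # embeddings for "a" and "c"

data = [0.0] * (len(node_ids) * embedding_dim)           # zero-filled (B, D) buffer
flat = computed.flatten().tolist()
for j, pos in enumerate(found_indices):
    data[pos * embedding_dim : (pos + 1) * embedding_dim] = (
        flat[j * embedding_dim : (j + 1) * embedding_dim]
    )
# Row for "b" stays all zeros; rows for "a" and "c" carry real values.
```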
@@ -249,7 +300,22 @@ def create_hnsw_embedding_server(
    import traceback

    traceback.print_exc()
    socket.send(msgpack.packb([[], []]))
    # Fallback to a safe, minimal-structure response to avoid client crashes
    if last_request_type == "distance":
        # Return a vector of large distances with the expected length
        fallback_len = max(0, int(last_request_length))
        large_distance = 1e9
        safe_response = [[large_distance] * fallback_len]
    elif last_request_type == "embedding":
        # Return an empty embedding block with known dimension if available
        if embedding_dim > 0:
            safe_response = [[0, embedding_dim], []]
        else:
            safe_response = [[0, 0], []]
    else:
        # Unknown request type: default to empty embedding structure
        safe_response = [[0, int(embedding_dim) if embedding_dim > 0 else 0], []]
    socket.send(msgpack.packb(safe_response, use_single_float=True))

zmq_thread = threading.Thread(target=zmq_server_thread, daemon=True)
zmq_thread.start()
@@ -6,10 +6,10 @@ build-backend = "scikit_build_core.build"

[project]
name = "leann-backend-hnsw"
version = "0.2.7"
version = "0.2.9"
description = "Custom-built HNSW (Faiss) backend for the Leann toolkit."
dependencies = [
    "leann-core==0.2.7",
    "leann-core==0.2.9",
    "numpy",
    "pyzmq>=23.0.0",
    "msgpack>=1.0.0",
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "leann-core"
version = "0.2.7"
version = "0.2.9"
description = "Core API and plugin system for LEANN"
readme = "README.md"
requires-python = ">=3.9"
@@ -5,6 +5,7 @@ from typing import Union

from llama_index.core import SimpleDirectoryReader
from llama_index.core.node_parser import SentenceSplitter
from tqdm import tqdm

from .api import LeannBuilder, LeannChat, LeannSearcher

@@ -75,11 +76,14 @@ class LeannCLI:
            formatter_class=argparse.RawDescriptionHelpFormatter,
            epilog="""
Examples:
  leann build my-docs --docs ./documents                        # Build index named my-docs
  leann build my-ppts --docs ./ --file-types .pptx,.pdf         # Index only PowerPoint and PDF files
  leann search my-docs "query"                                  # Search in my-docs index
  leann ask my-docs "question"                                  # Ask my-docs index
  leann list                                                    # List all stored indexes
  leann build my-docs --docs ./documents                        # Build index from directory
  leann build my-code --docs ./src ./tests ./config             # Build index from multiple directories
  leann build my-files --docs ./file1.py ./file2.txt ./docs/    # Build index from files and directories
  leann build my-mixed --docs ./readme.md ./src/ ./config.json  # Build index from mixed files/dirs
  leann build my-ppts --docs ./ --file-types .pptx,.pdf         # Index only PowerPoint and PDF files
  leann search my-docs "query"                                  # Search in my-docs index
  leann ask my-docs "question"                                  # Ask my-docs index
  leann list                                                    # List all stored indexes
""",
        )
@@ -91,7 +95,11 @@ Examples:
            "index_name", nargs="?", help="Index name (default: current directory name)"
        )
        build_parser.add_argument(
            "--docs", type=str, default=".", help="Documents directory (default: current directory)"
            "--docs",
            type=str,
            nargs="+",
            default=["."],
            help="Documents directories and/or files (default: current directory)",
        )
        build_parser.add_argument(
            "--backend", type=str, default="hnsw", choices=["hnsw", "diskann"]
@@ -109,7 +117,19 @@ Examples:
        build_parser.add_argument("--complexity", type=int, default=64)
        build_parser.add_argument("--num-threads", type=int, default=1)
        build_parser.add_argument("--compact", action="store_true", default=True)
        build_parser.add_argument(
            "--no-compact",
            dest="compact",
            action="store_false",
            help="Disable compact index storage (store full embeddings; higher storage)",
        )
        build_parser.add_argument("--recompute", action="store_true", default=True)
        build_parser.add_argument(
            "--no-recompute",
            dest="recompute",
            action="store_false",
            help="Disable embedding recomputation (store full embeddings; lower query latency)",
        )
        build_parser.add_argument(
            "--file-types",
            type=str,
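The `--compact`/`--no-compact` pairing is the standard argparse pattern for a default-on boolean with an explicit off switch (on Python 3.9, which this project supports, `argparse.BooleanOptionalAction` exists too, but the explicit pair keeps the help text custom). A minimal standalone sketch of the same pattern:

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--compact", action="store_true", default=True)
parser.add_argument("--no-compact", dest="compact", action="store_false")

print(parser.parse_args([]).compact)                # True (default on)
print(parser.parse_args(["--no-compact"]).compact)  # False
```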
@@ -130,6 +150,18 @@ Examples:
            default=True,
            help="Recompute embeddings (default: True)",
        )
        search_parser.add_argument(
            "--no-recompute-embeddings",
            dest="recompute_embeddings",
            action="store_false",
            help="Disable embedding recomputation during search",
        )
        search_parser.add_argument(
            "--no-recompute",
            dest="recompute_embeddings",
            action="store_false",
            help="Alias for --no-recompute-embeddings",
        )
        search_parser.add_argument(
            "--pruning-strategy",
            choices=["global", "local", "proportional"],
@@ -158,6 +190,18 @@ Examples:
            default=True,
            help="Recompute embeddings (default: True)",
        )
        ask_parser.add_argument(
            "--no-recompute-embeddings",
            dest="recompute_embeddings",
            action="store_false",
            help="Disable embedding recomputation during ask",
        )
        ask_parser.add_argument(
            "--no-recompute",
            dest="recompute_embeddings",
            action="store_false",
            help="Alias for --no-recompute-embeddings",
        )
        ask_parser.add_argument(
            "--pruning-strategy",
            choices=["global", "local", "proportional"],
@@ -235,6 +279,32 @@ Examples:
        """Check if a file should be excluded using gitignore parser."""
        return gitignore_matches(str(relative_path))

    def _is_git_submodule(self, path: Path) -> bool:
        """Check if a path is a git submodule."""
        try:
            # Find the git repo root
            current_dir = Path.cwd()
            while current_dir != current_dir.parent:
                if (current_dir / ".git").exists():
                    gitmodules_path = current_dir / ".gitmodules"
                    if gitmodules_path.exists():
                        # Read .gitmodules to check if this path is a submodule
                        gitmodules_content = gitmodules_path.read_text()
                        # Convert path to relative to git root
                        try:
                            relative_path = path.resolve().relative_to(current_dir)
                            # Check if this path appears in .gitmodules
                            return f"path = {relative_path}" in gitmodules_content
                        except ValueError:
                            # Path is not under git root
                            return False
                    break
                current_dir = current_dir.parent
            return False
        except Exception:
            # If anything goes wrong, assume it's not a submodule
            return False
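Note that string-matching `.gitmodules` is a heuristic; a hypothetical, more robust variant could ask git itself to parse the file (a sketch, not part of this PR):

```python
import subprocess

def submodule_paths(repo_root: str) -> set[str]:
    """Parse .gitmodules via git config instead of raw string matching."""
    out = subprocess.run(
        ["git", "config", "--file", ".gitmodules", "--get-regexp", r"^submodule\..*\.path$"],
        cwd=repo_root, capture_output=True, text=True,
    )
    # Each output line looks like: "submodule.<name>.path <path>"
    return {line.split(maxsplit=1)[1] for line in out.stdout.splitlines() if " " in line}
```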
    def list_indexes(self):
        print("Stored LEANN indexes:")

@@ -264,7 +334,9 @@ Examples:
                valid_projects.append(current_path)

        if not valid_projects:
            print("No indexes found. Use 'leann build <name> --docs <dir>' to create one.")
            print(
                "No indexes found. Use 'leann build <name> --docs <dir> [<dir2> ...]' to create one."
            )
            return

        total_indexes = 0
@@ -311,56 +383,88 @@ Examples:
            print(f'  leann search {example_name} "your query"')
            print(f"  leann ask {example_name} --interactive")

    def load_documents(self, docs_dir: str, custom_file_types: Union[str, None] = None):
        print(f"Loading documents from {docs_dir}...")
    def load_documents(
        self, docs_paths: Union[str, list], custom_file_types: Union[str, None] = None
    ):
        # Handle both single path (string) and multiple paths (list) for backward compatibility
        if isinstance(docs_paths, str):
            docs_paths = [docs_paths]

        # Separate files and directories
        files = []
        directories = []
        for path in docs_paths:
            path_obj = Path(path)
            if path_obj.is_file():
                files.append(str(path_obj))
            elif path_obj.is_dir():
                # Check if this is a git submodule - if so, skip it
                if self._is_git_submodule(path_obj):
                    print(f"⚠️ Skipping git submodule: {path}")
                    continue
                directories.append(str(path_obj))
            else:
                print(f"⚠️ Warning: Path '{path}' does not exist, skipping...")
                continue

        # Print summary of what we're processing
        total_items = len(files) + len(directories)
        items_desc = []
        if files:
            items_desc.append(f"{len(files)} file{'s' if len(files) > 1 else ''}")
        if directories:
            items_desc.append(
                f"{len(directories)} director{'ies' if len(directories) > 1 else 'y'}"
            )

        print(f"Loading documents from {' and '.join(items_desc)} ({total_items} total):")
        if files:
            print(f"  📄 Files: {', '.join([Path(f).name for f in files])}")
        if directories:
            print(f"  📁 Directories: {', '.join(directories)}")

        if custom_file_types:
            print(f"Using custom file types: {custom_file_types}")

        # Build gitignore parser
        gitignore_matches = self._build_gitignore_parser(docs_dir)
        all_documents = []

        # Try to use better PDF parsers first, but only if PDFs are requested
        documents = []
        docs_path = Path(docs_dir)
        # First, process individual files if any
        if files:
            print(f"\n🔄 Processing {len(files)} individual file{'s' if len(files) > 1 else ''}...")

            # Check if we should process PDFs
            should_process_pdfs = custom_file_types is None or ".pdf" in custom_file_types
            # Load individual files using SimpleDirectoryReader with input_files
            # Note: We skip gitignore filtering for explicitly specified files
            try:
                # Group files by their parent directory for efficient loading
                from collections import defaultdict

                if should_process_pdfs:
                    for file_path in docs_path.rglob("*.pdf"):
                        # Check if file matches any exclude pattern
                        relative_path = file_path.relative_to(docs_path)
                        if self._should_exclude_file(relative_path, gitignore_matches):
                            continue
                files_by_dir = defaultdict(list)
                for file_path in files:
                    parent_dir = str(Path(file_path).parent)
                    files_by_dir[parent_dir].append(file_path)

                        print(f"Processing PDF: {file_path}")

                        # Try PyMuPDF first (best quality)
                        text = extract_pdf_text_with_pymupdf(str(file_path))
                        if text is None:
                            # Try pdfplumber
                            text = extract_pdf_text_with_pdfplumber(str(file_path))

                        if text:
                            # Create a simple document structure
                            from llama_index.core import Document

                            doc = Document(text=text, metadata={"source": str(file_path)})
                            documents.append(doc)
                        else:
                            # Fallback to default reader
                            print(f"Using default reader for {file_path}")
                # Load files from each parent directory
                for parent_dir, file_list in files_by_dir.items():
                    print(
                        f"  Loading {len(file_list)} file{'s' if len(file_list) > 1 else ''} from {parent_dir}"
                    )
                    try:
                        default_docs = SimpleDirectoryReader(
                            str(file_path.parent),
                        file_docs = SimpleDirectoryReader(
                            parent_dir,
                            input_files=file_list,
                            filename_as_id=True,
                            required_exts=[file_path.suffix],
                        ).load_data()
                        documents.extend(default_docs)
                        all_documents.extend(file_docs)
                        print(
                            f"  ✅ Loaded {len(file_docs)} document{'s' if len(file_docs) > 1 else ''}"
                        )
                    except Exception as e:
                        print(f"Warning: Could not process {file_path}: {e}")
                        print(f"  ❌ Warning: Could not load files from {parent_dir}: {e}")

            # Load other file types with default reader
            except Exception as e:
                print(f"❌ Error processing individual files: {e}")

        # Define file extensions to process
        if custom_file_types:
            # Parse custom file types from comma-separated string
            code_extensions = [ext.strip() for ext in custom_file_types.split(",") if ext.strip()]
@@ -422,41 +526,106 @@ Examples:
            ".py",
            ".jl",
        ]
        # Try to load other file types, but don't fail if none are found
        try:
            # Create a custom file filter function using our PathSpec
            def file_filter(file_path: str) -> bool:
                """Return True if file should be included (not excluded)"""
                try:
                    docs_path_obj = Path(docs_dir)
                    file_path_obj = Path(file_path)
                    relative_path = file_path_obj.relative_to(docs_path_obj)
                    return not self._should_exclude_file(relative_path, gitignore_matches)
                except (ValueError, OSError):
                    return True  # Include files that can't be processed

            other_docs = SimpleDirectoryReader(
                docs_dir,
                recursive=True,
                encoding="utf-8",
                required_exts=code_extensions,
                file_extractor={},  # Use default extractors
                filename_as_id=True,
            ).load_data(show_progress=True)
        # Process each directory
        if directories:
            print(
                f"\n🔄 Processing {len(directories)} director{'ies' if len(directories) > 1 else 'y'}..."
            )

            # Filter documents after loading based on gitignore rules
            filtered_docs = []
            for doc in other_docs:
                file_path = doc.metadata.get("file_path", "")
                if file_filter(file_path):
                    filtered_docs.append(doc)
            for docs_dir in directories:
                print(f"Processing directory: {docs_dir}")
                # Build gitignore parser for each directory
                gitignore_matches = self._build_gitignore_parser(docs_dir)

            documents.extend(filtered_docs)
        except ValueError as e:
            if "No files found" in str(e):
                print("No additional files found for other supported types.")
            else:
                raise e
                # Try to use better PDF parsers first, but only if PDFs are requested
                documents = []
                docs_path = Path(docs_dir)

                # Check if we should process PDFs
                should_process_pdfs = custom_file_types is None or ".pdf" in custom_file_types

                if should_process_pdfs:
                    for file_path in docs_path.rglob("*.pdf"):
                        # Check if file matches any exclude pattern
                        try:
                            relative_path = file_path.relative_to(docs_path)
                            if self._should_exclude_file(relative_path, gitignore_matches):
                                continue
                        except ValueError:
                            # Skip files that can't be made relative to docs_path
                            print(f"⚠️ Skipping file outside directory scope: {file_path}")
                            continue

                        print(f"Processing PDF: {file_path}")

                        # Try PyMuPDF first (best quality)
                        text = extract_pdf_text_with_pymupdf(str(file_path))
                        if text is None:
                            # Try pdfplumber
                            text = extract_pdf_text_with_pdfplumber(str(file_path))

                        if text:
                            # Create a simple document structure
                            from llama_index.core import Document

                            doc = Document(text=text, metadata={"source": str(file_path)})
                            documents.append(doc)
                        else:
                            # Fallback to default reader
                            print(f"Using default reader for {file_path}")
                            try:
                                default_docs = SimpleDirectoryReader(
                                    str(file_path.parent),
                                    filename_as_id=True,
                                    required_exts=[file_path.suffix],
                                ).load_data()
                                documents.extend(default_docs)
                            except Exception as e:
                                print(f"Warning: Could not process {file_path}: {e}")

                # Load other file types with default reader
                try:
                    # Create a custom file filter function using our PathSpec
                    def file_filter(
                        file_path: str, docs_dir=docs_dir, gitignore_matches=gitignore_matches
                    ) -> bool:
                        """Return True if file should be included (not excluded)"""
                        try:
                            docs_path_obj = Path(docs_dir)
                            file_path_obj = Path(file_path)
                            relative_path = file_path_obj.relative_to(docs_path_obj)
                            return not self._should_exclude_file(relative_path, gitignore_matches)
                        except (ValueError, OSError):
                            return True  # Include files that can't be processed

                    other_docs = SimpleDirectoryReader(
                        docs_dir,
                        recursive=True,
                        encoding="utf-8",
                        required_exts=code_extensions,
                        file_extractor={},  # Use default extractors
                        filename_as_id=True,
                    ).load_data(show_progress=True)

                    # Filter documents after loading based on gitignore rules
                    filtered_docs = []
                    for doc in other_docs:
                        file_path = doc.metadata.get("file_path", "")
                        if file_filter(file_path):
                            filtered_docs.append(doc)

                    documents.extend(filtered_docs)
                except ValueError as e:
                    if "No files found" in str(e):
                        print(f"No additional files found for other supported types in {docs_dir}.")
                    else:
                        raise e

                all_documents.extend(documents)
                print(f"Loaded {len(documents)} documents from {docs_dir}")

        documents = all_documents

        all_texts = []
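Binding `docs_dir=docs_dir` and `gitignore_matches=gitignore_matches` as default arguments in `file_filter` matters because the function is redefined inside a loop: defaults are evaluated at definition time, while a plain closure would see only the loop variable's final value. A minimal sketch of the pitfall:

```python
# Late binding: every closure sees the final value of d
filters = [lambda: d for d in ("a", "b", "c")]
print([f() for f in filters])          # ['c', 'c', 'c']

# Default-argument binding captures the value at definition time
filters = [lambda d=d: d for d in ("a", "b", "c")]
print([f() for f in filters])          # ['a', 'b', 'c']
```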
@@ -507,7 +676,9 @@ Examples:
            ".jl",
        }

        for doc in documents:
        print("start chunking documents")
        # Add progress bar for document chunking
        for doc in tqdm(documents, desc="Chunking documents", unit="doc"):
            # Check if this is a code file based on source path
            source_path = doc.metadata.get("source", "")
            is_code_file = any(source_path.endswith(ext) for ext in code_file_exts)
@@ -523,7 +694,7 @@ Examples:
        return all_texts

    async def build_index(self, args):
        docs_dir = args.docs
        docs_paths = args.docs
        # Use current directory name if index_name not provided
        if args.index_name:
            index_name = args.index_name

@@ -534,13 +705,25 @@ Examples:
        index_dir = self.indexes_dir / index_name
        index_path = self.get_index_path(index_name)

        print(f"📂 Indexing: {Path(docs_dir).resolve()}")
        # Display all paths being indexed with file/directory distinction
        files = [p for p in docs_paths if Path(p).is_file()]
        directories = [p for p in docs_paths if Path(p).is_dir()]

        print(f"📂 Indexing {len(docs_paths)} path{'s' if len(docs_paths) > 1 else ''}:")
        if files:
            print(f"  📄 Files ({len(files)}):")
            for i, file_path in enumerate(files, 1):
                print(f"    {i}. {Path(file_path).resolve()}")
        if directories:
            print(f"  📁 Directories ({len(directories)}):")
            for i, dir_path in enumerate(directories, 1):
                print(f"    {i}. {Path(dir_path).resolve()}")

        if index_dir.exists() and not args.force:
            print(f"Index '{index_name}' already exists. Use --force to rebuild.")
            return

        all_texts = self.load_documents(docs_dir, args.file_types)
        all_texts = self.load_documents(docs_paths, args.file_types)
        if not all_texts:
            print("No documents found")
            return
@@ -576,7 +759,7 @@ Examples:

        if not self.index_exists(index_name):
            print(
                f"Index '{index_name}' not found. Use 'leann build {index_name} --docs <dir>' to create it."
                f"Index '{index_name}' not found. Use 'leann build {index_name} --docs <dir> [<dir2> ...]' to create it."
            )
            return

@@ -603,7 +786,7 @@ Examples:

        if not self.index_exists(index_name):
            print(
                f"Index '{index_name}' not found. Use 'leann build {index_name} --docs <dir>' to create it."
                f"Index '{index_name}' not found. Use 'leann build {index_name} --docs <dir> [<dir2> ...]' to create it."
            )
            return
@@ -6,7 +6,6 @@ Preserves all optimization parameters to ensure performance

import logging
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Any

import numpy as np
@@ -374,7 +373,9 @@ def compute_embeddings_ollama(
    texts: list[str], model_name: str, is_build: bool = False, host: str = "http://localhost:11434"
) -> np.ndarray:
    """
    Compute embeddings using Ollama API.
    Compute embeddings using Ollama API with simplified batch processing.

    Uses batch size of 32 for MPS/CPU and 128 for CUDA to optimize performance.

    Args:
        texts: List of texts to compute embeddings for
@@ -438,12 +439,19 @@ def compute_embeddings_ollama(
            if any(emb in base_name for emb in ["embed", "bge", "minilm", "e5"]):
                embedding_models.append(model)

        # Check if model exists (handle versioned names)
        model_found = any(
            model_name == name.split(":")[0] or model_name == name for name in model_names
        )
        # Check if model exists (handle versioned names) and resolve to full name
        resolved_model_name = None
        for name in model_names:
            # Exact match
            if model_name == name:
                resolved_model_name = name
                break
            # Match without version tag (use the versioned name)
            elif model_name == name.split(":")[0]:
                resolved_model_name = name
                break

        if not model_found:
        if not resolved_model_name:
            error_msg = f"❌ Model '{model_name}' not found in local Ollama.\n\n"

            # Suggest pulling the model
@@ -465,6 +473,11 @@ def compute_embeddings_ollama(
            error_msg += "\n📚 Browse more: https://ollama.com/library"
            raise ValueError(error_msg)

        # Use the resolved model name for all subsequent operations
        if resolved_model_name != model_name:
            logger.info(f"Resolved model name '{model_name}' to '{resolved_model_name}'")
            model_name = resolved_model_name

        # Verify the model supports embeddings by testing it
        try:
            test_response = requests.post(
@@ -485,162 +498,147 @@ def compute_embeddings_ollama(
    except requests.exceptions.RequestException as e:
        logger.warning(f"Could not verify model existence: {e}")

    # Process embeddings with optimized concurrent processing
    import requests
    # Determine batch size based on device availability
    # Check for CUDA/MPS availability using torch if available
    batch_size = 32  # Default for MPS/CPU
    try:
        import torch

    def get_single_embedding(text_idx_tuple):
        """Helper function to get embedding for a single text."""
        text, idx = text_idx_tuple
        max_retries = 3
        retry_count = 0
        if torch.cuda.is_available():
            batch_size = 128  # CUDA gets larger batch size
        elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
            batch_size = 32  # MPS gets smaller batch size
    except ImportError:
        # If torch is not available, use conservative batch size
        batch_size = 32

        # Truncate very long texts to avoid API issues
        truncated_text = text[:8000] if len(text) > 8000 else text
    logger.info(f"Using batch size: {batch_size}")

        while retry_count < max_retries:
            try:
                response = requests.post(
                    f"{host}/api/embeddings",
                    json={"model": model_name, "prompt": truncated_text},
                    timeout=30,
                )
                response.raise_for_status()
    def get_batch_embeddings(batch_texts):
        """Get embeddings for a batch of texts."""
        all_embeddings = []
        failed_indices = []

                result = response.json()
                embedding = result.get("embedding")
        for i, text in enumerate(batch_texts):
            max_retries = 3
            retry_count = 0

                if embedding is None:
                    raise ValueError(f"No embedding returned for text {idx}")

                return idx, embedding

            except requests.exceptions.Timeout:
                retry_count += 1
                if retry_count >= max_retries:
                    logger.warning(f"Timeout for text {idx} after {max_retries} retries")
                    return idx, None

            except Exception as e:
                if retry_count >= max_retries - 1:
                    logger.error(f"Failed to get embedding for text {idx}: {e}")
                    return idx, None
                retry_count += 1

        return idx, None

    # Determine if we should use concurrent processing
    use_concurrent = (
        len(texts) > 5 and not is_build
    )  # Don't use concurrent in build mode to avoid overwhelming
    max_workers = min(4, len(texts))  # Limit concurrent requests to avoid overwhelming Ollama

    all_embeddings = [None] * len(texts)  # Pre-allocate list to maintain order
    failed_indices = []

    if use_concurrent:
        logger.info(
            f"Using concurrent processing with {max_workers} workers for {len(texts)} texts"
        )

        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            # Submit all tasks
            future_to_idx = {
                executor.submit(get_single_embedding, (text, idx)): idx
                for idx, text in enumerate(texts)
            }

            # Add progress bar for concurrent processing
            try:
                if is_build or len(texts) > 10:
                    from tqdm import tqdm

                    futures_iterator = tqdm(
                        as_completed(future_to_idx),
                        total=len(texts),
                        desc="Computing Ollama embeddings",
                    )
                else:
                    futures_iterator = as_completed(future_to_idx)
            except ImportError:
                futures_iterator = as_completed(future_to_idx)

            # Collect results as they complete
            for future in futures_iterator:
            # Truncate very long texts to avoid API issues
            truncated_text = text[:8000] if len(text) > 8000 else text
            while retry_count < max_retries:
                try:
                    idx, embedding = future.result()
                    if embedding is not None:
                        all_embeddings[idx] = embedding
                    else:
                        failed_indices.append(idx)
                    response = requests.post(
                        f"{host}/api/embeddings",
                        json={"model": model_name, "prompt": truncated_text},
                        timeout=30,
                    )
                    response.raise_for_status()

                    result = response.json()
                    embedding = result.get("embedding")

                    if embedding is None:
                        raise ValueError(f"No embedding returned for text {i}")

                    if not isinstance(embedding, list) or len(embedding) == 0:
                        raise ValueError(f"Invalid embedding format for text {i}")

                    all_embeddings.append(embedding)
                    break

                except requests.exceptions.Timeout:
                    retry_count += 1
                    if retry_count >= max_retries:
                        logger.warning(f"Timeout for text {i} after {max_retries} retries")
                        failed_indices.append(i)
                        all_embeddings.append(None)
                        break

                except Exception as e:
                    idx = future_to_idx[future]
                    logger.error(f"Exception for text {idx}: {e}")
                    failed_indices.append(idx)
                    retry_count += 1
                    if retry_count >= max_retries:
                        logger.error(f"Failed to get embedding for text {i}: {e}")
                        failed_indices.append(i)
                        all_embeddings.append(None)
                        break
        return all_embeddings, failed_indices
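The server-side call is just Ollama's embeddings endpoint; a quick way to sanity-check it outside LEANN (model name is an example):

```bash
curl -s http://localhost:11434/api/embeddings \
  -d '{"model": "nomic-embed-text", "prompt": "hello world"}'
# → {"embedding": [0.12, -0.98, ...]}
```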
    # Process texts in batches
    all_embeddings = []
    all_failed_indices = []

    # Setup progress bar if needed
    show_progress = is_build or len(texts) > 10
    try:
        if show_progress:
            from tqdm import tqdm
    except ImportError:
        show_progress = False

    # Process batches
    num_batches = (len(texts) + batch_size - 1) // batch_size

    if show_progress:
        batch_iterator = tqdm(range(num_batches), desc="Computing Ollama embeddings")
    else:
    # Sequential processing with progress bar
    show_progress = is_build or len(texts) > 10
        batch_iterator = range(num_batches)

    try:
        if show_progress:
            from tqdm import tqdm
    for batch_idx in batch_iterator:
        start_idx = batch_idx * batch_size
        end_idx = min(start_idx + batch_size, len(texts))
        batch_texts = texts[start_idx:end_idx]

            iterator = tqdm(
                enumerate(texts), total=len(texts), desc="Computing Ollama embeddings"
            )
        else:
            iterator = enumerate(texts)
    except ImportError:
        iterator = enumerate(texts)
        batch_embeddings, batch_failed = get_batch_embeddings(batch_texts)

    for idx, text in iterator:
        result_idx, embedding = get_single_embedding((text, idx))
        if embedding is not None:
            all_embeddings[idx] = embedding
        else:
            failed_indices.append(idx)
        # Adjust failed indices to global indices
        global_failed = [start_idx + idx for idx in batch_failed]
        all_failed_indices.extend(global_failed)
        all_embeddings.extend(batch_embeddings)

    # Handle failed embeddings
    if failed_indices:
        if len(failed_indices) == len(texts):
    if all_failed_indices:
        if len(all_failed_indices) == len(texts):
            raise RuntimeError("Failed to compute any embeddings")

        logger.warning(f"Failed to compute embeddings for {len(failed_indices)}/{len(texts)} texts")
        logger.warning(
            f"Failed to compute embeddings for {len(all_failed_indices)}/{len(texts)} texts"
        )

        # Use zero embeddings as fallback for failed ones
        valid_embedding = next((e for e in all_embeddings if e is not None), None)
        if valid_embedding:
            embedding_dim = len(valid_embedding)
            for idx in failed_indices:
                all_embeddings[idx] = [0.0] * embedding_dim
            for i, embedding in enumerate(all_embeddings):
                if embedding is None:
                    all_embeddings[i] = [0.0] * embedding_dim

    # Remove None values and convert to numpy array
    # Remove None values
    all_embeddings = [e for e in all_embeddings if e is not None]

    # Validate embedding dimensions before creating numpy array
    if all_embeddings:
        expected_dim = len(all_embeddings[0])
        inconsistent_dims = []
        for i, embedding in enumerate(all_embeddings):
            if len(embedding) != expected_dim:
                inconsistent_dims.append((i, len(embedding)))
    if not all_embeddings:
        raise RuntimeError("No valid embeddings were computed")

        if inconsistent_dims:
            error_msg = f"Ollama returned inconsistent embedding dimensions. Expected {expected_dim}, but got:\n"
            for idx, dim in inconsistent_dims[:10]:  # Show first 10 inconsistent ones
                error_msg += f"  - Text {idx}: {dim} dimensions\n"
            if len(inconsistent_dims) > 10:
                error_msg += f"  ... and {len(inconsistent_dims) - 10} more\n"
            error_msg += (
                f"\nThis is likely an Ollama API bug with model '{model_name}'. Please try:\n"
            )
            error_msg += "1. Restart Ollama service: 'ollama serve'\n"
            error_msg += f"2. Re-pull the model: 'ollama pull {model_name}'\n"
            error_msg += (
                "3. Use sentence-transformers instead: --embedding-mode sentence-transformers\n"
            )
            error_msg += "4. Report this issue to Ollama: https://github.com/ollama/ollama/issues"
            raise ValueError(error_msg)
    # Validate embedding dimensions
    expected_dim = len(all_embeddings[0])
    inconsistent_dims = []
    for i, embedding in enumerate(all_embeddings):
        if len(embedding) != expected_dim:
            inconsistent_dims.append((i, len(embedding)))

    if inconsistent_dims:
        error_msg = f"Ollama returned inconsistent embedding dimensions. Expected {expected_dim}, but got:\n"
        for idx, dim in inconsistent_dims[:10]:  # Show first 10 inconsistent ones
            error_msg += f"  - Text {idx}: {dim} dimensions\n"
        if len(inconsistent_dims) > 10:
            error_msg += f"  ... and {len(inconsistent_dims) - 10} more\n"
        error_msg += f"\nThis is likely an Ollama API bug with model '{model_name}'. Please try:\n"
        error_msg += "1. Restart Ollama service: 'ollama serve'\n"
        error_msg += f"2. Re-pull the model: 'ollama pull {model_name}'\n"
        error_msg += (
            "3. Use sentence-transformers instead: --embedding-mode sentence-transformers\n"
        )
        error_msg += "4. Report this issue to Ollama: https://github.com/ollama/ollama/issues"
        raise ValueError(error_msg)

    # Convert to numpy array and normalize
    embeddings = np.array(all_embeddings, dtype=np.float32)
@@ -4,20 +4,12 @@ Transform your development workflow with intelligent code assistance using LEANN

## Prerequisites

**Step 1:** First, complete the basic LEANN installation following the [📦 Installation guide](../../README.md#installation) in the root README:
Install LEANN globally for MCP integration (with default backend):

```bash
uv venv
source .venv/bin/activate
uv pip install leann
uv tool install leann-core --with leann
```

**Step 2:** Install LEANN globally for MCP integration:
```bash
uv tool install leann-core
```

This makes the `leann` command available system-wide, which `leann_mcp` requires.
This installs the `leann` CLI into an isolated tool environment and includes both backends so `leann build` works out-of-the-box.
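To confirm the tool install worked (standard `uv` commands; output is illustrative):

```bash
uv tool list   # should list leann-core with the leann entry point
leann --help   # should now resolve system-wide
```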
## 🚀 Quick Setup
@@ -45,6 +37,42 @@ leann build my-project --docs ./
claude
```

## 🚀 Advanced Usage Examples

### Index Entire Git Repository
```bash
# Index all tracked files in your git repository (note: git submodules are currently skipped; this could be added back easily)
leann build my-repo --docs $(git ls-files) --embedding-mode sentence-transformers --embedding-model all-MiniLM-L6-v2 --backend hnsw

# Index only specific file types from git
leann build my-python-code --docs $(git ls-files "*.py") --embedding-mode sentence-transformers --embedding-model all-MiniLM-L6-v2 --backend hnsw
```

### Multiple Directories and Files
```bash
# Index multiple directories
leann build my-codebase --docs ./src ./tests ./docs ./config --embedding-mode sentence-transformers --embedding-model all-MiniLM-L6-v2 --backend hnsw

# Mix files and directories
leann build my-project --docs ./README.md ./src/ ./package.json ./docs/ --embedding-mode sentence-transformers --embedding-model all-MiniLM-L6-v2 --backend hnsw

# Specific files only
leann build my-configs --docs ./tsconfig.json ./package.json ./webpack.config.js --embedding-mode sentence-transformers --embedding-model all-MiniLM-L6-v2 --backend hnsw
```

### Advanced Git Integration
```bash
# Index recently modified files
leann build recent-changes --docs $(git diff --name-only HEAD~10..HEAD) --embedding-mode sentence-transformers --embedding-model all-MiniLM-L6-v2 --backend hnsw

# Index files matching pattern
leann build frontend --docs $(git ls-files "*.tsx" "*.ts" "*.jsx" "*.js") --embedding-mode sentence-transformers --embedding-model all-MiniLM-L6-v2 --backend hnsw

# Index documentation and config files
leann build docs-and-configs --docs $(git ls-files "*.md" "*.yml" "*.yaml" "*.json" "*.toml") --embedding-mode sentence-transformers --embedding-model all-MiniLM-L6-v2 --backend hnsw
```

**Try this in Claude Code:**
```
Help me understand this codebase. List available indexes and search for authentication patterns.
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "leann"
version = "0.2.7"
version = "0.2.9"
description = "LEANN - The smallest vector index in the world. RAG Everything with LEANN!"
readme = "README.md"
requires-python = ">=3.9"
62 sky/leann-build.yaml (new file)
@@ -0,0 +1,62 @@
name: leann-build

resources:
  # Choose a GPU for fast embeddings (examples: L4, A10G, A100). CPU also works but is slower.
  accelerators: L4:1
  # Optionally pin a cloud, otherwise SkyPilot will auto-select
  # cloud: aws
  disk_size: 100

env:
  # Build parameters (override with: sky launch -c leann-gpu sky/leann-build.yaml -e key=value)
  index_name: my-index
  docs: ./data
  backend: hnsw # hnsw | diskann
  complexity: 64
  graph_degree: 32
  num_threads: 8
  # Embedding selection
  embedding_mode: sentence-transformers # sentence-transformers | openai | mlx | ollama
  embedding_model: facebook/contriever
  # Storage/latency knobs
  recompute: true # true => selective recomputation; false => store full embeddings
  compact: true # for HNSW only: false when recompute=false
  # Optional pass-through
  extra_args: ""

# Sync local paths to the remote VM. Adjust as needed.
file_mounts:
  # Example: mount your local data directory used for building
  ~/leann-data: ${docs}

setup: |
  set -e
  # Install uv (package manager)
  curl -LsSf https://astral.sh/uv/install.sh | sh
  export PATH="$HOME/.local/bin:$PATH"

  # Install the LEANN CLI globally on the remote machine
  uv tool install leann

run: |
  export PATH="$HOME/.local/bin:$PATH"
  # Derive flags from env
  recompute_flag=""
  if [ "${recompute}" = "false" ] || [ "${recompute}" = "0" ]; then
    recompute_flag="--no-recompute"
  fi
  compact_flag=""
  if [ "${compact}" = "false" ] || [ "${compact}" = "0" ]; then
    compact_flag="--no-compact"
  fi

  # Build command
  leann build ${index_name} \
    --docs ~/leann-data \
    --backend ${backend} \
    --complexity ${complexity} \
    --graph-degree ${graph_degree} \
    --num-threads ${num_threads} \
    --embedding-mode ${embedding_mode} \
    --embedding-model ${embedding_model} \
    ${recompute_flag} ${compact_flag} ${extra_args}
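For example, launching a build with the template's env defaults overridden (flag syntax per SkyPilot's CLI; values are illustrative):

```bash
sky launch -c leann-gpu sky/leann-build.yaml \
  --env index_name=my-docs \
  --env backend=diskann \
  --env recompute=false --env compact=false
```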