Compare commits
44 Commits
v0.2.6
...
fix-mac-in
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
79ec7d1aee | ||
|
|
288d3c4e75 | ||
|
|
7e554b2ba2 | ||
|
|
afd48d5901 | ||
|
|
36083dbf0f | ||
|
|
f819dacbb4 | ||
|
|
6762e5b2c4 | ||
|
|
0797008a3f | ||
|
|
b835eb821e | ||
|
|
a0c790f285 | ||
|
|
b7516608ab | ||
|
|
1bd1238db6 | ||
|
|
b5c80edb03 | ||
|
|
430969565e | ||
|
|
578a89d180 | ||
|
|
068fb38bae | ||
|
|
6aa1a97a07 | ||
|
|
3a1cb49e20 | ||
|
|
239e35e2e6 | ||
|
|
2fac0c6fbf | ||
|
|
fe9381fc8b | ||
|
|
037aad0870 | ||
|
|
ded0decd17 | ||
|
|
5094e6800a | ||
|
|
f08132c525 | ||
|
|
6ae9e0f4f9 | ||
|
|
2be39db799 | ||
|
|
756864d058 | ||
|
|
4fc8943ca7 | ||
|
|
1bc4bf06f0 | ||
|
|
9801aa581b | ||
|
|
8d1e04d7a1 | ||
|
|
f009d2add3 | ||
|
|
1dfc2f3737 | ||
|
|
0543d61572 | ||
|
|
abcc1fed31 | ||
|
|
c1832765cd | ||
|
|
4a5db385f0 | ||
|
|
5f5b97fb54 | ||
|
|
754c9aaedd | ||
|
|
1b01725dd1 | ||
|
|
a620c2077a | ||
|
|
e16c369bfb | ||
|
|
368c587c4f |
85
.github/workflows/build-reusable.yml
vendored
85
.github/workflows/build-reusable.yml
vendored
@@ -54,16 +54,26 @@ jobs:
|
||||
python: '3.12'
|
||||
- os: ubuntu-22.04
|
||||
python: '3.13'
|
||||
- os: macos-latest
|
||||
- os: macos-14
|
||||
python: '3.9'
|
||||
- os: macos-latest
|
||||
- os: macos-14
|
||||
python: '3.10'
|
||||
- os: macos-latest
|
||||
- os: macos-14
|
||||
python: '3.11'
|
||||
- os: macos-latest
|
||||
- os: macos-14
|
||||
python: '3.12'
|
||||
- os: macos-latest
|
||||
- os: macos-14
|
||||
python: '3.13'
|
||||
- os: macos-13
|
||||
python: '3.9'
|
||||
- os: macos-13
|
||||
python: '3.10'
|
||||
- os: macos-13
|
||||
python: '3.11'
|
||||
- os: macos-13
|
||||
python: '3.12'
|
||||
# Note: macos-13 + Python 3.13 excluded due to PyTorch compatibility
|
||||
# (PyTorch 2.5+ supports Python 3.13 but not Intel Mac x86_64)
|
||||
runs-on: ${{ matrix.os }}
|
||||
|
||||
steps:
|
||||
@@ -109,48 +119,59 @@ jobs:
|
||||
uv pip install --system delocate
|
||||
fi
|
||||
|
||||
- name: Set macOS environment variables
|
||||
if: runner.os == 'macOS'
|
||||
run: |
|
||||
# Use brew --prefix to automatically detect Homebrew installation path
|
||||
HOMEBREW_PREFIX=$(brew --prefix)
|
||||
echo "HOMEBREW_PREFIX=${HOMEBREW_PREFIX}" >> $GITHUB_ENV
|
||||
echo "OpenMP_ROOT=${HOMEBREW_PREFIX}/opt/libomp" >> $GITHUB_ENV
|
||||
|
||||
# Set CMAKE_PREFIX_PATH to let CMake find all packages automatically
|
||||
echo "CMAKE_PREFIX_PATH=${HOMEBREW_PREFIX}" >> $GITHUB_ENV
|
||||
|
||||
# Set compiler flags for OpenMP (required for both backends)
|
||||
echo "LDFLAGS=-L${HOMEBREW_PREFIX}/opt/libomp/lib" >> $GITHUB_ENV
|
||||
echo "CPPFLAGS=-I${HOMEBREW_PREFIX}/opt/libomp/include" >> $GITHUB_ENV
|
||||
|
||||
- name: Build packages
|
||||
run: |
|
||||
# Build core (platform independent)
|
||||
if [[ "${{ matrix.os }}" == ubuntu-* ]]; then
|
||||
cd packages/leann-core
|
||||
uv build
|
||||
cd ../..
|
||||
fi
|
||||
cd packages/leann-core
|
||||
uv build
|
||||
cd ../..
|
||||
|
||||
# Build HNSW backend
|
||||
cd packages/leann-backend-hnsw
|
||||
if [ "${{ matrix.os }}" == "macos-latest" ]; then
|
||||
# Use system clang instead of homebrew LLVM for better compatibility
|
||||
if [[ "${{ matrix.os }}" == macos-* ]]; then
|
||||
# Use system clang for better compatibility
|
||||
export CC=clang
|
||||
export CXX=clang++
|
||||
export MACOSX_DEPLOYMENT_TARGET=11.0
|
||||
uv build --wheel --python python
|
||||
uv build --wheel --python ${{ matrix.python }} --find-links ${GITHUB_WORKSPACE}/packages/leann-core/dist
|
||||
else
|
||||
uv build --wheel --python python
|
||||
uv build --wheel --python ${{ matrix.python }} --find-links ${GITHUB_WORKSPACE}/packages/leann-core/dist
|
||||
fi
|
||||
cd ../..
|
||||
|
||||
# Build DiskANN backend
|
||||
cd packages/leann-backend-diskann
|
||||
if [ "${{ matrix.os }}" == "macos-latest" ]; then
|
||||
# Use system clang instead of homebrew LLVM for better compatibility
|
||||
if [[ "${{ matrix.os }}" == macos-* ]]; then
|
||||
# Use system clang for better compatibility
|
||||
export CC=clang
|
||||
export CXX=clang++
|
||||
# DiskANN requires macOS 13.3+ for sgesdd_ LAPACK function
|
||||
export MACOSX_DEPLOYMENT_TARGET=13.3
|
||||
uv build --wheel --python python
|
||||
uv build --wheel --python ${{ matrix.python }} --find-links ${GITHUB_WORKSPACE}/packages/leann-core/dist
|
||||
else
|
||||
uv build --wheel --python python
|
||||
uv build --wheel --python ${{ matrix.python }} --find-links ${GITHUB_WORKSPACE}/packages/leann-core/dist
|
||||
fi
|
||||
cd ../..
|
||||
|
||||
# Build meta package (platform independent)
|
||||
if [[ "${{ matrix.os }}" == ubuntu-* ]]; then
|
||||
cd packages/leann
|
||||
uv build
|
||||
cd ../..
|
||||
fi
|
||||
cd packages/leann
|
||||
uv build
|
||||
cd ../..
|
||||
|
||||
- name: Repair wheels (Linux)
|
||||
if: runner.os == 'Linux'
|
||||
@@ -199,20 +220,18 @@ jobs:
|
||||
echo "📦 Built packages:"
|
||||
find packages/*/dist -name "*.whl" -o -name "*.tar.gz" | sort
|
||||
|
||||
|
||||
- name: Install built packages for testing
|
||||
run: |
|
||||
# Create a virtual environment
|
||||
uv venv
|
||||
# Create a virtual environment with the correct Python version
|
||||
uv venv --python ${{ matrix.python }}
|
||||
source .venv/bin/activate || source .venv/Scripts/activate
|
||||
|
||||
# Install the built wheels
|
||||
# Use --find-links to let uv choose the correct wheel for the platform
|
||||
if [[ "${{ matrix.os }}" == ubuntu-* ]]; then
|
||||
uv pip install leann-core --find-links packages/leann-core/dist
|
||||
uv pip install leann --find-links packages/leann/dist
|
||||
fi
|
||||
uv pip install leann-backend-hnsw --find-links packages/leann-backend-hnsw/dist
|
||||
uv pip install leann-backend-diskann --find-links packages/leann-backend-diskann/dist
|
||||
# Install packages using --find-links to prioritize local builds
|
||||
uv pip install --find-links packages/leann-core/dist --find-links packages/leann-backend-hnsw/dist --find-links packages/leann-backend-diskann/dist packages/leann-core/dist/*.whl || uv pip install --find-links packages/leann-core/dist packages/leann-core/dist/*.tar.gz
|
||||
uv pip install --find-links packages/leann-core/dist packages/leann-backend-hnsw/dist/*.whl
|
||||
uv pip install --find-links packages/leann-core/dist packages/leann-backend-diskann/dist/*.whl
|
||||
uv pip install packages/leann/dist/*.whl || uv pip install packages/leann/dist/*.tar.gz
|
||||
|
||||
# Install test dependencies using extras
|
||||
uv pip install -e ".[test]"
|
||||
|
||||
@@ -3,10 +3,11 @@
|
||||
</p>
|
||||
|
||||
<p align="center">
|
||||
<img src="https://img.shields.io/badge/Python-3.9%2B-blue.svg" alt="Python 3.9+">
|
||||
<img src="https://img.shields.io/badge/Python-3.9%20%7C%203.10%20%7C%203.11%20%7C%203.12%20%7C%203.13-blue.svg" alt="Python Versions">
|
||||
<img src="https://github.com/yichuan-w/LEANN/actions/workflows/build-and-publish.yml/badge.svg" alt="CI Status">
|
||||
<img src="https://img.shields.io/badge/Platform-Ubuntu%20%7C%20macOS%20(ARM64%2FIntel)-lightgrey" alt="Platform">
|
||||
<img src="https://img.shields.io/badge/License-MIT-green.svg" alt="MIT License">
|
||||
<img src="https://img.shields.io/badge/Platform-Linux%20%7C%20macOS-lightgrey" alt="Platform">
|
||||
<img src="https://img.shields.io/badge/MCP-Native%20Integration-blue?style=flat-square" alt="MCP Integration">
|
||||
<img src="https://img.shields.io/badge/MCP-Native%20Integration-blue" alt="MCP Integration">
|
||||
</p>
|
||||
|
||||
<h2 align="center" tabindex="-1" class="heading-element" dir="auto">
|
||||
@@ -189,7 +190,7 @@ All RAG examples share these common parameters. **Interactive mode** is availabl
|
||||
--force-rebuild # Force rebuild index even if it exists
|
||||
|
||||
# Embedding Parameters
|
||||
--embedding-model MODEL # e.g., facebook/contriever, text-embedding-3-small, nomic-embed-text, or mlx-community/multilingual-e5-base-mlx
|
||||
--embedding-model MODEL # e.g., facebook/contriever, text-embedding-3-small, nomic-embed-text, mlx-community/Qwen3-Embedding-0.6B-8bit or nomic-embed-text
|
||||
--embedding-mode MODE # sentence-transformers, openai, mlx, or ollama
|
||||
|
||||
# LLM Parameters (Text generation models)
|
||||
|
||||
@@ -222,9 +222,15 @@ python apps/document_rag.py --query "What are the main techniques LEANN explores
|
||||
|
||||
3. **Use MLX on Apple Silicon** (optional optimization):
|
||||
```bash
|
||||
--embedding-mode mlx --embedding-model mlx-community/multilingual-e5-base-mlx
|
||||
--embedding-mode mlx --embedding-model mlx-community/Qwen3-Embedding-0.6B-8bit
|
||||
```
|
||||
MLX might not be the best choice, as we tested and found that it only offers 1.3x acceleration compared to HF, so maybe using ollama is a better choice for embedding generation
|
||||
|
||||
4. **Use Ollama**
|
||||
```bash
|
||||
--embedding-mode ollama --embedding-model nomic-embed-text
|
||||
```
|
||||
To discover additional embedding models in ollama, check out https://ollama.com/search?c=embedding or read more about embedding models at https://ollama.com/blog/embedding-models, please do check the model size that works best for you
|
||||
### If Search Quality is Poor
|
||||
|
||||
1. **Increase retrieval count**:
|
||||
|
||||
@@ -1,8 +0,0 @@
|
||||
# packages/leann-backend-diskann/CMakeLists.txt (simplified version)
|
||||
|
||||
cmake_minimum_required(VERSION 3.20)
|
||||
project(leann_backend_diskann_wrapper)
|
||||
|
||||
# Tell CMake to directly enter the DiskANN submodule and execute its own CMakeLists.txt
|
||||
# DiskANN will handle everything itself, including compiling Python bindings
|
||||
add_subdirectory(src/third_party/DiskANN)
|
||||
@@ -4,7 +4,7 @@ import os
|
||||
import struct
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Any, Literal
|
||||
from typing import Any, Literal, Optional
|
||||
|
||||
import numpy as np
|
||||
import psutil
|
||||
@@ -259,7 +259,7 @@ class DiskannSearcher(BaseSearcher):
|
||||
prune_ratio: float = 0.0,
|
||||
recompute_embeddings: bool = False,
|
||||
pruning_strategy: Literal["global", "local", "proportional"] = "global",
|
||||
zmq_port: int | None = None,
|
||||
zmq_port: Optional[int] = None,
|
||||
batch_recompute: bool = False,
|
||||
dedup_node_dis: bool = False,
|
||||
**kwargs,
|
||||
|
||||
@@ -10,6 +10,7 @@ import sys
|
||||
import threading
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import numpy as np
|
||||
import zmq
|
||||
@@ -32,7 +33,7 @@ if not logger.handlers:
|
||||
|
||||
|
||||
def create_diskann_embedding_server(
|
||||
passages_file: str | None = None,
|
||||
passages_file: Optional[str] = None,
|
||||
zmq_port: int = 5555,
|
||||
model_name: str = "sentence-transformers/all-mpnet-base-v2",
|
||||
embedding_mode: str = "sentence-transformers",
|
||||
|
||||
@@ -4,8 +4,8 @@ build-backend = "scikit_build_core.build"
|
||||
|
||||
[project]
|
||||
name = "leann-backend-diskann"
|
||||
version = "0.2.6"
|
||||
dependencies = ["leann-core==0.2.6", "numpy", "protobuf>=3.19.0"]
|
||||
version = "0.2.7"
|
||||
dependencies = ["leann-core==0.2.7", "numpy", "protobuf>=3.19.0"]
|
||||
|
||||
[tool.scikit-build]
|
||||
# Key: simplified CMake path
|
||||
@@ -17,3 +17,5 @@ editable.mode = "redirect"
|
||||
cmake.build-type = "Release"
|
||||
build.verbose = true
|
||||
build.tool-args = ["-j8"]
|
||||
# Let CMake find packages via Homebrew prefix
|
||||
cmake.define = {CMAKE_PREFIX_PATH = {env = "CMAKE_PREFIX_PATH"}, OpenMP_ROOT = {env = "OpenMP_ROOT"}}
|
||||
|
||||
Submodule packages/leann-backend-diskann/third_party/DiskANN updated: b2dc4ea2c7...04048bb302
@@ -5,11 +5,20 @@ set(CMAKE_CXX_COMPILER_WORKS 1)
|
||||
|
||||
# Set OpenMP path for macOS
|
||||
if(APPLE)
|
||||
set(OpenMP_C_FLAGS "-Xpreprocessor -fopenmp -I/opt/homebrew/opt/libomp/include")
|
||||
set(OpenMP_CXX_FLAGS "-Xpreprocessor -fopenmp -I/opt/homebrew/opt/libomp/include")
|
||||
# Detect Homebrew installation path (Apple Silicon vs Intel)
|
||||
if(EXISTS "/opt/homebrew/opt/libomp")
|
||||
set(HOMEBREW_PREFIX "/opt/homebrew")
|
||||
elseif(EXISTS "/usr/local/opt/libomp")
|
||||
set(HOMEBREW_PREFIX "/usr/local")
|
||||
else()
|
||||
message(FATAL_ERROR "Could not find libomp installation. Please install with: brew install libomp")
|
||||
endif()
|
||||
|
||||
set(OpenMP_C_FLAGS "-Xpreprocessor -fopenmp -I${HOMEBREW_PREFIX}/opt/libomp/include")
|
||||
set(OpenMP_CXX_FLAGS "-Xpreprocessor -fopenmp -I${HOMEBREW_PREFIX}/opt/libomp/include")
|
||||
set(OpenMP_C_LIB_NAMES "omp")
|
||||
set(OpenMP_CXX_LIB_NAMES "omp")
|
||||
set(OpenMP_omp_LIBRARY "/opt/homebrew/opt/libomp/lib/libomp.dylib")
|
||||
set(OpenMP_omp_LIBRARY "${HOMEBREW_PREFIX}/opt/libomp/lib/libomp.dylib")
|
||||
|
||||
# Force use of system libc++ to avoid version mismatch
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -stdlib=libc++")
|
||||
|
||||
@@ -2,7 +2,7 @@ import logging
|
||||
import os
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
from typing import Any, Literal
|
||||
from typing import Any, Literal, Optional
|
||||
|
||||
import numpy as np
|
||||
from leann.interface import (
|
||||
@@ -152,7 +152,7 @@ class HNSWSearcher(BaseSearcher):
|
||||
self,
|
||||
query: np.ndarray,
|
||||
top_k: int,
|
||||
zmq_port: int | None = None,
|
||||
zmq_port: Optional[int] = None,
|
||||
complexity: int = 64,
|
||||
beam_width: int = 1,
|
||||
prune_ratio: float = 0.0,
|
||||
|
||||
@@ -10,6 +10,7 @@ import sys
|
||||
import threading
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Union
|
||||
|
||||
import msgpack
|
||||
import numpy as np
|
||||
@@ -33,7 +34,7 @@ if not logger.handlers:
|
||||
|
||||
|
||||
def create_hnsw_embedding_server(
|
||||
passages_file: str | None = None,
|
||||
passages_file: Union[str, None] = None,
|
||||
zmq_port: int = 5555,
|
||||
model_name: str = "sentence-transformers/all-mpnet-base-v2",
|
||||
distance_metric: str = "mips",
|
||||
|
||||
@@ -6,10 +6,10 @@ build-backend = "scikit_build_core.build"
|
||||
|
||||
[project]
|
||||
name = "leann-backend-hnsw"
|
||||
version = "0.2.6"
|
||||
version = "0.2.7"
|
||||
description = "Custom-built HNSW (Faiss) backend for the Leann toolkit."
|
||||
dependencies = [
|
||||
"leann-core==0.2.6",
|
||||
"leann-core==0.2.7",
|
||||
"numpy",
|
||||
"pyzmq>=23.0.0",
|
||||
"msgpack>=1.0.0",
|
||||
@@ -22,6 +22,8 @@ cmake.build-type = "Release"
|
||||
build.verbose = true
|
||||
build.tool-args = ["-j8"]
|
||||
|
||||
# CMake definitions to optimize compilation
|
||||
# CMake definitions to optimize compilation and find Homebrew packages
|
||||
[tool.scikit-build.cmake.define]
|
||||
CMAKE_BUILD_PARALLEL_LEVEL = "8"
|
||||
CMAKE_PREFIX_PATH = {env = "CMAKE_PREFIX_PATH"}
|
||||
OpenMP_ROOT = {env = "OpenMP_ROOT"}
|
||||
|
||||
Submodule packages/leann-backend-hnsw/third_party/faiss updated: ff22e2c86b...4a2c0d67d3
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
||||
|
||||
[project]
|
||||
name = "leann-core"
|
||||
version = "0.2.6"
|
||||
version = "0.2.7"
|
||||
description = "Core API and plugin system for LEANN"
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.9"
|
||||
@@ -31,8 +31,10 @@ dependencies = [
|
||||
"PyPDF2>=3.0.0",
|
||||
"pymupdf>=1.23.0",
|
||||
"pdfplumber>=0.10.0",
|
||||
"mlx>=0.26.3; sys_platform == 'darwin'",
|
||||
"mlx-lm>=0.26.0; sys_platform == 'darwin'",
|
||||
"nbconvert>=7.0.0", # For .ipynb file support
|
||||
"gitignore-parser>=0.1.12", # For proper .gitignore handling
|
||||
"mlx>=0.26.3; sys_platform == 'darwin' and platform_machine == 'arm64'",
|
||||
"mlx-lm>=0.26.0; sys_platform == 'darwin' and platform_machine == 'arm64'",
|
||||
]
|
||||
|
||||
[project.optional-dependencies]
|
||||
|
||||
@@ -10,7 +10,7 @@ import time
|
||||
import warnings
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Any, Literal
|
||||
from typing import Any, Literal, Optional
|
||||
|
||||
import numpy as np
|
||||
|
||||
@@ -33,7 +33,7 @@ def compute_embeddings(
|
||||
model_name: str,
|
||||
mode: str = "sentence-transformers",
|
||||
use_server: bool = True,
|
||||
port: int | None = None,
|
||||
port: Optional[int] = None,
|
||||
is_build=False,
|
||||
) -> np.ndarray:
|
||||
"""
|
||||
@@ -157,12 +157,12 @@ class LeannBuilder:
|
||||
self,
|
||||
backend_name: str,
|
||||
embedding_model: str = "facebook/contriever",
|
||||
dimensions: int | None = None,
|
||||
dimensions: Optional[int] = None,
|
||||
embedding_mode: str = "sentence-transformers",
|
||||
**backend_kwargs,
|
||||
):
|
||||
self.backend_name = backend_name
|
||||
backend_factory: LeannBackendFactoryInterface | None = BACKEND_REGISTRY.get(backend_name)
|
||||
backend_factory: Optional[LeannBackendFactoryInterface] = BACKEND_REGISTRY.get(backend_name)
|
||||
if backend_factory is None:
|
||||
raise ValueError(f"Backend '{backend_name}' not found or not registered.")
|
||||
self.backend_factory = backend_factory
|
||||
@@ -242,7 +242,7 @@ class LeannBuilder:
|
||||
self.backend_kwargs = backend_kwargs
|
||||
self.chunks: list[dict[str, Any]] = []
|
||||
|
||||
def add_text(self, text: str, metadata: dict[str, Any] | None = None):
|
||||
def add_text(self, text: str, metadata: Optional[dict[str, Any]] = None):
|
||||
if metadata is None:
|
||||
metadata = {}
|
||||
passage_id = metadata.get("id", str(len(self.chunks)))
|
||||
@@ -554,7 +554,7 @@ class LeannSearcher:
|
||||
if "labels" in results and "distances" in results:
|
||||
logger.info(f" Processing {len(results['labels'][0])} passage IDs:")
|
||||
for i, (string_id, dist) in enumerate(
|
||||
zip(results["labels"][0], results["distances"][0], strict=False)
|
||||
zip(results["labels"][0], results["distances"][0])
|
||||
):
|
||||
try:
|
||||
passage_data = self.passage_manager.get_passage(string_id)
|
||||
@@ -592,7 +592,7 @@ class LeannChat:
|
||||
def __init__(
|
||||
self,
|
||||
index_path: str,
|
||||
llm_config: dict[str, Any] | None = None,
|
||||
llm_config: Optional[dict[str, Any]] = None,
|
||||
enable_warmup: bool = False,
|
||||
**kwargs,
|
||||
):
|
||||
@@ -608,7 +608,7 @@ class LeannChat:
|
||||
prune_ratio: float = 0.0,
|
||||
recompute_embeddings: bool = True,
|
||||
pruning_strategy: Literal["global", "local", "proportional"] = "global",
|
||||
llm_kwargs: dict[str, Any] | None = None,
|
||||
llm_kwargs: Optional[dict[str, Any]] = None,
|
||||
expected_zmq_port: int = 5557,
|
||||
**search_kwargs,
|
||||
):
|
||||
|
||||
@@ -8,7 +8,7 @@ import difflib
|
||||
import logging
|
||||
import os
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Any
|
||||
from typing import Any, Optional
|
||||
|
||||
import torch
|
||||
|
||||
@@ -311,7 +311,7 @@ def search_hf_models(query: str, limit: int = 10) -> list[str]:
|
||||
|
||||
def validate_model_and_suggest(
|
||||
model_name: str, llm_type: str, host: str = "http://localhost:11434"
|
||||
) -> str | None:
|
||||
) -> Optional[str]:
|
||||
"""Validate model name and provide suggestions if invalid"""
|
||||
if llm_type == "ollama":
|
||||
available_models = check_ollama_models(host)
|
||||
@@ -685,7 +685,7 @@ class HFChat(LLMInterface):
|
||||
class OpenAIChat(LLMInterface):
|
||||
"""LLM interface for OpenAI models."""
|
||||
|
||||
def __init__(self, model: str = "gpt-4o", api_key: str | None = None):
|
||||
def __init__(self, model: str = "gpt-4o", api_key: Optional[str] = None):
|
||||
self.model = model
|
||||
self.api_key = api_key or os.getenv("OPENAI_API_KEY")
|
||||
|
||||
@@ -761,7 +761,7 @@ class SimulatedChat(LLMInterface):
|
||||
return "This is a simulated answer from the LLM based on the retrieved context."
|
||||
|
||||
|
||||
def get_llm(llm_config: dict[str, Any] | None = None) -> LLMInterface:
|
||||
def get_llm(llm_config: Optional[dict[str, Any]] = None) -> LLMInterface:
|
||||
"""
|
||||
Factory function to get an LLM interface based on configuration.
|
||||
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
import argparse
|
||||
import asyncio
|
||||
from pathlib import Path
|
||||
from typing import Union
|
||||
|
||||
from llama_index.core import SimpleDirectoryReader
|
||||
from llama_index.core.node_parser import SentenceSplitter
|
||||
@@ -203,62 +204,36 @@ Examples:
|
||||
with open(global_registry, "w") as f:
|
||||
json.dump(projects, f, indent=2)
|
||||
|
||||
def _read_gitignore_patterns(self, docs_dir: str) -> list[str]:
|
||||
"""Read .gitignore file and return patterns for exclusion."""
|
||||
gitignore_path = Path(docs_dir) / ".gitignore"
|
||||
patterns = []
|
||||
def _build_gitignore_parser(self, docs_dir: str):
|
||||
"""Build gitignore parser using gitignore-parser library."""
|
||||
from gitignore_parser import parse_gitignore
|
||||
|
||||
# Add some essential patterns that should always be excluded
|
||||
essential_patterns = [
|
||||
".git",
|
||||
".DS_Store",
|
||||
]
|
||||
patterns.extend(essential_patterns)
|
||||
# Try to parse the root .gitignore
|
||||
gitignore_path = Path(docs_dir) / ".gitignore"
|
||||
|
||||
if gitignore_path.exists():
|
||||
try:
|
||||
with open(gitignore_path, encoding="utf-8") as f:
|
||||
for line in f:
|
||||
line = line.strip()
|
||||
# Skip empty lines and comments
|
||||
if line and not line.startswith("#"):
|
||||
# Remove leading slash if present (make it relative)
|
||||
if line.startswith("/"):
|
||||
line = line[1:]
|
||||
patterns.append(line)
|
||||
print(
|
||||
f"📋 Loaded {len(patterns) - len(essential_patterns)} patterns from .gitignore"
|
||||
)
|
||||
# gitignore-parser automatically handles all subdirectory .gitignore files!
|
||||
matches = parse_gitignore(str(gitignore_path))
|
||||
print(f"📋 Loaded .gitignore from {docs_dir} (includes all subdirectories)")
|
||||
return matches
|
||||
except Exception as e:
|
||||
print(f"Warning: Could not read .gitignore: {e}")
|
||||
print(f"Warning: Could not parse .gitignore: {e}")
|
||||
else:
|
||||
print("📋 No .gitignore found, using minimal exclusion patterns")
|
||||
print("📋 No .gitignore found")
|
||||
|
||||
return patterns
|
||||
# Fallback: basic pattern matching for essential files
|
||||
essential_patterns = {".git", ".DS_Store", "__pycache__", "node_modules", ".venv", "venv"}
|
||||
|
||||
def _should_exclude_file(self, relative_path: Path, exclude_patterns: list[str]) -> bool:
|
||||
"""Check if a file should be excluded based on gitignore-style patterns."""
|
||||
path_str = str(relative_path)
|
||||
def basic_matches(file_path):
|
||||
path_parts = Path(file_path).parts
|
||||
return any(part in essential_patterns for part in path_parts)
|
||||
|
||||
for pattern in exclude_patterns:
|
||||
# Simple pattern matching (could be enhanced with full gitignore syntax)
|
||||
if pattern.endswith("*"):
|
||||
# Wildcard pattern
|
||||
prefix = pattern[:-1]
|
||||
if path_str.startswith(prefix):
|
||||
return True
|
||||
elif "*" in pattern:
|
||||
# Contains wildcard - simple glob-like matching
|
||||
import fnmatch
|
||||
return basic_matches
|
||||
|
||||
if fnmatch.fnmatch(path_str, pattern):
|
||||
return True
|
||||
else:
|
||||
# Exact match or directory match
|
||||
if path_str == pattern or path_str.startswith(pattern + "/"):
|
||||
return True
|
||||
|
||||
return False
|
||||
def _should_exclude_file(self, relative_path: Path, gitignore_matches) -> bool:
|
||||
"""Check if a file should be excluded using gitignore parser."""
|
||||
return gitignore_matches(str(relative_path))
|
||||
|
||||
def list_indexes(self):
|
||||
print("Stored LEANN indexes:")
|
||||
@@ -336,13 +311,13 @@ Examples:
|
||||
print(f' leann search {example_name} "your query"')
|
||||
print(f" leann ask {example_name} --interactive")
|
||||
|
||||
def load_documents(self, docs_dir: str, custom_file_types: str | None = None):
|
||||
def load_documents(self, docs_dir: str, custom_file_types: Union[str, None] = None):
|
||||
print(f"Loading documents from {docs_dir}...")
|
||||
if custom_file_types:
|
||||
print(f"Using custom file types: {custom_file_types}")
|
||||
|
||||
# Read .gitignore patterns first
|
||||
exclude_patterns = self._read_gitignore_patterns(docs_dir)
|
||||
# Build gitignore parser
|
||||
gitignore_matches = self._build_gitignore_parser(docs_dir)
|
||||
|
||||
# Try to use better PDF parsers first, but only if PDFs are requested
|
||||
documents = []
|
||||
@@ -355,7 +330,7 @@ Examples:
|
||||
for file_path in docs_path.rglob("*.pdf"):
|
||||
# Check if file matches any exclude pattern
|
||||
relative_path = file_path.relative_to(docs_path)
|
||||
if self._should_exclude_file(relative_path, exclude_patterns):
|
||||
if self._should_exclude_file(relative_path, gitignore_matches):
|
||||
continue
|
||||
|
||||
print(f"Processing PDF: {file_path}")
|
||||
@@ -449,14 +424,34 @@ Examples:
|
||||
]
|
||||
# Try to load other file types, but don't fail if none are found
|
||||
try:
|
||||
# Create a custom file filter function using our PathSpec
|
||||
def file_filter(file_path: str) -> bool:
|
||||
"""Return True if file should be included (not excluded)"""
|
||||
try:
|
||||
docs_path_obj = Path(docs_dir)
|
||||
file_path_obj = Path(file_path)
|
||||
relative_path = file_path_obj.relative_to(docs_path_obj)
|
||||
return not self._should_exclude_file(relative_path, gitignore_matches)
|
||||
except (ValueError, OSError):
|
||||
return True # Include files that can't be processed
|
||||
|
||||
other_docs = SimpleDirectoryReader(
|
||||
docs_dir,
|
||||
recursive=True,
|
||||
encoding="utf-8",
|
||||
required_exts=code_extensions,
|
||||
exclude=exclude_patterns,
|
||||
file_extractor={}, # Use default extractors
|
||||
filename_as_id=True,
|
||||
).load_data(show_progress=True)
|
||||
documents.extend(other_docs)
|
||||
|
||||
# Filter documents after loading based on gitignore rules
|
||||
filtered_docs = []
|
||||
for doc in other_docs:
|
||||
file_path = doc.metadata.get("file_path", "")
|
||||
if file_filter(file_path):
|
||||
filtered_docs.append(doc)
|
||||
|
||||
documents.extend(filtered_docs)
|
||||
except ValueError as e:
|
||||
if "No files found" in str(e):
|
||||
print("No additional files found for other supported types.")
|
||||
|
||||
@@ -6,6 +6,7 @@ import subprocess
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import psutil
|
||||
|
||||
@@ -182,8 +183,8 @@ class EmbeddingServerManager:
|
||||
e.g., "leann_backend_diskann.embedding_server"
|
||||
"""
|
||||
self.backend_module_name = backend_module_name
|
||||
self.server_process: subprocess.Popen | None = None
|
||||
self.server_port: int | None = None
|
||||
self.server_process: Optional[subprocess.Popen] = None
|
||||
self.server_port: Optional[int] = None
|
||||
self._atexit_registered = False
|
||||
|
||||
def start_server(
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Any, Literal
|
||||
from typing import Any, Literal, Union
|
||||
|
||||
import numpy as np
|
||||
|
||||
@@ -34,7 +34,9 @@ class LeannBackendSearcherInterface(ABC):
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def _ensure_server_running(self, passages_source_file: str, port: int | None, **kwargs) -> int:
|
||||
def _ensure_server_running(
|
||||
self, passages_source_file: str, port: Union[int, None], **kwargs
|
||||
) -> int:
|
||||
"""Ensure server is running"""
|
||||
pass
|
||||
|
||||
@@ -48,7 +50,7 @@ class LeannBackendSearcherInterface(ABC):
|
||||
prune_ratio: float = 0.0,
|
||||
recompute_embeddings: bool = False,
|
||||
pruning_strategy: Literal["global", "local", "proportional"] = "global",
|
||||
zmq_port: int | None = None,
|
||||
zmq_port: Union[int, None] = None,
|
||||
**kwargs,
|
||||
) -> dict[str, Any]:
|
||||
"""Search for nearest neighbors
|
||||
@@ -74,7 +76,7 @@ class LeannBackendSearcherInterface(ABC):
|
||||
self,
|
||||
query: str,
|
||||
use_server_if_available: bool = True,
|
||||
zmq_port: int | None = None,
|
||||
zmq_port: Union[int, None] = None,
|
||||
) -> np.ndarray:
|
||||
"""Compute embedding for a query string
|
||||
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
import json
|
||||
from abc import ABC, abstractmethod
|
||||
from pathlib import Path
|
||||
from typing import Any, Literal
|
||||
from typing import Any, Literal, Optional
|
||||
|
||||
import numpy as np
|
||||
|
||||
@@ -169,7 +169,7 @@ class BaseSearcher(LeannBackendSearcherInterface, ABC):
|
||||
prune_ratio: float = 0.0,
|
||||
recompute_embeddings: bool = False,
|
||||
pruning_strategy: Literal["global", "local", "proportional"] = "global",
|
||||
zmq_port: int | None = None,
|
||||
zmq_port: Optional[int] = None,
|
||||
**kwargs,
|
||||
) -> dict[str, Any]:
|
||||
"""
|
||||
|
||||
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
||||
|
||||
[project]
|
||||
name = "leann"
|
||||
version = "0.2.6"
|
||||
version = "0.2.7"
|
||||
description = "LEANN - The smallest vector index in the world. RAG Everything with LEANN!"
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.9"
|
||||
|
||||
@@ -32,7 +32,7 @@ dependencies = [
|
||||
"pypdfium2>=4.30.0",
|
||||
# LlamaIndex core and readers - updated versions
|
||||
"llama-index>=0.12.44",
|
||||
"llama-index-readers-file>=0.4.0", # Essential for PDF parsing
|
||||
"llama-index-readers-file>=0.4.0", # Essential for PDF parsing
|
||||
# "llama-index-readers-docling", # Requires Python >= 3.10
|
||||
# "llama-index-node-parser-docling", # Requires Python >= 3.10
|
||||
"llama-index-vector-stores-faiss>=0.4.0",
|
||||
@@ -40,9 +40,12 @@ dependencies = [
|
||||
# Other dependencies
|
||||
"ipykernel==6.29.5",
|
||||
"msgpack>=1.1.1",
|
||||
"mlx>=0.26.3; sys_platform == 'darwin'",
|
||||
"mlx-lm>=0.26.0; sys_platform == 'darwin'",
|
||||
"mlx>=0.26.3; sys_platform == 'darwin' and platform_machine == 'arm64'",
|
||||
"mlx-lm>=0.26.0; sys_platform == 'darwin' and platform_machine == 'arm64'",
|
||||
"psutil>=5.8.0",
|
||||
"pathspec>=0.12.1",
|
||||
"nbconvert>=7.16.6",
|
||||
"gitignore-parser>=0.1.12",
|
||||
]
|
||||
|
||||
[project.optional-dependencies]
|
||||
@@ -88,7 +91,7 @@ leann-backend-diskann = { path = "packages/leann-backend-diskann", editable = tr
|
||||
leann-backend-hnsw = { path = "packages/leann-backend-hnsw", editable = true }
|
||||
|
||||
[tool.ruff]
|
||||
target-version = "py310"
|
||||
target-version = "py39"
|
||||
line-length = 100
|
||||
extend-exclude = [
|
||||
"third_party",
|
||||
|
||||
Reference in New Issue
Block a user