Compare commits

..

8 Commits

Author SHA1 Message Date
Andy Lee
38ec6aae11 improve CLI with auto project name and .gitignore support
- Make index_name optional, auto-use current directory name
- Read .gitignore patterns and respect them during indexing
- Add _read_gitignore_patterns() to parse .gitignore files
- Add _should_exclude_file() for pattern matching
- Apply exclusion patterns to both PDF and general file processing
- Show helpful messages about gitignore usage

Now users can simply run: leann build
And it will use project name + respect .gitignore patterns.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-08-09 19:38:38 -07:00
Andy Lee
1e5d05e36a remove leann_index from MCP interface
Users should use CLI command 'leann build' to create indexes first.
MCP now only provides search functionality:
- leann_search: search existing indexes
- leann_status: check index health
- leann_list: list available indexes

This separates index creation (CLI) from search (Claude Code).

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-08-09 19:28:40 -07:00
Andy Lee
5d21f5bd9d simplify MCP interface for Claude Code
- Remove unnecessary search parameters: search_mode, recompute_embeddings, file_types, min_score
- Remove leann_clear tool (not needed for Claude Code workflow)
- Streamline search to only use: query, index_name, top_k, complexity
- Keep core tools: leann_index, leann_search, leann_status, leann_list

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-08-09 19:01:39 -07:00
Andy Lee
42690cb74e docs: remove ollama embedding extra instructions 2025-08-09 16:46:47 -07:00
Andy Lee
a2a5b0db1b Merge branch 'main' into feat/claude-code-refine 2025-08-09 00:39:11 -07:00
Andy Lee
67c5a3e838 fix: remove leann_ask 2025-08-09 00:28:25 -07:00
Andy Lee
1071479c05 docs: Add clear documentation for Ollama embedding usage 2025-08-08 18:09:06 -07:00
Andy Lee
068fcd71cf feat: Add Ollama embedding support for local embedding models 2025-08-08 18:07:37 -07:00
23 changed files with 3625 additions and 4013 deletions

View File

@@ -54,26 +54,16 @@ jobs:
python: '3.12' python: '3.12'
- os: ubuntu-22.04 - os: ubuntu-22.04
python: '3.13' python: '3.13'
- os: macos-14 - os: macos-latest
python: '3.9' python: '3.9'
- os: macos-14 - os: macos-latest
python: '3.10' python: '3.10'
- os: macos-14 - os: macos-latest
python: '3.11' python: '3.11'
- os: macos-14 - os: macos-latest
python: '3.12' python: '3.12'
- os: macos-14 - os: macos-latest
python: '3.13' python: '3.13'
- os: macos-13
python: '3.9'
- os: macos-13
python: '3.10'
- os: macos-13
python: '3.11'
- os: macos-13
python: '3.12'
# Note: macos-13 + Python 3.13 excluded due to PyTorch compatibility
# (PyTorch 2.5+ supports Python 3.13 but not Intel Mac x86_64)
runs-on: ${{ matrix.os }} runs-on: ${{ matrix.os }}
steps: steps:
@@ -119,59 +109,48 @@ jobs:
uv pip install --system delocate uv pip install --system delocate
fi fi
- name: Set macOS environment variables
if: runner.os == 'macOS'
run: |
# Use brew --prefix to automatically detect Homebrew installation path
HOMEBREW_PREFIX=$(brew --prefix)
echo "HOMEBREW_PREFIX=${HOMEBREW_PREFIX}" >> $GITHUB_ENV
echo "OpenMP_ROOT=${HOMEBREW_PREFIX}/opt/libomp" >> $GITHUB_ENV
# Set CMAKE_PREFIX_PATH to let CMake find all packages automatically
echo "CMAKE_PREFIX_PATH=${HOMEBREW_PREFIX}" >> $GITHUB_ENV
# Set compiler flags for OpenMP (required for both backends)
echo "LDFLAGS=-L${HOMEBREW_PREFIX}/opt/libomp/lib" >> $GITHUB_ENV
echo "CPPFLAGS=-I${HOMEBREW_PREFIX}/opt/libomp/include" >> $GITHUB_ENV
- name: Build packages - name: Build packages
run: | run: |
# Build core (platform independent) # Build core (platform independent)
cd packages/leann-core if [[ "${{ matrix.os }}" == ubuntu-* ]]; then
uv build cd packages/leann-core
cd ../.. uv build
cd ../..
fi
# Build HNSW backend # Build HNSW backend
cd packages/leann-backend-hnsw cd packages/leann-backend-hnsw
if [[ "${{ matrix.os }}" == macos-* ]]; then if [ "${{ matrix.os }}" == "macos-latest" ]; then
# Use system clang for better compatibility # Use system clang instead of homebrew LLVM for better compatibility
export CC=clang export CC=clang
export CXX=clang++ export CXX=clang++
export MACOSX_DEPLOYMENT_TARGET=11.0 export MACOSX_DEPLOYMENT_TARGET=11.0
uv build --wheel --python ${{ matrix.python }} --find-links ${GITHUB_WORKSPACE}/packages/leann-core/dist uv build --wheel --python python
else else
uv build --wheel --python ${{ matrix.python }} --find-links ${GITHUB_WORKSPACE}/packages/leann-core/dist uv build --wheel --python python
fi fi
cd ../.. cd ../..
# Build DiskANN backend # Build DiskANN backend
cd packages/leann-backend-diskann cd packages/leann-backend-diskann
if [[ "${{ matrix.os }}" == macos-* ]]; then if [ "${{ matrix.os }}" == "macos-latest" ]; then
# Use system clang for better compatibility # Use system clang instead of homebrew LLVM for better compatibility
export CC=clang export CC=clang
export CXX=clang++ export CXX=clang++
# DiskANN requires macOS 13.3+ for sgesdd_ LAPACK function # DiskANN requires macOS 13.3+ for sgesdd_ LAPACK function
export MACOSX_DEPLOYMENT_TARGET=13.3 export MACOSX_DEPLOYMENT_TARGET=13.3
uv build --wheel --python ${{ matrix.python }} --find-links ${GITHUB_WORKSPACE}/packages/leann-core/dist uv build --wheel --python python
else else
uv build --wheel --python ${{ matrix.python }} --find-links ${GITHUB_WORKSPACE}/packages/leann-core/dist uv build --wheel --python python
fi fi
cd ../.. cd ../..
# Build meta package (platform independent) # Build meta package (platform independent)
cd packages/leann if [[ "${{ matrix.os }}" == ubuntu-* ]]; then
uv build cd packages/leann
cd ../.. uv build
cd ../..
fi
- name: Repair wheels (Linux) - name: Repair wheels (Linux)
if: runner.os == 'Linux' if: runner.os == 'Linux'
@@ -220,18 +199,20 @@ jobs:
echo "📦 Built packages:" echo "📦 Built packages:"
find packages/*/dist -name "*.whl" -o -name "*.tar.gz" | sort find packages/*/dist -name "*.whl" -o -name "*.tar.gz" | sort
- name: Install built packages for testing - name: Install built packages for testing
run: | run: |
# Create a virtual environment with the correct Python version # Create a virtual environment
uv venv --python ${{ matrix.python }} uv venv
source .venv/bin/activate || source .venv/Scripts/activate source .venv/bin/activate || source .venv/Scripts/activate
# Install packages using --find-links to prioritize local builds # Install the built wheels
uv pip install --find-links packages/leann-core/dist --find-links packages/leann-backend-hnsw/dist --find-links packages/leann-backend-diskann/dist packages/leann-core/dist/*.whl || uv pip install --find-links packages/leann-core/dist packages/leann-core/dist/*.tar.gz # Use --find-links to let uv choose the correct wheel for the platform
uv pip install --find-links packages/leann-core/dist packages/leann-backend-hnsw/dist/*.whl if [[ "${{ matrix.os }}" == ubuntu-* ]]; then
uv pip install --find-links packages/leann-core/dist packages/leann-backend-diskann/dist/*.whl uv pip install leann-core --find-links packages/leann-core/dist
uv pip install packages/leann/dist/*.whl || uv pip install packages/leann/dist/*.tar.gz uv pip install leann --find-links packages/leann/dist
fi
uv pip install leann-backend-hnsw --find-links packages/leann-backend-hnsw/dist
uv pip install leann-backend-diskann --find-links packages/leann-backend-diskann/dist
# Install test dependencies using extras # Install test dependencies using extras
uv pip install -e ".[test]" uv pip install -e ".[test]"

View File

@@ -3,11 +3,10 @@
</p> </p>
<p align="center"> <p align="center">
<img src="https://img.shields.io/badge/Python-3.9%20%7C%203.10%20%7C%203.11%20%7C%203.12%20%7C%203.13-blue.svg" alt="Python Versions"> <img src="https://img.shields.io/badge/Python-3.9%2B-blue.svg" alt="Python 3.9+">
<img src="https://github.com/yichuan-w/LEANN/actions/workflows/build-and-publish.yml/badge.svg" alt="CI Status">
<img src="https://img.shields.io/badge/Platform-Ubuntu%20%7C%20macOS%20(ARM64%2FIntel)-lightgrey" alt="Platform">
<img src="https://img.shields.io/badge/License-MIT-green.svg" alt="MIT License"> <img src="https://img.shields.io/badge/License-MIT-green.svg" alt="MIT License">
<img src="https://img.shields.io/badge/MCP-Native%20Integration-blue" alt="MCP Integration"> <img src="https://img.shields.io/badge/Platform-Linux%20%7C%20macOS-lightgrey" alt="Platform">
<img src="https://img.shields.io/badge/MCP-Native%20Integration-blue?style=flat-square" alt="MCP Integration">
</p> </p>
<h2 align="center" tabindex="-1" class="heading-element" dir="auto"> <h2 align="center" tabindex="-1" class="heading-element" dir="auto">
@@ -190,7 +189,7 @@ All RAG examples share these common parameters. **Interactive mode** is availabl
--force-rebuild # Force rebuild index even if it exists --force-rebuild # Force rebuild index even if it exists
# Embedding Parameters # Embedding Parameters
--embedding-model MODEL # e.g., facebook/contriever, text-embedding-3-small, nomic-embed-text, mlx-community/Qwen3-Embedding-0.6B-8bit or nomic-embed-text --embedding-model MODEL # e.g., facebook/contriever, text-embedding-3-small, nomic-embed-text, or mlx-community/multilingual-e5-base-mlx
--embedding-mode MODE # sentence-transformers, openai, mlx, or ollama --embedding-mode MODE # sentence-transformers, openai, mlx, or ollama
# LLM Parameters (Text generation models) # LLM Parameters (Text generation models)

View File

@@ -222,15 +222,9 @@ python apps/document_rag.py --query "What are the main techniques LEANN explores
3. **Use MLX on Apple Silicon** (optional optimization): 3. **Use MLX on Apple Silicon** (optional optimization):
```bash ```bash
--embedding-mode mlx --embedding-model mlx-community/Qwen3-Embedding-0.6B-8bit --embedding-mode mlx --embedding-model mlx-community/multilingual-e5-base-mlx
``` ```
MLX may not be the best choice: in our tests it offered only about a 1.3x speedup over Hugging Face (HF), so Ollama may be a better option for embedding generation.
4. **Use Ollama**:
```bash
--embedding-mode ollama --embedding-model nomic-embed-text
```
To discover additional embedding models in Ollama, check out https://ollama.com/search?c=embedding, or read more about embedding models at https://ollama.com/blog/embedding-models. Be sure to check which model size works best for you.
### If Search Quality is Poor ### If Search Quality is Poor
1. **Increase retrieval count**: 1. **Increase retrieval count**:

View File

@@ -0,0 +1,8 @@
# packages/leann-backend-diskann/CMakeLists.txt (simplified version)
# Thin wrapper project: the commands below declare the project and then
# delegate to the vendored DiskANN source tree; nothing else is built here.
cmake_minimum_required(VERSION 3.20)
project(leann_backend_diskann_wrapper)
# Tell CMake to directly enter the DiskANN submodule and execute its own CMakeLists.txt
# (add_subdirectory() processes src/third_party/DiskANN/CMakeLists.txt as part of this build).
# DiskANN is expected to handle everything itself, including compiling Python bindings
# NOTE(review): that behavior lives in the submodule and is not visible from this file - confirm there.
add_subdirectory(src/third_party/DiskANN)

View File

@@ -4,7 +4,7 @@ import os
import struct import struct
import sys import sys
from pathlib import Path from pathlib import Path
from typing import Any, Literal, Optional from typing import Any, Literal
import numpy as np import numpy as np
import psutil import psutil
@@ -259,7 +259,7 @@ class DiskannSearcher(BaseSearcher):
prune_ratio: float = 0.0, prune_ratio: float = 0.0,
recompute_embeddings: bool = False, recompute_embeddings: bool = False,
pruning_strategy: Literal["global", "local", "proportional"] = "global", pruning_strategy: Literal["global", "local", "proportional"] = "global",
zmq_port: Optional[int] = None, zmq_port: int | None = None,
batch_recompute: bool = False, batch_recompute: bool = False,
dedup_node_dis: bool = False, dedup_node_dis: bool = False,
**kwargs, **kwargs,

View File

@@ -10,7 +10,6 @@ import sys
import threading import threading
import time import time
from pathlib import Path from pathlib import Path
from typing import Optional
import numpy as np import numpy as np
import zmq import zmq
@@ -33,7 +32,7 @@ if not logger.handlers:
def create_diskann_embedding_server( def create_diskann_embedding_server(
passages_file: Optional[str] = None, passages_file: str | None = None,
zmq_port: int = 5555, zmq_port: int = 5555,
model_name: str = "sentence-transformers/all-mpnet-base-v2", model_name: str = "sentence-transformers/all-mpnet-base-v2",
embedding_mode: str = "sentence-transformers", embedding_mode: str = "sentence-transformers",

View File

@@ -4,8 +4,8 @@ build-backend = "scikit_build_core.build"
[project] [project]
name = "leann-backend-diskann" name = "leann-backend-diskann"
version = "0.2.7" version = "0.2.5"
dependencies = ["leann-core==0.2.7", "numpy", "protobuf>=3.19.0"] dependencies = ["leann-core==0.2.5", "numpy", "protobuf>=3.19.0"]
[tool.scikit-build] [tool.scikit-build]
# Key: simplified CMake path # Key: simplified CMake path
@@ -17,5 +17,3 @@ editable.mode = "redirect"
cmake.build-type = "Release" cmake.build-type = "Release"
build.verbose = true build.verbose = true
build.tool-args = ["-j8"] build.tool-args = ["-j8"]
# Let CMake find packages via Homebrew prefix
cmake.define = {CMAKE_PREFIX_PATH = {env = "CMAKE_PREFIX_PATH"}, OpenMP_ROOT = {env = "OpenMP_ROOT"}}

View File

@@ -5,20 +5,11 @@ set(CMAKE_CXX_COMPILER_WORKS 1)
# Set OpenMP path for macOS # Set OpenMP path for macOS
if(APPLE) if(APPLE)
# Detect Homebrew installation path (Apple Silicon vs Intel) set(OpenMP_C_FLAGS "-Xpreprocessor -fopenmp -I/opt/homebrew/opt/libomp/include")
if(EXISTS "/opt/homebrew/opt/libomp") set(OpenMP_CXX_FLAGS "-Xpreprocessor -fopenmp -I/opt/homebrew/opt/libomp/include")
set(HOMEBREW_PREFIX "/opt/homebrew")
elseif(EXISTS "/usr/local/opt/libomp")
set(HOMEBREW_PREFIX "/usr/local")
else()
message(FATAL_ERROR "Could not find libomp installation. Please install with: brew install libomp")
endif()
set(OpenMP_C_FLAGS "-Xpreprocessor -fopenmp -I${HOMEBREW_PREFIX}/opt/libomp/include")
set(OpenMP_CXX_FLAGS "-Xpreprocessor -fopenmp -I${HOMEBREW_PREFIX}/opt/libomp/include")
set(OpenMP_C_LIB_NAMES "omp") set(OpenMP_C_LIB_NAMES "omp")
set(OpenMP_CXX_LIB_NAMES "omp") set(OpenMP_CXX_LIB_NAMES "omp")
set(OpenMP_omp_LIBRARY "${HOMEBREW_PREFIX}/opt/libomp/lib/libomp.dylib") set(OpenMP_omp_LIBRARY "/opt/homebrew/opt/libomp/lib/libomp.dylib")
# Force use of system libc++ to avoid version mismatch # Force use of system libc++ to avoid version mismatch
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -stdlib=libc++") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -stdlib=libc++")

View File

@@ -2,7 +2,7 @@ import logging
import os import os
import shutil import shutil
from pathlib import Path from pathlib import Path
from typing import Any, Literal, Optional from typing import Any, Literal
import numpy as np import numpy as np
from leann.interface import ( from leann.interface import (
@@ -152,7 +152,7 @@ class HNSWSearcher(BaseSearcher):
self, self,
query: np.ndarray, query: np.ndarray,
top_k: int, top_k: int,
zmq_port: Optional[int] = None, zmq_port: int | None = None,
complexity: int = 64, complexity: int = 64,
beam_width: int = 1, beam_width: int = 1,
prune_ratio: float = 0.0, prune_ratio: float = 0.0,

View File

@@ -10,7 +10,6 @@ import sys
import threading import threading
import time import time
from pathlib import Path from pathlib import Path
from typing import Union
import msgpack import msgpack
import numpy as np import numpy as np
@@ -34,7 +33,7 @@ if not logger.handlers:
def create_hnsw_embedding_server( def create_hnsw_embedding_server(
passages_file: Union[str, None] = None, passages_file: str | None = None,
zmq_port: int = 5555, zmq_port: int = 5555,
model_name: str = "sentence-transformers/all-mpnet-base-v2", model_name: str = "sentence-transformers/all-mpnet-base-v2",
distance_metric: str = "mips", distance_metric: str = "mips",

View File

@@ -6,10 +6,10 @@ build-backend = "scikit_build_core.build"
[project] [project]
name = "leann-backend-hnsw" name = "leann-backend-hnsw"
version = "0.2.7" version = "0.2.5"
description = "Custom-built HNSW (Faiss) backend for the Leann toolkit." description = "Custom-built HNSW (Faiss) backend for the Leann toolkit."
dependencies = [ dependencies = [
"leann-core==0.2.7", "leann-core==0.2.5",
"numpy", "numpy",
"pyzmq>=23.0.0", "pyzmq>=23.0.0",
"msgpack>=1.0.0", "msgpack>=1.0.0",
@@ -22,8 +22,6 @@ cmake.build-type = "Release"
build.verbose = true build.verbose = true
build.tool-args = ["-j8"] build.tool-args = ["-j8"]
# CMake definitions to optimize compilation and find Homebrew packages # CMake definitions to optimize compilation
[tool.scikit-build.cmake.define] [tool.scikit-build.cmake.define]
CMAKE_BUILD_PARALLEL_LEVEL = "8" CMAKE_BUILD_PARALLEL_LEVEL = "8"
CMAKE_PREFIX_PATH = {env = "CMAKE_PREFIX_PATH"}
OpenMP_ROOT = {env = "OpenMP_ROOT"}

View File

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
[project] [project]
name = "leann-core" name = "leann-core"
version = "0.2.7" version = "0.2.5"
description = "Core API and plugin system for LEANN" description = "Core API and plugin system for LEANN"
readme = "README.md" readme = "README.md"
requires-python = ">=3.9" requires-python = ">=3.9"
@@ -31,10 +31,8 @@ dependencies = [
"PyPDF2>=3.0.0", "PyPDF2>=3.0.0",
"pymupdf>=1.23.0", "pymupdf>=1.23.0",
"pdfplumber>=0.10.0", "pdfplumber>=0.10.0",
"nbconvert>=7.0.0", # For .ipynb file support "mlx>=0.26.3; sys_platform == 'darwin'",
"gitignore-parser>=0.1.12", # For proper .gitignore handling "mlx-lm>=0.26.0; sys_platform == 'darwin'",
"mlx>=0.26.3; sys_platform == 'darwin' and platform_machine == 'arm64'",
"mlx-lm>=0.26.0; sys_platform == 'darwin' and platform_machine == 'arm64'",
] ]
[project.optional-dependencies] [project.optional-dependencies]

View File

@@ -10,7 +10,7 @@ import time
import warnings import warnings
from dataclasses import dataclass, field from dataclasses import dataclass, field
from pathlib import Path from pathlib import Path
from typing import Any, Literal, Optional from typing import Any, Literal
import numpy as np import numpy as np
@@ -33,7 +33,7 @@ def compute_embeddings(
model_name: str, model_name: str,
mode: str = "sentence-transformers", mode: str = "sentence-transformers",
use_server: bool = True, use_server: bool = True,
port: Optional[int] = None, port: int | None = None,
is_build=False, is_build=False,
) -> np.ndarray: ) -> np.ndarray:
""" """
@@ -157,12 +157,12 @@ class LeannBuilder:
self, self,
backend_name: str, backend_name: str,
embedding_model: str = "facebook/contriever", embedding_model: str = "facebook/contriever",
dimensions: Optional[int] = None, dimensions: int | None = None,
embedding_mode: str = "sentence-transformers", embedding_mode: str = "sentence-transformers",
**backend_kwargs, **backend_kwargs,
): ):
self.backend_name = backend_name self.backend_name = backend_name
backend_factory: Optional[LeannBackendFactoryInterface] = BACKEND_REGISTRY.get(backend_name) backend_factory: LeannBackendFactoryInterface | None = BACKEND_REGISTRY.get(backend_name)
if backend_factory is None: if backend_factory is None:
raise ValueError(f"Backend '{backend_name}' not found or not registered.") raise ValueError(f"Backend '{backend_name}' not found or not registered.")
self.backend_factory = backend_factory self.backend_factory = backend_factory
@@ -242,7 +242,7 @@ class LeannBuilder:
self.backend_kwargs = backend_kwargs self.backend_kwargs = backend_kwargs
self.chunks: list[dict[str, Any]] = [] self.chunks: list[dict[str, Any]] = []
def add_text(self, text: str, metadata: Optional[dict[str, Any]] = None): def add_text(self, text: str, metadata: dict[str, Any] | None = None):
if metadata is None: if metadata is None:
metadata = {} metadata = {}
passage_id = metadata.get("id", str(len(self.chunks))) passage_id = metadata.get("id", str(len(self.chunks)))
@@ -554,7 +554,7 @@ class LeannSearcher:
if "labels" in results and "distances" in results: if "labels" in results and "distances" in results:
logger.info(f" Processing {len(results['labels'][0])} passage IDs:") logger.info(f" Processing {len(results['labels'][0])} passage IDs:")
for i, (string_id, dist) in enumerate( for i, (string_id, dist) in enumerate(
zip(results["labels"][0], results["distances"][0]) zip(results["labels"][0], results["distances"][0], strict=False)
): ):
try: try:
passage_data = self.passage_manager.get_passage(string_id) passage_data = self.passage_manager.get_passage(string_id)
@@ -592,7 +592,7 @@ class LeannChat:
def __init__( def __init__(
self, self,
index_path: str, index_path: str,
llm_config: Optional[dict[str, Any]] = None, llm_config: dict[str, Any] | None = None,
enable_warmup: bool = False, enable_warmup: bool = False,
**kwargs, **kwargs,
): ):
@@ -608,7 +608,7 @@ class LeannChat:
prune_ratio: float = 0.0, prune_ratio: float = 0.0,
recompute_embeddings: bool = True, recompute_embeddings: bool = True,
pruning_strategy: Literal["global", "local", "proportional"] = "global", pruning_strategy: Literal["global", "local", "proportional"] = "global",
llm_kwargs: Optional[dict[str, Any]] = None, llm_kwargs: dict[str, Any] | None = None,
expected_zmq_port: int = 5557, expected_zmq_port: int = 5557,
**search_kwargs, **search_kwargs,
): ):

View File

@@ -8,7 +8,7 @@ import difflib
import logging import logging
import os import os
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from typing import Any, Optional from typing import Any
import torch import torch
@@ -311,7 +311,7 @@ def search_hf_models(query: str, limit: int = 10) -> list[str]:
def validate_model_and_suggest( def validate_model_and_suggest(
model_name: str, llm_type: str, host: str = "http://localhost:11434" model_name: str, llm_type: str, host: str = "http://localhost:11434"
) -> Optional[str]: ) -> str | None:
"""Validate model name and provide suggestions if invalid""" """Validate model name and provide suggestions if invalid"""
if llm_type == "ollama": if llm_type == "ollama":
available_models = check_ollama_models(host) available_models = check_ollama_models(host)
@@ -685,7 +685,7 @@ class HFChat(LLMInterface):
class OpenAIChat(LLMInterface): class OpenAIChat(LLMInterface):
"""LLM interface for OpenAI models.""" """LLM interface for OpenAI models."""
def __init__(self, model: str = "gpt-4o", api_key: Optional[str] = None): def __init__(self, model: str = "gpt-4o", api_key: str | None = None):
self.model = model self.model = model
self.api_key = api_key or os.getenv("OPENAI_API_KEY") self.api_key = api_key or os.getenv("OPENAI_API_KEY")
@@ -761,7 +761,7 @@ class SimulatedChat(LLMInterface):
return "This is a simulated answer from the LLM based on the retrieved context." return "This is a simulated answer from the LLM based on the retrieved context."
def get_llm(llm_config: Optional[dict[str, Any]] = None) -> LLMInterface: def get_llm(llm_config: dict[str, Any] | None = None) -> LLMInterface:
""" """
Factory function to get an LLM interface based on configuration. Factory function to get an LLM interface based on configuration.

View File

@@ -1,7 +1,6 @@
import argparse import argparse
import asyncio import asyncio
from pathlib import Path from pathlib import Path
from typing import Union
from llama_index.core import SimpleDirectoryReader from llama_index.core import SimpleDirectoryReader
from llama_index.core.node_parser import SentenceSplitter from llama_index.core.node_parser import SentenceSplitter
@@ -204,36 +203,62 @@ Examples:
with open(global_registry, "w") as f: with open(global_registry, "w") as f:
json.dump(projects, f, indent=2) json.dump(projects, f, indent=2)
def _build_gitignore_parser(self, docs_dir: str): def _read_gitignore_patterns(self, docs_dir: str) -> list[str]:
"""Build gitignore parser using gitignore-parser library.""" """Read .gitignore file and return patterns for exclusion."""
from gitignore_parser import parse_gitignore
# Try to parse the root .gitignore
gitignore_path = Path(docs_dir) / ".gitignore" gitignore_path = Path(docs_dir) / ".gitignore"
patterns = []
# Add some essential patterns that should always be excluded
essential_patterns = [
".git",
".DS_Store",
]
patterns.extend(essential_patterns)
if gitignore_path.exists(): if gitignore_path.exists():
try: try:
# gitignore-parser automatically handles all subdirectory .gitignore files! with open(gitignore_path, encoding="utf-8") as f:
matches = parse_gitignore(str(gitignore_path)) for line in f:
print(f"📋 Loaded .gitignore from {docs_dir} (includes all subdirectories)") line = line.strip()
return matches # Skip empty lines and comments
if line and not line.startswith("#"):
# Remove leading slash if present (make it relative)
if line.startswith("/"):
line = line[1:]
patterns.append(line)
print(
f"📋 Loaded {len(patterns) - len(essential_patterns)} patterns from .gitignore"
)
except Exception as e: except Exception as e:
print(f"Warning: Could not parse .gitignore: {e}") print(f"Warning: Could not read .gitignore: {e}")
else: else:
print("📋 No .gitignore found") print("📋 No .gitignore found, using minimal exclusion patterns")
# Fallback: basic pattern matching for essential files return patterns
essential_patterns = {".git", ".DS_Store", "__pycache__", "node_modules", ".venv", "venv"}
def basic_matches(file_path): def _should_exclude_file(self, relative_path: Path, exclude_patterns: list[str]) -> bool:
path_parts = Path(file_path).parts """Check if a file should be excluded based on gitignore-style patterns."""
return any(part in essential_patterns for part in path_parts) path_str = str(relative_path)
return basic_matches for pattern in exclude_patterns:
# Simple pattern matching (could be enhanced with full gitignore syntax)
if pattern.endswith("*"):
# Wildcard pattern
prefix = pattern[:-1]
if path_str.startswith(prefix):
return True
elif "*" in pattern:
# Contains wildcard - simple glob-like matching
import fnmatch
def _should_exclude_file(self, relative_path: Path, gitignore_matches) -> bool: if fnmatch.fnmatch(path_str, pattern):
"""Check if a file should be excluded using gitignore parser.""" return True
return gitignore_matches(str(relative_path)) else:
# Exact match or directory match
if path_str == pattern or path_str.startswith(pattern + "/"):
return True
return False
def list_indexes(self): def list_indexes(self):
print("Stored LEANN indexes:") print("Stored LEANN indexes:")
@@ -311,13 +336,13 @@ Examples:
print(f' leann search {example_name} "your query"') print(f' leann search {example_name} "your query"')
print(f" leann ask {example_name} --interactive") print(f" leann ask {example_name} --interactive")
def load_documents(self, docs_dir: str, custom_file_types: Union[str, None] = None): def load_documents(self, docs_dir: str, custom_file_types: str | None = None):
print(f"Loading documents from {docs_dir}...") print(f"Loading documents from {docs_dir}...")
if custom_file_types: if custom_file_types:
print(f"Using custom file types: {custom_file_types}") print(f"Using custom file types: {custom_file_types}")
# Build gitignore parser # Read .gitignore patterns first
gitignore_matches = self._build_gitignore_parser(docs_dir) exclude_patterns = self._read_gitignore_patterns(docs_dir)
# Try to use better PDF parsers first, but only if PDFs are requested # Try to use better PDF parsers first, but only if PDFs are requested
documents = [] documents = []
@@ -330,7 +355,7 @@ Examples:
for file_path in docs_path.rglob("*.pdf"): for file_path in docs_path.rglob("*.pdf"):
# Check if file matches any exclude pattern # Check if file matches any exclude pattern
relative_path = file_path.relative_to(docs_path) relative_path = file_path.relative_to(docs_path)
if self._should_exclude_file(relative_path, gitignore_matches): if self._should_exclude_file(relative_path, exclude_patterns):
continue continue
print(f"Processing PDF: {file_path}") print(f"Processing PDF: {file_path}")
@@ -424,34 +449,14 @@ Examples:
] ]
# Try to load other file types, but don't fail if none are found # Try to load other file types, but don't fail if none are found
try: try:
# Create a custom file filter function using our PathSpec
def file_filter(file_path: str) -> bool:
"""Return True if file should be included (not excluded)"""
try:
docs_path_obj = Path(docs_dir)
file_path_obj = Path(file_path)
relative_path = file_path_obj.relative_to(docs_path_obj)
return not self._should_exclude_file(relative_path, gitignore_matches)
except (ValueError, OSError):
return True # Include files that can't be processed
other_docs = SimpleDirectoryReader( other_docs = SimpleDirectoryReader(
docs_dir, docs_dir,
recursive=True, recursive=True,
encoding="utf-8", encoding="utf-8",
required_exts=code_extensions, required_exts=code_extensions,
file_extractor={}, # Use default extractors exclude=exclude_patterns,
filename_as_id=True,
).load_data(show_progress=True) ).load_data(show_progress=True)
documents.extend(other_docs)
# Filter documents after loading based on gitignore rules
filtered_docs = []
for doc in other_docs:
file_path = doc.metadata.get("file_path", "")
if file_filter(file_path):
filtered_docs.append(doc)
documents.extend(filtered_docs)
except ValueError as e: except ValueError as e:
if "No files found" in str(e): if "No files found" in str(e):
print("No additional files found for other supported types.") print("No additional files found for other supported types.")

View File

@@ -6,7 +6,6 @@ import subprocess
import sys import sys
import time import time
from pathlib import Path from pathlib import Path
from typing import Optional
import psutil import psutil
@@ -183,8 +182,8 @@ class EmbeddingServerManager:
e.g., "leann_backend_diskann.embedding_server" e.g., "leann_backend_diskann.embedding_server"
""" """
self.backend_module_name = backend_module_name self.backend_module_name = backend_module_name
self.server_process: Optional[subprocess.Popen] = None self.server_process: subprocess.Popen | None = None
self.server_port: Optional[int] = None self.server_port: int | None = None
self._atexit_registered = False self._atexit_registered = False
def start_server( def start_server(

View File

@@ -1,5 +1,5 @@
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from typing import Any, Literal, Union from typing import Any, Literal
import numpy as np import numpy as np
@@ -34,9 +34,7 @@ class LeannBackendSearcherInterface(ABC):
pass pass
@abstractmethod @abstractmethod
def _ensure_server_running( def _ensure_server_running(self, passages_source_file: str, port: int | None, **kwargs) -> int:
self, passages_source_file: str, port: Union[int, None], **kwargs
) -> int:
"""Ensure server is running""" """Ensure server is running"""
pass pass
@@ -50,7 +48,7 @@ class LeannBackendSearcherInterface(ABC):
prune_ratio: float = 0.0, prune_ratio: float = 0.0,
recompute_embeddings: bool = False, recompute_embeddings: bool = False,
pruning_strategy: Literal["global", "local", "proportional"] = "global", pruning_strategy: Literal["global", "local", "proportional"] = "global",
zmq_port: Union[int, None] = None, zmq_port: int | None = None,
**kwargs, **kwargs,
) -> dict[str, Any]: ) -> dict[str, Any]:
"""Search for nearest neighbors """Search for nearest neighbors
@@ -76,7 +74,7 @@ class LeannBackendSearcherInterface(ABC):
self, self,
query: str, query: str,
use_server_if_available: bool = True, use_server_if_available: bool = True,
zmq_port: Union[int, None] = None, zmq_port: int | None = None,
) -> np.ndarray: ) -> np.ndarray:
"""Compute embedding for a query string """Compute embedding for a query string

View File

@@ -1,7 +1,7 @@
import json import json
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from pathlib import Path from pathlib import Path
from typing import Any, Literal, Optional from typing import Any, Literal
import numpy as np import numpy as np
@@ -169,7 +169,7 @@ class BaseSearcher(LeannBackendSearcherInterface, ABC):
prune_ratio: float = 0.0, prune_ratio: float = 0.0,
recompute_embeddings: bool = False, recompute_embeddings: bool = False,
pruning_strategy: Literal["global", "local", "proportional"] = "global", pruning_strategy: Literal["global", "local", "proportional"] = "global",
zmq_port: Optional[int] = None, zmq_port: int | None = None,
**kwargs, **kwargs,
) -> dict[str, Any]: ) -> dict[str, Any]:
""" """

View File

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
[project] [project]
name = "leann" name = "leann"
version = "0.2.7" version = "0.2.5"
description = "LEANN - The smallest vector index in the world. RAG Everything with LEANN!" description = "LEANN - The smallest vector index in the world. RAG Everything with LEANN!"
readme = "README.md" readme = "README.md"
requires-python = ">=3.9" requires-python = ">=3.9"

View File

@@ -32,7 +32,7 @@ dependencies = [
"pypdfium2>=4.30.0", "pypdfium2>=4.30.0",
# LlamaIndex core and readers - updated versions # LlamaIndex core and readers - updated versions
"llama-index>=0.12.44", "llama-index>=0.12.44",
"llama-index-readers-file>=0.4.0", # Essential for PDF parsing "llama-index-readers-file>=0.4.0", # Essential for PDF parsing
# "llama-index-readers-docling", # Requires Python >= 3.10 # "llama-index-readers-docling", # Requires Python >= 3.10
# "llama-index-node-parser-docling", # Requires Python >= 3.10 # "llama-index-node-parser-docling", # Requires Python >= 3.10
"llama-index-vector-stores-faiss>=0.4.0", "llama-index-vector-stores-faiss>=0.4.0",
@@ -40,12 +40,9 @@ dependencies = [
# Other dependencies # Other dependencies
"ipykernel==6.29.5", "ipykernel==6.29.5",
"msgpack>=1.1.1", "msgpack>=1.1.1",
"mlx>=0.26.3; sys_platform == 'darwin' and platform_machine == 'arm64'", "mlx>=0.26.3; sys_platform == 'darwin'",
"mlx-lm>=0.26.0; sys_platform == 'darwin' and platform_machine == 'arm64'", "mlx-lm>=0.26.0; sys_platform == 'darwin'",
"psutil>=5.8.0", "psutil>=5.8.0",
"pathspec>=0.12.1",
"nbconvert>=7.16.6",
"gitignore-parser>=0.1.12",
] ]
[project.optional-dependencies] [project.optional-dependencies]
@@ -91,7 +88,7 @@ leann-backend-diskann = { path = "packages/leann-backend-diskann", editable = tr
leann-backend-hnsw = { path = "packages/leann-backend-hnsw", editable = true } leann-backend-hnsw = { path = "packages/leann-backend-hnsw", editable = true }
[tool.ruff] [tool.ruff]
target-version = "py39" target-version = "py310"
line-length = 100 line-length = 100
extend-exclude = [ extend-exclude = [
"third_party", "third_party",

7318
uv.lock generated
View File

File diff suppressed because it is too large Load Diff