Compare commits

..

3 Commits

Author SHA1 Message Date
yichuan520030910320
ae29ae9b88 fix: use proper conditional expression for token fallback
- Use conditional expression to check if GH_PAT exists before using it
- Fallback to GITHUB_TOKEN if GH_PAT is not set or empty
- This fixes the 'token not supplied' error
2025-12-24 12:16:38 +08:00
yichuan520030910320
e5977e4c4f fix: add fallback to GITHUB_TOKEN for checkout token
- Use GH_PAT if available, otherwise fallback to GITHUB_TOKEN
- This prevents 'token not supplied' errors when GH_PAT is not configured
2025-12-24 12:16:15 +08:00
yichuan520030910320
cbd6c8ab34 feat: add GH_PAT token and fetch-depth to GitHub Actions workflows
- Add fetch-depth: 1 to all checkout actions for faster checkout
- Add token: ${{ secrets.GH_PAT }} to all checkout actions
- This enables access to private submodules and improves checkout performance
2025-12-24 12:14:51 +08:00
37 changed files with 1287 additions and 390 deletions

View File

@@ -16,8 +16,10 @@ jobs:
steps:
- uses: actions/checkout@v4
with:
ref: ${{ inputs.ref }}
submodules: recursive
fetch-depth: 1
token: ${{ secrets.GH_PAT != '' && secrets.GH_PAT || secrets.GITHUB_TOKEN }}
ref: ${{ inputs.ref }}
- name: Install uv and Python
uses: astral-sh/setup-uv@v6
@@ -28,36 +30,15 @@ jobs:
run: |
uv run --only-group lint pre-commit run --all-files --show-diff-on-failure
type-check:
name: Type Check with ty
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
with:
ref: ${{ inputs.ref }}
submodules: recursive
- name: Install uv and Python
uses: astral-sh/setup-uv@v6
with:
python-version: '3.11'
- name: Install ty
run: uv tool install ty
- name: Run ty type checker
run: |
# Run ty on core packages, apps, and tests
ty check packages/leann-core/src apps tests
build:
needs: [lint, type-check]
needs: lint
name: Build ${{ matrix.os }} Python ${{ matrix.python }}
strategy:
matrix:
include:
# Note: Python 3.9 dropped - uses PEP 604 union syntax (str | None)
# which requires Python 3.10+
- os: ubuntu-22.04
python: '3.9'
- os: ubuntu-22.04
python: '3.10'
- os: ubuntu-22.04
@@ -67,6 +48,8 @@ jobs:
- os: ubuntu-22.04
python: '3.13'
# ARM64 Linux builds
- os: ubuntu-24.04-arm
python: '3.9'
- os: ubuntu-24.04-arm
python: '3.10'
- os: ubuntu-24.04-arm
@@ -75,6 +58,8 @@ jobs:
python: '3.12'
- os: ubuntu-24.04-arm
python: '3.13'
- os: macos-14
python: '3.9'
- os: macos-14
python: '3.10'
- os: macos-14
@@ -83,6 +68,8 @@ jobs:
python: '3.12'
- os: macos-14
python: '3.13'
- os: macos-15
python: '3.9'
- os: macos-15
python: '3.10'
- os: macos-15
@@ -91,31 +78,25 @@ jobs:
python: '3.12'
- os: macos-15
python: '3.13'
# Intel Mac builds (x86_64) - replaces deprecated macos-13
# Note: Python 3.13 excluded - PyTorch has no wheels for macOS x86_64 + Python 3.13
# (PyTorch <=2.4.1 lacks cp313, PyTorch >=2.5.0 dropped Intel Mac support)
- os: macos-15-intel
- os: macos-13
python: '3.9'
- os: macos-13
python: '3.10'
- os: macos-15-intel
- os: macos-13
python: '3.11'
- os: macos-15-intel
- os: macos-13
python: '3.12'
# macOS 26 (beta) - arm64
- os: macos-26
python: '3.10'
- os: macos-26
python: '3.11'
- os: macos-26
python: '3.12'
- os: macos-26
python: '3.13'
# Note: macos-13 + Python 3.13 excluded due to PyTorch compatibility
# (PyTorch 2.5+ supports Python 3.13 but not Intel Mac x86_64)
runs-on: ${{ matrix.os }}
steps:
- uses: actions/checkout@v5
with:
ref: ${{ inputs.ref }}
submodules: recursive
fetch-depth: 1
token: ${{ secrets.GH_PAT != '' && secrets.GH_PAT || secrets.GITHUB_TOKEN }}
ref: ${{ inputs.ref }}
- name: Install uv and Python
uses: astral-sh/setup-uv@v6
@@ -227,16 +208,13 @@ jobs:
# Use system clang for better compatibility
export CC=clang
export CXX=clang++
# Set deployment target based on runner
# macos-15-intel runs macOS 15, so target 15.0 (system libraries require it)
if [[ "${{ matrix.os }}" == "macos-15-intel" ]]; then
export MACOSX_DEPLOYMENT_TARGET=15.0
elif [[ "${{ matrix.os }}" == macos-14* ]]; then
# Homebrew libraries on each macOS version require matching minimum version
if [[ "${{ matrix.os }}" == "macos-13" ]]; then
export MACOSX_DEPLOYMENT_TARGET=13.0
elif [[ "${{ matrix.os }}" == "macos-14" ]]; then
export MACOSX_DEPLOYMENT_TARGET=14.0
elif [[ "${{ matrix.os }}" == macos-15* ]]; then
elif [[ "${{ matrix.os }}" == "macos-15" ]]; then
export MACOSX_DEPLOYMENT_TARGET=15.0
elif [[ "${{ matrix.os }}" == macos-26* ]]; then
export MACOSX_DEPLOYMENT_TARGET=26.0
fi
uv build --wheel --python ${{ matrix.python }} --find-links ${GITHUB_WORKSPACE}/packages/leann-core/dist
else
@@ -250,16 +228,14 @@ jobs:
# Use system clang for better compatibility
export CC=clang
export CXX=clang++
# Set deployment target based on runner
# macos-15-intel runs macOS 15, so target 15.0 (system libraries require it)
if [[ "${{ matrix.os }}" == "macos-15-intel" ]]; then
export MACOSX_DEPLOYMENT_TARGET=15.0
elif [[ "${{ matrix.os }}" == macos-14* ]]; then
# DiskANN requires macOS 13.3+ for sgesdd_ LAPACK function
# But Homebrew libraries on each macOS version require matching minimum version
if [[ "${{ matrix.os }}" == "macos-13" ]]; then
export MACOSX_DEPLOYMENT_TARGET=13.3
elif [[ "${{ matrix.os }}" == "macos-14" ]]; then
export MACOSX_DEPLOYMENT_TARGET=14.0
elif [[ "${{ matrix.os }}" == macos-15* ]]; then
elif [[ "${{ matrix.os }}" == "macos-15" ]]; then
export MACOSX_DEPLOYMENT_TARGET=15.0
elif [[ "${{ matrix.os }}" == macos-26* ]]; then
export MACOSX_DEPLOYMENT_TARGET=26.0
fi
uv build --wheel --python ${{ matrix.python }} --find-links ${GITHUB_WORKSPACE}/packages/leann-core/dist
else
@@ -297,19 +273,16 @@ jobs:
if: runner.os == 'macOS'
run: |
# Determine deployment target based on runner OS
# macos-15-intel runs macOS 15, so target 15.0 (system libraries require it)
if [[ "${{ matrix.os }}" == "macos-15-intel" ]]; then
HNSW_TARGET="15.0"
DISKANN_TARGET="15.0"
elif [[ "${{ matrix.os }}" == macos-14* ]]; then
# Must match the Homebrew libraries for each macOS version
if [[ "${{ matrix.os }}" == "macos-13" ]]; then
HNSW_TARGET="13.0"
DISKANN_TARGET="13.3"
elif [[ "${{ matrix.os }}" == "macos-14" ]]; then
HNSW_TARGET="14.0"
DISKANN_TARGET="14.0"
elif [[ "${{ matrix.os }}" == macos-15* ]]; then
elif [[ "${{ matrix.os }}" == "macos-15" ]]; then
HNSW_TARGET="15.0"
DISKANN_TARGET="15.0"
elif [[ "${{ matrix.os }}" == macos-26* ]]; then
HNSW_TARGET="26.0"
DISKANN_TARGET="26.0"
fi
# Repair HNSW wheel
@@ -365,15 +338,12 @@ jobs:
PY_TAG=$($UV_PY -c "import sys; print(f'cp{sys.version_info[0]}{sys.version_info[1]}')")
if [[ "$RUNNER_OS" == "macOS" ]]; then
# macos-15-intel runs macOS 15, so target 15.0 (system libraries require it)
if [[ "${{ matrix.os }}" == "macos-15-intel" ]]; then
export MACOSX_DEPLOYMENT_TARGET=15.0
elif [[ "${{ matrix.os }}" == macos-14* ]]; then
if [[ "${{ matrix.os }}" == "macos-13" ]]; then
export MACOSX_DEPLOYMENT_TARGET=13.3
elif [[ "${{ matrix.os }}" == "macos-14" ]]; then
export MACOSX_DEPLOYMENT_TARGET=14.0
elif [[ "${{ matrix.os }}" == macos-15* ]]; then
elif [[ "${{ matrix.os }}" == "macos-15" ]]; then
export MACOSX_DEPLOYMENT_TARGET=15.0
elif [[ "${{ matrix.os }}" == macos-26* ]]; then
export MACOSX_DEPLOYMENT_TARGET=26.0
fi
fi

View File

@@ -12,6 +12,9 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 1
token: ${{ secrets.GH_PAT != '' && secrets.GH_PAT || secrets.GITHUB_TOKEN }}
- uses: lycheeverse/lychee-action@v2
with:
args: --no-progress --insecure --user-agent 'curl/7.68.0' --exclude '.*api\.star-history\.com.*' --accept 200,201,202,203,204,205,206,207,208,226,300,301,302,303,304,305,306,307,308,503 README.md docs/ apps/ examples/ benchmarks/

View File

@@ -19,6 +19,9 @@ jobs:
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 1
token: ${{ secrets.GH_PAT != '' && secrets.GH_PAT || secrets.GITHUB_TOKEN }}
- name: Validate version
run: |
@@ -73,6 +76,8 @@ jobs:
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 1
token: ${{ secrets.GH_PAT != '' && secrets.GH_PAT || secrets.GITHUB_TOKEN }}
ref: 'main'
- name: Download all artifacts

View File

@@ -6,7 +6,7 @@ Provides common parameters and functionality for all RAG examples.
import argparse
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Any
from typing import Any, Union
import dotenv
from leann.api import LeannBuilder, LeannChat
@@ -257,8 +257,8 @@ class BaseRAGExample(ABC):
pass
@abstractmethod
async def load_data(self, args) -> list[dict[str, Any]]:
"""Load data from the source. Returns list of text chunks as dicts with 'text' and 'metadata' keys."""
async def load_data(self, args) -> list[Union[str, dict[str, Any]]]:
"""Load data from the source. Returns list of text chunks (strings or dicts with 'text' key)."""
pass
def get_llm_config(self, args) -> dict[str, Any]:
@@ -282,8 +282,8 @@ class BaseRAGExample(ABC):
return config
async def build_index(self, args, texts: list[dict[str, Any]]) -> str:
"""Build LEANN index from text chunks (dicts with 'text' and 'metadata' keys)."""
async def build_index(self, args, texts: list[Union[str, dict[str, Any]]]) -> str:
"""Build LEANN index from texts (accepts strings or dicts with 'text' key)."""
index_path = str(Path(args.index_dir) / f"{self.default_index_name}.leann")
print(f"\n[Building Index] Creating {self.name} index...")

View File

@@ -6,7 +6,6 @@ Supports Chrome browser history.
import os
import sys
from pathlib import Path
from typing import Any
# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent))
@@ -86,7 +85,7 @@ class BrowserRAG(BaseRAGExample):
return profiles
async def load_data(self, args) -> list[dict[str, Any]]:
async def load_data(self, args) -> list[str]:
"""Load browser history and convert to text chunks."""
# Determine Chrome profiles
if args.chrome_profile and not args.auto_find_profiles:

View File

@@ -5,7 +5,6 @@ Supports ChatGPT export data from chat.html files.
import sys
from pathlib import Path
from typing import Any
# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent))
@@ -81,7 +80,7 @@ class ChatGPTRAG(BaseRAGExample):
return export_files
async def load_data(self, args) -> list[dict[str, Any]]:
async def load_data(self, args) -> list[str]:
"""Load ChatGPT export data and convert to text chunks."""
export_path = Path(args.export_path)

View File

@@ -5,7 +5,6 @@ Supports Claude export data from JSON files.
import sys
from pathlib import Path
from typing import Any
# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent))
@@ -81,7 +80,7 @@ class ClaudeRAG(BaseRAGExample):
return export_files
async def load_data(self, args) -> list[dict[str, Any]]:
async def load_data(self, args) -> list[str]:
"""Load Claude export data and convert to text chunks."""
export_path = Path(args.export_path)

View File

@@ -6,7 +6,6 @@ optimized chunking parameters.
import sys
from pathlib import Path
from typing import Any
# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent))
@@ -78,7 +77,7 @@ class CodeRAG(BaseRAGExample):
help="Try to preserve import statements in chunks (default: True)",
)
async def load_data(self, args) -> list[dict[str, Any]]:
async def load_data(self, args) -> list[str]:
"""Load code files and convert to AST-aware chunks."""
print(f"🔍 Scanning code repository: {args.repo_dir}")
print(f"📁 Including extensions: {args.include_extensions}")
@@ -89,6 +88,14 @@ class CodeRAG(BaseRAGExample):
if not repo_path.exists():
raise ValueError(f"Repository directory not found: {args.repo_dir}")
# Load code files with filtering
reader_kwargs = {
"recursive": True,
"encoding": "utf-8",
"required_exts": args.include_extensions,
"exclude_hidden": True,
}
# Create exclusion filter
def file_filter(file_path: str) -> bool:
"""Filter out unwanted files and directories."""
@@ -113,11 +120,8 @@ class CodeRAG(BaseRAGExample):
# Load documents with file filtering
documents = SimpleDirectoryReader(
args.repo_dir,
file_extractor=None,
recursive=True,
encoding="utf-8",
required_exts=args.include_extensions,
exclude_hidden=True,
file_extractor=None, # Use default extractors
**reader_kwargs,
).load_data(show_progress=True)
# Apply custom filtering

View File

@@ -5,7 +5,7 @@ Supports PDF, TXT, MD, and other document formats.
import sys
from pathlib import Path
from typing import Any
from typing import Any, Union
# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent))
@@ -52,7 +52,7 @@ class DocumentRAG(BaseRAGExample):
help="Enable AST-aware chunking for code files in the data directory",
)
async def load_data(self, args) -> list[dict[str, Any]]:
async def load_data(self, args) -> list[Union[str, dict[str, Any]]]:
"""Load documents and convert to text chunks."""
print(f"Loading documents from: {args.data_dir}")
if args.file_types:
@@ -66,12 +66,16 @@ class DocumentRAG(BaseRAGExample):
raise ValueError(f"Data directory not found: {args.data_dir}")
# Load documents
documents = SimpleDirectoryReader(
args.data_dir,
recursive=True,
encoding="utf-8",
required_exts=args.file_types if args.file_types else None,
).load_data(show_progress=True)
reader_kwargs = {
"recursive": True,
"encoding": "utf-8",
}
if args.file_types:
reader_kwargs["required_exts"] = args.file_types
documents = SimpleDirectoryReader(args.data_dir, **reader_kwargs).load_data(
show_progress=True
)
if not documents:
print(f"No documents found in {args.data_dir} with extensions {args.file_types}")

View File

@@ -127,12 +127,11 @@ class EmlxMboxReader(MboxReader):
def load_data(
self,
file: Path, # Note: for EmlxMboxReader, this is actually a directory
directory: Path,
extra_info: dict | None = None,
fs: AbstractFileSystem | None = None,
) -> list[Document]:
"""Parse .emlx files from directory into strings using MboxReader logic."""
directory = file # Rename for clarity - this is a directory of .emlx files
import os
import tempfile

View File

@@ -5,7 +5,6 @@ Supports Apple Mail on macOS.
import sys
from pathlib import Path
from typing import Any
# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent))
@@ -65,7 +64,7 @@ class EmailRAG(BaseRAGExample):
return messages_dirs
async def load_data(self, args) -> list[dict[str, Any]]:
async def load_data(self, args) -> list[str]:
"""Load emails and convert to text chunks."""
# Determine mail directories
if args.mail_path:

View File

@@ -86,7 +86,7 @@ class WeChatHistoryReader(BaseReader):
text=True,
timeout=5,
)
return result.returncode == 0 and bool(result.stdout.strip())
return result.returncode == 0 and result.stdout.strip()
except Exception:
return False
@@ -314,9 +314,7 @@ class WeChatHistoryReader(BaseReader):
return concatenated_groups
def _create_concatenated_content(
self, message_group: dict, contact_name: str
) -> tuple[str, str]:
def _create_concatenated_content(self, message_group: dict, contact_name: str) -> str:
"""
Create concatenated content from a group of messages.

View File

@@ -14,7 +14,6 @@ import argparse
import pickle
import tempfile
from pathlib import Path
from typing import Any
import numpy as np
from PIL import Image
@@ -66,7 +65,7 @@ class ImageRAG(BaseRAGExample):
help="Batch size for CLIP embedding generation (default: 32)",
)
async def load_data(self, args) -> list[dict[str, Any]]:
async def load_data(self, args) -> list[str]:
"""Load images, generate CLIP embeddings, and return text descriptions."""
self._image_data = self._load_images_and_embeddings(args)
return [entry["text"] for entry in self._image_data]
@@ -169,7 +168,7 @@ class ImageRAG(BaseRAGExample):
print(f"✅ Processed {len(image_data)} images")
return image_data
async def build_index(self, args, texts: list[dict[str, Any]]) -> str:
async def build_index(self, args, texts: list[str]) -> str:
"""Build index using pre-computed CLIP embeddings."""
from leann.api import LeannBuilder

View File

@@ -6,7 +6,6 @@ This example demonstrates how to build a RAG system on your iMessage conversatio
import asyncio
from pathlib import Path
from typing import Any
from leann.chunking_utils import create_text_chunks
@@ -57,7 +56,7 @@ class IMessageRAG(BaseRAGExample):
help="Overlap between text chunks (default: 200)",
)
async def load_data(self, args) -> list[dict[str, Any]]:
async def load_data(self, args) -> list[str]:
"""Load iMessage history and convert to text chunks."""
print("Loading iMessage conversation history...")

View File

@@ -18,11 +18,10 @@ _repo_root = Path(__file__).resolve().parents[3]
_leann_core_src = _repo_root / "packages" / "leann-core" / "src"
_leann_hnsw_pkg = _repo_root / "packages" / "leann-backend-hnsw"
if str(_leann_core_src) not in sys.path:
sys.path.insert(0, str(_leann_core_src))
sys.path.append(str(_leann_core_src))
if str(_leann_hnsw_pkg) not in sys.path:
sys.path.insert(0, str(_leann_hnsw_pkg))
sys.path.append(str(_leann_hnsw_pkg))
from leann_multi_vector import LeannMultiVector
import torch
from colpali_engine.models import ColPali
@@ -94,9 +93,9 @@ for batch_doc in tqdm(dataloader):
print(ds[0].shape)
# %%
# Build HNSW index via LeannMultiVector primitives and run search
# Build HNSW index via LeannRetriever primitives and run search
index_path = "./indexes/colpali.leann"
retriever = LeannMultiVector(index_path=index_path, dim=int(ds[0].shape[-1]))
retriever = LeannRetriever(index_path=index_path, dim=int(ds[0].shape[-1]))
retriever.create_collection()
filepaths = [os.path.join("./pages", name) for name in page_filenames]
for i in range(len(filepaths)):

View File

@@ -5,7 +5,7 @@ import argparse
import faulthandler
import os
import time
from typing import Any, Optional, cast
from typing import Any, Optional
import numpy as np
from PIL import Image
@@ -223,7 +223,7 @@ if need_to_build_index:
# Use filenames as identifiers instead of full paths for cleaner metadata
filepaths = [os.path.basename(fp) for fp in filepaths]
elif USE_HF_DATASET:
from datasets import Dataset, DatasetDict, concatenate_datasets, load_dataset
from datasets import load_dataset, concatenate_datasets, DatasetDict
# Determine which datasets to load
if DATASET_NAMES is not None:
@@ -281,12 +281,12 @@ if need_to_build_index:
splits_to_load = DATASET_SPLITS
# Load and concatenate multiple splits for this dataset
datasets_to_concat: list[Dataset] = []
datasets_to_concat = []
for split in splits_to_load:
if split not in dataset_dict:
print(f" Warning: Split '{split}' not found in dataset. Available splits: {list(dataset_dict.keys())}")
continue
split_dataset = cast(Dataset, dataset_dict[split])
split_dataset = dataset_dict[split]
print(f" Loaded split '{split}': {len(split_dataset)} pages")
datasets_to_concat.append(split_dataset)

View File

@@ -25,9 +25,9 @@ Usage:
import argparse
import json
import os
from typing import Any, Optional, cast
from typing import Optional
from datasets import Dataset, load_dataset
from datasets import load_dataset
from leann_multi_vector import (
ViDoReBenchmarkEvaluator,
_ensure_repo_paths_importable,
@@ -151,43 +151,40 @@ def load_vidore_v1_data(
"""
print(f"Loading dataset: {dataset_path} (split={split})")
# Load queries - cast to Dataset since we know split returns Dataset not DatasetDict
query_ds = cast(Dataset, load_dataset(dataset_path, "queries", split=split, revision=revision))
# Load queries
query_ds = load_dataset(dataset_path, "queries", split=split, revision=revision)
queries: dict[str, str] = {}
queries = {}
for row in query_ds:
row_dict = cast(dict[str, Any], row)
query_id = f"query-{split}-{row_dict['query-id']}"
queries[query_id] = row_dict["query"]
query_id = f"query-{split}-{row['query-id']}"
queries[query_id] = row["query"]
# Load corpus (images) - cast to Dataset
corpus_ds = cast(Dataset, load_dataset(dataset_path, "corpus", split=split, revision=revision))
# Load corpus (images)
corpus_ds = load_dataset(dataset_path, "corpus", split=split, revision=revision)
corpus: dict[str, Any] = {}
corpus = {}
for row in corpus_ds:
row_dict = cast(dict[str, Any], row)
corpus_id = f"corpus-{split}-{row_dict['corpus-id']}"
corpus_id = f"corpus-{split}-{row['corpus-id']}"
# Extract image from the dataset row
if "image" in row_dict:
corpus[corpus_id] = row_dict["image"]
elif "page_image" in row_dict:
corpus[corpus_id] = row_dict["page_image"]
if "image" in row:
corpus[corpus_id] = row["image"]
elif "page_image" in row:
corpus[corpus_id] = row["page_image"]
else:
raise ValueError(
f"No image field found in corpus. Available fields: {list(row_dict.keys())}"
f"No image field found in corpus. Available fields: {list(row.keys())}"
)
# Load qrels (relevance judgments) - cast to Dataset
qrels_ds = cast(Dataset, load_dataset(dataset_path, "qrels", split=split, revision=revision))
# Load qrels (relevance judgments)
qrels_ds = load_dataset(dataset_path, "qrels", split=split, revision=revision)
qrels: dict[str, dict[str, int]] = {}
qrels = {}
for row in qrels_ds:
row_dict = cast(dict[str, Any], row)
query_id = f"query-{split}-{row_dict['query-id']}"
corpus_id = f"corpus-{split}-{row_dict['corpus-id']}"
query_id = f"query-{split}-{row['query-id']}"
corpus_id = f"corpus-{split}-{row['corpus-id']}"
if query_id not in qrels:
qrels[query_id] = {}
qrels[query_id][corpus_id] = int(row_dict["score"])
qrels[query_id][corpus_id] = int(row["score"])
print(
f"Loaded {len(queries)} queries, {len(corpus)} corpus items, {len(qrels)} query-relevance mappings"
@@ -237,8 +234,8 @@ def evaluate_task(
raise ValueError(f"Unknown task: {task_name}. Available: {list(VIDORE_V1_TASKS.keys())}")
task_config = VIDORE_V1_TASKS[task_name]
dataset_path = str(task_config["dataset_path"])
revision = str(task_config["revision"])
dataset_path = task_config["dataset_path"]
revision = task_config["revision"]
# Load data
corpus, queries, qrels = load_vidore_v1_data(
@@ -289,7 +286,7 @@ def evaluate_task(
)
# Search queries
task_prompt = cast(Optional[dict[str, str]], task_config.get("prompt"))
task_prompt = task_config.get("prompt")
results = evaluator.search_queries(
queries=queries,
corpus_ids=corpus_ids_ordered,

View File

@@ -25,9 +25,9 @@ Usage:
import argparse
import json
import os
from typing import Any, Optional, cast
from typing import Optional
from datasets import Dataset, load_dataset
from datasets import load_dataset
from leann_multi_vector import (
ViDoReBenchmarkEvaluator,
_ensure_repo_paths_importable,
@@ -91,8 +91,8 @@ def load_vidore_v2_data(
"""
print(f"Loading dataset: {dataset_path} (split={split}, language={language})")
# Load queries - cast to Dataset since we know split returns Dataset not DatasetDict
query_ds = cast(Dataset, load_dataset(dataset_path, "queries", split=split, revision=revision))
# Load queries
query_ds = load_dataset(dataset_path, "queries", split=split, revision=revision)
# Check if dataset has language field before filtering
has_language_field = len(query_ds) > 0 and "language" in query_ds.column_names
@@ -112,9 +112,8 @@ def load_vidore_v2_data(
if len(query_ds_filtered) == 0:
# Try to get a sample to see actual language values
try:
sample_ds = cast(
Dataset,
load_dataset(dataset_path, "queries", split=split, revision=revision),
sample_ds = load_dataset(
dataset_path, "queries", split=split, revision=revision
)
if len(sample_ds) > 0 and "language" in sample_ds.column_names:
sample_langs = set(sample_ds["language"])
@@ -127,40 +126,37 @@ def load_vidore_v2_data(
)
query_ds = query_ds_filtered
queries: dict[str, str] = {}
queries = {}
for row in query_ds:
row_dict = cast(dict[str, Any], row)
query_id = f"query-{split}-{row_dict['query-id']}"
queries[query_id] = row_dict["query"]
query_id = f"query-{split}-{row['query-id']}"
queries[query_id] = row["query"]
# Load corpus (images) - cast to Dataset
corpus_ds = cast(Dataset, load_dataset(dataset_path, "corpus", split=split, revision=revision))
# Load corpus (images)
corpus_ds = load_dataset(dataset_path, "corpus", split=split, revision=revision)
corpus: dict[str, Any] = {}
corpus = {}
for row in corpus_ds:
row_dict = cast(dict[str, Any], row)
corpus_id = f"corpus-{split}-{row_dict['corpus-id']}"
corpus_id = f"corpus-{split}-{row['corpus-id']}"
# Extract image from the dataset row
if "image" in row_dict:
corpus[corpus_id] = row_dict["image"]
elif "page_image" in row_dict:
corpus[corpus_id] = row_dict["page_image"]
if "image" in row:
corpus[corpus_id] = row["image"]
elif "page_image" in row:
corpus[corpus_id] = row["page_image"]
else:
raise ValueError(
f"No image field found in corpus. Available fields: {list(row_dict.keys())}"
f"No image field found in corpus. Available fields: {list(row.keys())}"
)
# Load qrels (relevance judgments) - cast to Dataset
qrels_ds = cast(Dataset, load_dataset(dataset_path, "qrels", split=split, revision=revision))
# Load qrels (relevance judgments)
qrels_ds = load_dataset(dataset_path, "qrels", split=split, revision=revision)
qrels: dict[str, dict[str, int]] = {}
qrels = {}
for row in qrels_ds:
row_dict = cast(dict[str, Any], row)
query_id = f"query-{split}-{row_dict['query-id']}"
corpus_id = f"corpus-{split}-{row_dict['corpus-id']}"
query_id = f"query-{split}-{row['query-id']}"
corpus_id = f"corpus-{split}-{row['corpus-id']}"
if query_id not in qrels:
qrels[query_id] = {}
qrels[query_id][corpus_id] = int(row_dict["score"])
qrels[query_id][corpus_id] = int(row["score"])
print(
f"Loaded {len(queries)} queries, {len(corpus)} corpus items, {len(qrels)} query-relevance mappings"
@@ -208,13 +204,13 @@ def evaluate_task(
raise ValueError(f"Unknown task: {task_name}. Available: {list(VIDORE_V2_TASKS.keys())}")
task_config = VIDORE_V2_TASKS[task_name]
dataset_path = str(task_config["dataset_path"])
revision = str(task_config["revision"])
dataset_path = task_config["dataset_path"]
revision = task_config["revision"]
# Determine language
if language is None:
# Use first language if multiple available
languages = cast(Optional[list[str]], task_config.get("languages"))
languages = task_config.get("languages")
if languages is None:
# Task doesn't support language filtering (e.g., Vidore2ESGReportsHLRetrieval)
language = None
@@ -273,7 +269,7 @@ def evaluate_task(
)
# Search queries
task_prompt = cast(Optional[dict[str, str]], task_config.get("prompt"))
task_prompt = task_config.get("prompt")
results = evaluator.search_queries(
queries=queries,
corpus_ids=corpus_ids_ordered,

View File

@@ -177,9 +177,7 @@ class SlackMCPReader:
break
# If we get here, all retries failed or it's not a retryable error
if last_exception is not None:
raise last_exception
raise RuntimeError("Unexpected error: no exception captured during retry loop")
raise last_exception
async def fetch_slack_messages(
self, channel: Optional[str] = None, limit: int = 100
@@ -269,10 +267,7 @@ class SlackMCPReader:
messages = json.loads(content["text"])
except json.JSONDecodeError:
# If not JSON, try to parse as CSV format (Slack MCP server format)
text_content = content.get("text", "")
messages = self._parse_csv_messages(
text_content if text_content else "", channel or "unknown"
)
messages = self._parse_csv_messages(content["text"], channel)
else:
messages = result["content"]
else:

View File

@@ -11,7 +11,6 @@ Usage:
import argparse
import asyncio
from typing import Any
from apps.base_rag_example import BaseRAGExample
from apps.slack_data.slack_mcp_reader import SlackMCPReader
@@ -140,7 +139,7 @@ class SlackMCPRAG(BaseRAGExample):
print("4. Try running the MCP server command directly to test it")
return False
async def load_data(self, args) -> list[dict[str, Any]]:
async def load_data(self, args) -> list[str]:
"""Load Slack messages via MCP server."""
print(f"Connecting to Slack MCP server: {args.mcp_server}")
@@ -189,8 +188,7 @@ class SlackMCPRAG(BaseRAGExample):
print(sample_text)
print("-" * 40)
# Convert strings to dict format expected by base class
return [{"text": text, "metadata": {"source": "slack"}} for text in texts]
return texts
except Exception as e:
print(f"Error loading Slack data: {e}")

View File

@@ -11,7 +11,6 @@ Usage:
import argparse
import asyncio
from typing import Any
from apps.base_rag_example import BaseRAGExample
from apps.twitter_data.twitter_mcp_reader import TwitterMCPReader
@@ -117,7 +116,7 @@ class TwitterMCPRAG(BaseRAGExample):
print("5. Try running the MCP server command directly to test it")
return False
async def load_data(self, args) -> list[dict[str, Any]]:
async def load_data(self, args) -> list[str]:
"""Load Twitter bookmarks via MCP server."""
print(f"Connecting to Twitter MCP server: {args.mcp_server}")
@@ -157,8 +156,7 @@ class TwitterMCPRAG(BaseRAGExample):
print(sample_text)
print("-" * 50)
# Convert strings to dict format expected by base class
return [{"text": text, "metadata": {"source": "twitter"}} for text in texts]
return texts
except Exception as e:
print(f"❌ Error loading Twitter bookmarks: {e}")

View File

@@ -6,7 +6,6 @@ Supports WeChat chat history export and search.
import subprocess
import sys
from pathlib import Path
from typing import Any
# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent))
@@ -92,7 +91,7 @@ class WeChatRAG(BaseRAGExample):
print(f"Export error: {e}")
return False
async def load_data(self, args) -> list[dict[str, Any]]:
async def load_data(self, args) -> list[str]:
"""Load WeChat history and convert to text chunks."""
# Initialize WeChat reader with export capabilities
reader = WeChatHistoryReader()

View File

@@ -7,7 +7,7 @@ name = "leann-core"
version = "0.3.5"
description = "Core API and plugin system for LEANN"
readme = "README.md"
requires-python = ">=3.10"
requires-python = ">=3.9"
license = { text = "MIT" }
# All required dependencies included

View File

@@ -239,11 +239,11 @@ def create_ast_chunks(
chunks = chunk_builder.chunkify(code_content)
for chunk in chunks:
chunk_text: str | None = None
astchunk_metadata: dict[str, Any] = {}
chunk_text = None
astchunk_metadata = {}
if hasattr(chunk, "text"):
chunk_text = str(chunk.text) if chunk.text else None
chunk_text = chunk.text
elif isinstance(chunk, str):
chunk_text = chunk
elif isinstance(chunk, dict):

View File

@@ -19,7 +19,7 @@ from .settings import (
)
def extract_pdf_text_with_pymupdf(file_path: str) -> str | None:
def extract_pdf_text_with_pymupdf(file_path: str) -> str:
"""Extract text from PDF using PyMuPDF for better quality."""
try:
import fitz # PyMuPDF
@@ -35,7 +35,7 @@ def extract_pdf_text_with_pymupdf(file_path: str) -> str | None:
return None
def extract_pdf_text_with_pdfplumber(file_path: str) -> str | None:
def extract_pdf_text_with_pdfplumber(file_path: str) -> str:
"""Extract text from PDF using pdfplumber for better quality."""
try:
import pdfplumber

View File

@@ -451,8 +451,7 @@ def compute_embeddings_sentence_transformers(
# TODO: Haven't tested this yet
torch.set_num_threads(min(8, os.cpu_count() or 4))
try:
# PyTorch's ContextProp type is complex; cast for type checker
torch.backends.mkldnn.enabled = True # type: ignore[assignment]
torch.backends.mkldnn.enabled = True
except AttributeError:
pass

View File

@@ -11,15 +11,14 @@ from pathlib import Path
from typing import Callable, Optional
# Try to import readline with fallback for Windows
HAS_READLINE = False
readline = None # type: ignore[assignment]
try:
import readline # type: ignore[no-redef]
import readline
HAS_READLINE = True
except ImportError:
# Windows doesn't have readline by default
pass
HAS_READLINE = False
readline = None
class InteractiveSession:

View File

@@ -7,7 +7,7 @@ operators for different data types including numbers, strings, booleans, and lis
"""
import logging
from typing import Any, Optional, Union
from typing import Any, Union
logger = logging.getLogger(__name__)
@@ -47,7 +47,7 @@ class MetadataFilterEngine:
}
def apply_filters(
self, search_results: list[dict[str, Any]], metadata_filters: Optional[MetadataFilters]
self, search_results: list[dict[str, Any]], metadata_filters: MetadataFilters
) -> list[dict[str, Any]]:
"""
Apply metadata filters to a list of search results.

View File

@@ -56,9 +56,7 @@ class BaseSearcher(LeannBackendSearcherInterface, ABC):
with open(meta_path, encoding="utf-8") as f:
return json.load(f)
def _ensure_server_running(
self, passages_source_file: str, port: Optional[int], **kwargs
) -> int:
def _ensure_server_running(self, passages_source_file: str, port: int, **kwargs) -> int:
"""
Ensures the embedding server is running if recompute is needed.
This is a helper for subclasses.
@@ -83,7 +81,7 @@ class BaseSearcher(LeannBackendSearcherInterface, ABC):
}
server_started, actual_port = self.embedding_server_manager.start_server(
port=port if port is not None else 5557,
port=port,
model_name=self.embedding_model,
embedding_mode=self.embedding_mode,
passages_file=passages_source_file,
@@ -100,7 +98,7 @@ class BaseSearcher(LeannBackendSearcherInterface, ABC):
self,
query: str,
use_server_if_available: bool = True,
zmq_port: Optional[int] = None,
zmq_port: int = 5557,
query_template: Optional[str] = None,
) -> np.ndarray:
"""

View File

@@ -7,7 +7,7 @@ name = "leann"
version = "0.3.5"
description = "LEANN - The smallest vector index in the world. RAG Everything with LEANN!"
readme = "README.md"
requires-python = ">=3.10"
requires-python = ">=3.9"
license = { text = "MIT" }
authors = [
{ name = "LEANN Team" }
@@ -18,10 +18,10 @@ classifiers = [
"Intended Audience :: Developers",
"License :: OSI Approved :: MIT License",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Programming Language :: Python :: 3.13",
]
# Default installation: core + hnsw + diskann

View File

@@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta"
[project]
name = "leann-workspace"
version = "0.1.0"
requires-python = ">=3.10"
requires-python = ">=3.9"
dependencies = [
"leann-core",
@@ -157,19 +157,6 @@ exclude = ["localhost", "127.0.0.1", "example.com"]
exclude_path = [".git/", ".venv/", "__pycache__/", "third_party/"]
scheme = ["https", "http"]
[tool.ty]
# Type checking with ty (Astral's fast Python type checker)
# ty is 10-100x faster than mypy. See: https://docs.astral.sh/ty/
[tool.ty.environment]
python-version = "3.11"
extra-paths = ["apps", "packages/leann-core/src"]
[tool.ty.rules]
# Disable some noisy rules that have many false positives
possibly-missing-attribute = "ignore"
unresolved-import = "ignore" # Many optional dependencies
[tool.pytest.ini_options]
testpaths = ["tests"]
python_files = ["test_*.py"]

View File

@@ -91,7 +91,7 @@ def test_large_index():
builder.build_index(index_path)
searcher = LeannSearcher(index_path)
results = searcher.search("word10 word20", top_k=10)
assert len(results) == 10
results = searcher.search(["word10 word20"], top_k=10)
assert len(results[0]) == 10
# Cleanup
searcher.cleanup()

View File

@@ -123,7 +123,7 @@ class TestPromptTemplateStoredInEmbeddingOptions:
cli = LeannCLI()
# Mock load_documents to return a document so builder is created
cli.load_documents = Mock(return_value=[{"text": "test content", "metadata": {}}]) # type: ignore[assignment]
cli.load_documents = Mock(return_value=[{"text": "test content", "metadata": {}}])
parser = cli.create_parser()
@@ -175,7 +175,7 @@ class TestPromptTemplateStoredInEmbeddingOptions:
cli = LeannCLI()
# Mock load_documents to return a document so builder is created
cli.load_documents = Mock(return_value=[{"text": "test content", "metadata": {}}]) # type: ignore[assignment]
cli.load_documents = Mock(return_value=[{"text": "test content", "metadata": {}}])
parser = cli.create_parser()
@@ -230,7 +230,7 @@ class TestPromptTemplateStoredInEmbeddingOptions:
cli = LeannCLI()
# Mock load_documents to return a document so builder is created
cli.load_documents = Mock(return_value=[{"text": "test content", "metadata": {}}]) # type: ignore[assignment]
cli.load_documents = Mock(return_value=[{"text": "test content", "metadata": {}}])
parser = cli.create_parser()
@@ -307,7 +307,7 @@ class TestPromptTemplateStoredInEmbeddingOptions:
cli = LeannCLI()
# Mock load_documents to return a document so builder is created
cli.load_documents = Mock(return_value=[{"text": "test content", "metadata": {}}]) # type: ignore[assignment]
cli.load_documents = Mock(return_value=[{"text": "test content", "metadata": {}}])
parser = cli.create_parser()
@@ -376,7 +376,7 @@ class TestPromptTemplateStoredInEmbeddingOptions:
cli = LeannCLI()
# Mock load_documents to return a document so builder is created
cli.load_documents = Mock(return_value=[{"text": "test content", "metadata": {}}]) # type: ignore[assignment]
cli.load_documents = Mock(return_value=[{"text": "test content", "metadata": {}}])
parser = cli.create_parser()
@@ -432,7 +432,7 @@ class TestPromptTemplateFlowsToComputeEmbeddings:
cli = LeannCLI()
# Mock load_documents to return a simple document
cli.load_documents = Mock(return_value=[{"text": "test content", "metadata": {}}]) # type: ignore[assignment]
cli.load_documents = Mock(return_value=[{"text": "test content", "metadata": {}}])
parser = cli.create_parser()

View File

@@ -67,7 +67,7 @@ def check_lmstudio_available() -> bool:
return False
def get_lmstudio_first_model() -> str | None:
def get_lmstudio_first_model() -> str:
"""Get the first available model from LM Studio."""
try:
response = requests.get("http://localhost:1234/v1/models", timeout=5.0)
@@ -91,7 +91,6 @@ class TestPromptTemplateOpenAI:
model_name = get_lmstudio_first_model()
if not model_name:
pytest.skip("No models loaded in LM Studio")
assert model_name is not None # Type narrowing for type checker
texts = ["artificial intelligence", "machine learning"]
prompt_template = "search_query: "
@@ -121,7 +120,6 @@ class TestPromptTemplateOpenAI:
model_name = get_lmstudio_first_model()
if not model_name:
pytest.skip("No models loaded in LM Studio")
assert model_name is not None # Type narrowing for type checker
text = "machine learning"
base_url = "http://localhost:1234/v1"
@@ -273,7 +271,6 @@ class TestLMStudioSDK:
model_name = get_lmstudio_first_model()
if not model_name:
pytest.skip("No models loaded in LM Studio")
assert model_name is not None # Type narrowing for type checker
try:
from leann.embedding_compute import _query_lmstudio_context_limit

View File

@@ -581,18 +581,7 @@ class TestQueryTemplateApplicationInComputeEmbedding:
# Create a concrete implementation for testing
class TestSearcher(BaseSearcher):
def search(
self,
query,
top_k,
complexity=64,
beam_width=1,
prune_ratio=0.0,
recompute_embeddings=False,
pruning_strategy="global",
zmq_port=None,
**kwargs,
):
def search(self, query_vectors, top_k, complexity, beam_width=1, **kwargs):
return {"labels": [], "distances": []}
searcher = object.__new__(TestSearcher)
@@ -636,18 +625,7 @@ class TestQueryTemplateApplicationInComputeEmbedding:
# Create a concrete implementation for testing
class TestSearcher(BaseSearcher):
def search(
self,
query,
top_k,
complexity=64,
beam_width=1,
prune_ratio=0.0,
recompute_embeddings=False,
pruning_strategy="global",
zmq_port=None,
**kwargs,
):
def search(self, query_vectors, top_k, complexity, beam_width=1, **kwargs):
return {"labels": [], "distances": []}
searcher = object.__new__(TestSearcher)
@@ -693,18 +671,7 @@ class TestQueryTemplateApplicationInComputeEmbedding:
from leann.searcher_base import BaseSearcher
class TestSearcher(BaseSearcher):
def search(
self,
query,
top_k,
complexity=64,
beam_width=1,
prune_ratio=0.0,
recompute_embeddings=False,
pruning_strategy="global",
zmq_port=None,
**kwargs,
):
def search(self, query_vectors, top_k, complexity, beam_width=1, **kwargs):
return {"labels": [], "distances": []}
searcher = object.__new__(TestSearcher)
@@ -743,18 +710,7 @@ class TestQueryTemplateApplicationInComputeEmbedding:
from leann.searcher_base import BaseSearcher
class TestSearcher(BaseSearcher):
def search(
self,
query,
top_k,
complexity=64,
beam_width=1,
prune_ratio=0.0,
recompute_embeddings=False,
pruning_strategy="global",
zmq_port=None,
**kwargs,
):
def search(self, query_vectors, top_k, complexity, beam_width=1, **kwargs):
return {"labels": [], "distances": []}
searcher = object.__new__(TestSearcher)
@@ -818,18 +774,7 @@ class TestQueryTemplateApplicationInComputeEmbedding:
from leann.searcher_base import BaseSearcher
class TestSearcher(BaseSearcher):
def search(
self,
query,
top_k,
complexity=64,
beam_width=1,
prune_ratio=0.0,
recompute_embeddings=False,
pruning_strategy="global",
zmq_port=None,
**kwargs,
):
def search(self, query_vectors, top_k, complexity, beam_width=1, **kwargs):
return {"labels": [], "distances": []}
searcher = object.__new__(TestSearcher)

View File

@@ -97,17 +97,17 @@ def test_backend_options():
with tempfile.TemporaryDirectory() as temp_dir:
# Use smaller model in CI to avoid memory issues
is_ci = os.environ.get("CI") == "true"
embedding_model = (
"sentence-transformers/all-MiniLM-L6-v2" if is_ci else "facebook/contriever"
)
dimensions = 384 if is_ci else None
if os.environ.get("CI") == "true":
model_args = {
"embedding_model": "sentence-transformers/all-MiniLM-L6-v2",
"dimensions": 384,
}
else:
model_args = {}
# Test HNSW backend (as shown in README)
hnsw_path = str(Path(temp_dir) / "test_hnsw.leann")
builder_hnsw = LeannBuilder(
backend_name="hnsw", embedding_model=embedding_model, dimensions=dimensions
)
builder_hnsw = LeannBuilder(backend_name="hnsw", **model_args)
builder_hnsw.add_text("Test document for HNSW backend")
builder_hnsw.build_index(hnsw_path)
assert Path(hnsw_path).parent.exists()
@@ -115,9 +115,7 @@ def test_backend_options():
# Test DiskANN backend (mentioned as available option)
diskann_path = str(Path(temp_dir) / "test_diskann.leann")
builder_diskann = LeannBuilder(
backend_name="diskann", embedding_model=embedding_model, dimensions=dimensions
)
builder_diskann = LeannBuilder(backend_name="diskann", **model_args)
builder_diskann.add_text("Test document for DiskANN backend")
builder_diskann.build_index(diskann_path)
assert Path(diskann_path).parent.exists()

1163
uv.lock generated
View File

File diff suppressed because it is too large Load Diff