Compare commits
6 Commits
feat/diska ... fix/openai

| Author | SHA1 | Date |
|---|---|---|
|  | 4b4b825fec |  |
|  | 34ef0db42f |  |
|  | 41812c7d22 |  |
|  | 2047a1a128 |  |
|  | 402e8f97ad |  |
|  | 9a5c197acd |  |

61 .github/workflows/build-reusable.yml (vendored)

```diff
@@ -97,8 +97,7 @@ jobs:
       - name: Install system dependencies (macOS)
         if: runner.os == 'macOS'
         run: |
-          # Don't install LLVM, use system clang for better compatibility
-          brew install libomp boost protobuf zeromq
+          brew install llvm libomp boost protobuf zeromq

       - name: Install build dependencies
         run: |
@@ -121,11 +120,7 @@ jobs:
           # Build HNSW backend
           cd packages/leann-backend-hnsw
           if [ "${{ matrix.os }}" == "macos-latest" ]; then
-            # Use system clang instead of homebrew LLVM for better compatibility
-            export CC=clang
-            export CXX=clang++
-            export MACOSX_DEPLOYMENT_TARGET=11.0
-            uv build --wheel --python python
+            CC=$(brew --prefix llvm)/bin/clang CXX=$(brew --prefix llvm)/bin/clang++ uv build --wheel --python python
           else
             uv build --wheel --python python
           fi
@@ -134,12 +129,7 @@ jobs:
           # Build DiskANN backend
           cd packages/leann-backend-diskann
           if [ "${{ matrix.os }}" == "macos-latest" ]; then
-            # Use system clang instead of homebrew LLVM for better compatibility
-            export CC=clang
-            export CXX=clang++
-            # DiskANN requires macOS 13.3+ for sgesdd_ LAPACK function
-            export MACOSX_DEPLOYMENT_TARGET=13.3
-            uv build --wheel --python python
+            CC=$(brew --prefix llvm)/bin/clang CXX=$(brew --prefix llvm)/bin/clang++ uv build --wheel --python python
           else
             uv build --wheel --python python
           fi
@@ -199,51 +189,6 @@ jobs:
           echo "📦 Built packages:"
           find packages/*/dist -name "*.whl" -o -name "*.tar.gz" | sort

-      - name: Install built packages for testing
-        run: |
-          # Create a virtual environment
-          uv venv
-          source .venv/bin/activate || source .venv/Scripts/activate
-
-          # Install the built wheels
-          # Use --find-links to let uv choose the correct wheel for the platform
-          if [[ "${{ matrix.os }}" == ubuntu-* ]]; then
-            uv pip install leann-core --find-links packages/leann-core/dist
-            uv pip install leann --find-links packages/leann/dist
-          fi
-          uv pip install leann-backend-hnsw --find-links packages/leann-backend-hnsw/dist
-          uv pip install leann-backend-diskann --find-links packages/leann-backend-diskann/dist
-
-          # Install test dependencies using extras
-          uv pip install -e ".[test]"
-
-      - name: Run tests with pytest
-        env:
-          CI: true # Mark as CI environment to skip memory-intensive tests
-          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
-          HF_HUB_DISABLE_SYMLINKS: 1
-          TOKENIZERS_PARALLELISM: false
-          PYTORCH_ENABLE_MPS_FALLBACK: 0 # Disable MPS on macOS CI to avoid memory issues
-          OMP_NUM_THREADS: 1 # Disable OpenMP parallelism to avoid libomp crashes
-          MKL_NUM_THREADS: 1 # Single thread for MKL operations
-        run: |
-          # Activate virtual environment
-          source .venv/bin/activate || source .venv/Scripts/activate
-
-          # Run all tests
-          pytest tests/
-
-      - name: Run sanity checks (optional)
-        run: |
-          # Activate virtual environment
-          source .venv/bin/activate || source .venv/Scripts/activate
-
-          # Run distance function tests if available
-          if [ -f test/sanity_checks/test_distance_functions.py ]; then
-            echo "Running distance function sanity checks..."
-            python test/sanity_checks/test_distance_functions.py || echo "⚠️ Distance function test failed, continuing..."
-          fi
-
       - name: Upload artifacts
         uses: actions/upload-artifact@v4
         with:
```

2 .gitignore (vendored)

```diff
@@ -86,5 +86,3 @@ packages/leann-backend-diskann/third_party/DiskANN/_deps/
 *.passages.json

 batchtest.py
-tests/__pytest_cache__/
-tests/__pycache__/
```

29 README.md

````diff
@@ -174,28 +174,15 @@ Ask questions directly about your personal PDFs, documents, and any directory co
 <img src="videos/paper_clear.gif" alt="LEANN Document Search Demo" width="600">
 </p>

-The example below asks a question about summarizing two papers (uses default data in `examples/data`) and this is the easiest example to run here:
+The example below asks a question about summarizing two papers (uses default data in `examples/data`):

-```bash
+```
+# Or use python directly
 source .venv/bin/activate
 python ./examples/main_cli_example.py
 ```

-<details>
-<summary><strong>📋 Click to expand: User Configurable Arguments</strong></summary>
-
-```bash
-# Use custom index directory
-python examples/main_cli_example.py --index-dir "./my_custom_index"
-
-# Use custom data directory
-python examples/main_cli_example.py --data-dir "./my_documents"
-
-# Ask a specific question
-python examples/main_cli_example.py --query "What are the main findings in these papers?"
-```
-
-</details>
-
 ### 📧 Your Personal Email Secretary: RAG on Apple Mail!

@@ -208,12 +195,12 @@ python examples/main_cli_example.py --query "What are the main findings in these

 **Note:** You need to grant full disk access to your terminal/VS Code in System Preferences → Privacy & Security → Full Disk Access.
 ```bash
-python examples/mail_reader_leann.py --query "What's the food I ordered by DoorDash or Uber Eats mostly?"
+python examples/mail_reader_leann.py --query "What's the food I ordered by doordash or Uber eat mostly?"
 ```
-**780K email chunks → 78MB storage.** Finally, search your email like you search Google.
+**780K email chunks → 78MB storage** Finally, search your email like you search Google.

 <details>
-<summary><strong>📋 Click to expand: User Configurable Arguments</strong></summary>
+<summary><strong>📋 Click to expand: Command Examples</strong></summary>

 ```bash
 # Use default mail path (works for most macOS setups)
@@ -255,7 +242,7 @@ python examples/google_history_reader_leann.py --query "Tell me my browser histo
 **38K browser entries → 6MB storage.** Your browser history becomes your personal search engine.

 <details>
-<summary><strong>📋 Click to expand: User Configurable Arguments</strong></summary>
+<summary><strong>📋 Click to expand: Command Examples</strong></summary>

 ```bash
 # Use default Chrome profile (auto-finds all profiles)
@@ -332,7 +319,7 @@ Failed to find or export WeChat data. Exiting.
 </details>

 <details>
-<summary><strong>📋 Click to expand: User Configurable Arguments</strong></summary>
+<summary><strong>📋 Click to expand: Command Examples</strong></summary>

 ```bash
 # Use default settings (recommended for first run)
````

(deleted file: an embedding-comparison example; path not shown in this view)

```diff
@@ -1,98 +0,0 @@
-"""
-Comparison between Sentence Transformers and OpenAI embeddings
-
-This example shows how different embedding models handle complex queries
-and demonstrates the differences between local and API-based embeddings.
-"""
-
-import numpy as np
-from leann.embedding_compute import compute_embeddings
-
-# OpenAI API key should be set as environment variable
-# export OPENAI_API_KEY="your-api-key-here"
-
-# Test data
-conference_text = "[Title]: COLING 2025 Conference\n[URL]: https://coling2025.org/"
-browser_text = "[Title]: Browser Use Tool\n[URL]: https://github.com/browser-use"
-
-# Two queries with same intent but different wording
-query1 = "Tell me my browser history about some conference i often visit"
-query2 = "browser history about conference I often visit"
-
-texts = [query1, query2, conference_text, browser_text]
-
-
-def cosine_similarity(a, b):
-    return np.dot(a, b)  # Already normalized
-
-
-def analyze_embeddings(embeddings, model_name):
-    print(f"\n=== {model_name} Results ===")
-
-    # Results for Query 1
-    sim1_conf = cosine_similarity(embeddings[0], embeddings[2])
-    sim1_browser = cosine_similarity(embeddings[0], embeddings[3])
-
-    print(f"Query 1: '{query1}'")
-    print(f" → Conference similarity: {sim1_conf:.4f} {'✓' if sim1_conf > sim1_browser else ''}")
-    print(
-        f" → Browser similarity: {sim1_browser:.4f} {'✓' if sim1_browser > sim1_conf else ''}"
-    )
-    print(f" Winner: {'Conference' if sim1_conf > sim1_browser else 'Browser'}")
-
-    # Results for Query 2
-    sim2_conf = cosine_similarity(embeddings[1], embeddings[2])
-    sim2_browser = cosine_similarity(embeddings[1], embeddings[3])
-
-    print(f"\nQuery 2: '{query2}'")
-    print(f" → Conference similarity: {sim2_conf:.4f} {'✓' if sim2_conf > sim2_browser else ''}")
-    print(
-        f" → Browser similarity: {sim2_browser:.4f} {'✓' if sim2_browser > sim2_conf else ''}"
-    )
-    print(f" Winner: {'Conference' if sim2_conf > sim2_browser else 'Browser'}")
-
-    # Show the impact
-    print("\n=== Impact Analysis ===")
-    print(f"Conference similarity change: {sim2_conf - sim1_conf:+.4f}")
-    print(f"Browser similarity change: {sim2_browser - sim1_browser:+.4f}")
-
-    if sim1_conf > sim1_browser and sim2_browser > sim2_conf:
-        print("❌ FLIP: Adding 'browser history' flips winner from Conference to Browser!")
-    elif sim1_conf > sim1_browser and sim2_conf > sim2_browser:
-        print("✅ STABLE: Conference remains winner in both queries")
-    elif sim1_browser > sim1_conf and sim2_browser > sim2_conf:
-        print("✅ STABLE: Browser remains winner in both queries")
-    else:
-        print("🔄 MIXED: Results vary between queries")
-
-    return {
-        "query1_conf": sim1_conf,
-        "query1_browser": sim1_browser,
-        "query2_conf": sim2_conf,
-        "query2_browser": sim2_browser,
-    }
-
-
-# Test Sentence Transformers
-print("Testing Sentence Transformers (facebook/contriever)...")
-try:
-    st_embeddings = compute_embeddings(texts, "facebook/contriever", mode="sentence-transformers")
-    st_results = analyze_embeddings(st_embeddings, "Sentence Transformers (facebook/contriever)")
-except Exception as e:
-    print(f"❌ Sentence Transformers failed: {e}")
-    st_results = None
-
-# Test OpenAI
-print("\n" + "=" * 60)
-print("Testing OpenAI (text-embedding-3-small)...")
-try:
-    openai_embeddings = compute_embeddings(texts, "text-embedding-3-small", mode="openai")
-    openai_results = analyze_embeddings(openai_embeddings, "OpenAI (text-embedding-3-small)")
-except Exception as e:
-    print(f"❌ OpenAI failed: {e}")
-    openai_results = None
-
-# Compare results
-if st_results and openai_results:
-    print("\n" + "=" * 60)
-    print("=== COMPARISON SUMMARY ===")
```

```diff
@@ -64,19 +64,9 @@ async def main(args):
     print("\n[PHASE 2] Starting Leann chat session...")

-    # Build llm_config based on command line arguments
-    if args.llm == "simulated":
-        llm_config = {"type": "simulated"}
-    elif args.llm == "ollama":
-        llm_config = {"type": "ollama", "model": args.model, "host": args.host}
-    elif args.llm == "hf":
-        llm_config = {"type": "hf", "model": args.model}
-    elif args.llm == "openai":
-        llm_config = {"type": "openai", "model": args.model}
-    else:
-        raise ValueError(f"Unknown LLM type: {args.llm}")
-
-    print(f"Using LLM: {args.llm} with model: {args.model if args.llm != 'simulated' else 'N/A'}")
-
+    llm_config = {"type": "hf", "model": "Qwen/Qwen3-4B"}
+    llm_config = {"type": "ollama", "model": "qwen3:8b"}
+    llm_config = {"type": "openai", "model": "gpt-4o"}
     chat = LeannChat(index_path=INDEX_PATH, llm_config=llm_config)
     # query = (
@@ -94,14 +84,14 @@ if __name__ == "__main__":
     parser.add_argument(
         "--llm",
         type=str,
-        default="openai",
+        default="hf",
         choices=["simulated", "ollama", "hf", "openai"],
         help="The LLM backend to use.",
     )
     parser.add_argument(
         "--model",
         type=str,
-        default="gpt-4o",
+        default="Qwen/Qwen3-0.6B",
        help="The model name to use (e.g., 'llama3:8b' for ollama, 'deepseek-ai/deepseek-llm-7b-chat' for hf, 'gpt-4o' for openai).",
     )
     parser.add_argument(
```
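
If the replacement lines on the new side land exactly as rendered here (none commented out), only the final assignment survives; the first two are dead stores, so the chat is effectively pinned to OpenAI regardless of `--llm`. A minimal illustration:

```python
# Successive rebindings: each assignment replaces the previous dict.
llm_config = {"type": "hf", "model": "Qwen/Qwen3-4B"}
llm_config = {"type": "ollama", "model": "qwen3:8b"}
llm_config = {"type": "openai", "model": "gpt-4o"}
assert llm_config == {"type": "openai", "model": "gpt-4o"}
```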

```diff
@@ -7,7 +7,6 @@ from pathlib import Path
 from typing import Any, Literal

 import numpy as np
-import psutil
 from leann.interface import (
     LeannBackendBuilderInterface,
     LeannBackendFactoryInterface,
@@ -85,43 +84,6 @@ def _write_vectors_to_bin(data: np.ndarray, file_path: Path):
         f.write(data.tobytes())


-def _calculate_smart_memory_config(data: np.ndarray) -> tuple[float, float]:
-    """
-    Calculate smart memory configuration for DiskANN based on data size and system specs.
-
-    Args:
-        data: The embedding data array
-
-    Returns:
-        tuple: (search_memory_maximum, build_memory_maximum) in GB
-    """
-    num_vectors, dim = data.shape
-
-    # Calculate embedding storage size
-    embedding_size_bytes = num_vectors * dim * 4  # float32 = 4 bytes
-    embedding_size_gb = embedding_size_bytes / (1024**3)
-
-    # search_memory_maximum: 1/10 of embedding size for optimal PQ compression
-    # This controls Product Quantization size - smaller means more compression
-    search_memory_gb = max(0.1, embedding_size_gb / 10)  # At least 100MB
-
-    # build_memory_maximum: Based on available system RAM for sharding control
-    # This controls how much memory DiskANN uses during index construction
-    available_memory_gb = psutil.virtual_memory().available / (1024**3)
-    total_memory_gb = psutil.virtual_memory().total / (1024**3)
-
-    # Use 50% of available memory, but at least 2GB and at most 75% of total
-    build_memory_gb = max(2.0, min(available_memory_gb * 0.5, total_memory_gb * 0.75))
-
-    logger.info(
-        f"Smart memory config - Data: {embedding_size_gb:.2f}GB, "
-        f"Search mem: {search_memory_gb:.2f}GB (PQ control), "
-        f"Build mem: {build_memory_gb:.2f}GB (sharding control)"
-    )
-
-    return search_memory_gb, build_memory_gb
-
-
 @register_backend("diskann")
 class DiskannBackend(LeannBackendFactoryInterface):
     @staticmethod
```
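
For reference, the heuristic the deleted helper implemented, worked through on illustrative numbers (my example, not data from the repo):

```python
# 1M vectors of dimension 768 stored as float32 (4 bytes per value).
num_vectors, dim = 1_000_000, 768
embedding_size_gb = num_vectors * dim * 4 / (1024**3)  # ≈ 2.86 GB of raw embeddings
search_memory_gb = max(0.1, embedding_size_gb / 10)    # ≈ 0.29 GB PQ budget, 100 MB floor
```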

```diff
@@ -159,16 +121,6 @@ class DiskannBuilder(LeannBackendBuilderInterface):
                 f"Unsupported distance_metric '{build_kwargs.get('distance_metric', 'unknown')}'."
             )

-        # Calculate smart memory configuration if not explicitly provided
-        if (
-            "search_memory_maximum" not in build_kwargs
-            or "build_memory_maximum" not in build_kwargs
-        ):
-            smart_search_mem, smart_build_mem = _calculate_smart_memory_config(data)
-        else:
-            smart_search_mem = build_kwargs.get("search_memory_maximum", 4.0)
-            smart_build_mem = build_kwargs.get("build_memory_maximum", 8.0)
-
         try:
             from . import _diskannpy as diskannpy  # type: ignore

@@ -179,8 +131,8 @@ class DiskannBuilder(LeannBackendBuilderInterface):
             index_prefix,
             build_kwargs.get("complexity", 64),
             build_kwargs.get("graph_degree", 32),
-            build_kwargs.get("search_memory_maximum", smart_search_mem),
-            build_kwargs.get("build_memory_maximum", smart_build_mem),
+            build_kwargs.get("search_memory_maximum", 4.0),
+            build_kwargs.get("build_memory_maximum", 8.0),
             build_kwargs.get("num_threads", 8),
             build_kwargs.get("pq_disk_bytes", 0),
             "",
```
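
With the adaptive defaults gone, builds that need other limits must pass them explicitly. A sketch under the assumption (suggested by the test code later in this diff) that `LeannBuilder` forwards extra keyword arguments to the backend as `build_kwargs`:

```python
from leann.api import LeannBuilder

# Values are in GB, mirroring the fixed defaults in the hunk above.
builder = LeannBuilder(
    backend_name="diskann",
    search_memory_maximum=4.0,  # search-time (PQ) memory budget
    build_memory_maximum=8.0,   # construction-time memory budget
)
```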

```diff
@@ -36,7 +36,6 @@ def create_diskann_embedding_server(
     zmq_port: int = 5555,
     model_name: str = "sentence-transformers/all-mpnet-base-v2",
     embedding_mode: str = "sentence-transformers",
-    distance_metric: str = "l2",
 ):
     """
     Create and start a ZMQ-based embedding server for DiskANN backend.
@@ -264,13 +263,6 @@ if __name__ == "__main__":
         choices=["sentence-transformers", "openai", "mlx"],
         help="Embedding backend mode",
     )
-    parser.add_argument(
-        "--distance-metric",
-        type=str,
-        default="l2",
-        choices=["l2", "mips", "cosine"],
-        help="Distance metric for similarity computation",
-    )

     args = parser.parse_args()
@@ -280,5 +272,4 @@ if __name__ == "__main__":
         zmq_port=args.zmq_port,
         model_name=args.model_name,
         embedding_mode=args.embedding_mode,
-        distance_metric=args.distance_metric,
     )
```

```diff
@@ -4,8 +4,8 @@ build-backend = "scikit_build_core.build"

 [project]
 name = "leann-backend-diskann"
-version = "0.1.16"
-dependencies = ["leann-core==0.1.16", "numpy", "protobuf>=3.19.0"]
+version = "0.1.15"
+dependencies = ["leann-core==0.1.15", "numpy", "protobuf>=3.19.0"]

 [tool.scikit-build]
 # Key: simplified CMake path
```
Submodule packages/leann-backend-diskann/third_party/DiskANN updated: 67a2611ad1...af2a26481e
```diff
@@ -10,14 +10,6 @@ if(APPLE)
   set(OpenMP_C_LIB_NAMES "omp")
   set(OpenMP_CXX_LIB_NAMES "omp")
   set(OpenMP_omp_LIBRARY "/opt/homebrew/opt/libomp/lib/libomp.dylib")
-
-  # Force use of system libc++ to avoid version mismatch
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -stdlib=libc++")
-  set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -stdlib=libc++")
-  set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -stdlib=libc++")
-
-  # Set minimum macOS version for better compatibility
-  set(CMAKE_OSX_DEPLOYMENT_TARGET "11.0" CACHE STRING "Minimum macOS version")
 endif()

 # Use system ZeroMQ instead of building from source
```

```diff
@@ -6,10 +6,10 @@ build-backend = "scikit_build_core.build"

 [project]
 name = "leann-backend-hnsw"
-version = "0.1.16"
+version = "0.1.15"
 description = "Custom-built HNSW (Faiss) backend for the Leann toolkit."
 dependencies = [
-    "leann-core==0.1.16",
+    "leann-core==0.1.15",
     "numpy",
     "pyzmq>=23.0.0",
     "msgpack>=1.0.0",
```

```diff
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

 [project]
 name = "leann-core"
-version = "0.1.16"
+version = "0.1.15"
 description = "Core API and plugin system for LEANN"
 readme = "README.md"
 requires-python = ">=3.9"
```

```diff
@@ -8,10 +8,6 @@ if platform.system() == "Darwin":
     os.environ["MKL_NUM_THREADS"] = "1"
     os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
     os.environ["KMP_BLOCKTIME"] = "0"
-    # Additional fixes for PyTorch/sentence-transformers on macOS ARM64 only in CI
-    if os.environ.get("CI") == "true":
-        os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "0"
-        os.environ["TOKENIZERS_PARALLELISM"] = "false"

 from .api import LeannBuilder, LeannChat, LeannSearcher
 from .registry import BACKEND_REGISTRY, autodiscover_backends
```

```diff
@@ -23,11 +23,6 @@ from .registry import BACKEND_REGISTRY
 logger = logging.getLogger(__name__)


-def get_registered_backends() -> list[str]:
-    """Get list of registered backend names."""
-    return list(BACKEND_REGISTRY.keys())
-
-
 def compute_embeddings(
     chunks: list[str],
     model_name: str,
```
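
Callers that used the removed helper can query the registry directly; this one-liner is the deleted function's own body (the absolute import path is my assumption, inferred from the relative import in the hunk header):

```python
from leann.registry import BACKEND_REGISTRY

# Equivalent to the removed get_registered_backends() helper.
backends = list(BACKEND_REGISTRY.keys())  # e.g. ["hnsw", "diskann"] once backends register
```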

```diff
@@ -542,41 +542,14 @@ class HFChat(LLMInterface):
             self.device = "cpu"
             logger.info("No GPU detected. Using CPU.")

-        # Load tokenizer and model with timeout protection
-        try:
-            import signal
-
-            def timeout_handler(signum, frame):
-                raise TimeoutError("Model download/loading timed out")
-
-            # Set timeout for model loading (60 seconds)
-            old_handler = signal.signal(signal.SIGALRM, timeout_handler)
-            signal.alarm(60)
-
-            try:
-                logger.info(f"Loading tokenizer for {model_name}...")
-                self.tokenizer = AutoTokenizer.from_pretrained(model_name)
-
-                logger.info(f"Loading model {model_name}...")
-                self.model = AutoModelForCausalLM.from_pretrained(
-                    model_name,
-                    torch_dtype=torch.float16 if self.device != "cpu" else torch.float32,
-                    device_map="auto" if self.device != "cpu" else None,
-                    trust_remote_code=True,
-                )
-                logger.info(f"Successfully loaded {model_name}")
-            finally:
-                signal.alarm(0)  # Cancel the alarm
-                signal.signal(signal.SIGALRM, old_handler)  # Restore old handler
-
-        except TimeoutError:
-            logger.error(f"Model loading timed out for {model_name}")
-            raise RuntimeError(
-                f"Model loading timed out for {model_name}. Please check your internet connection or try a smaller model."
-            )
-        except Exception as e:
-            logger.error(f"Failed to load model {model_name}: {e}")
-            raise
-
+        # Load tokenizer and model
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+        self.model = AutoModelForCausalLM.from_pretrained(
+            model_name,
+            torch_dtype=torch.float16 if self.device != "cpu" else torch.float32,
+            device_map="auto" if self.device != "cpu" else None,
+            trust_remote_code=True,
+        )
         # Move model to device if not using device_map
         if self.device != "cpu" and "device_map" not in str(self.model):
```
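
The removed `SIGALRM` guard only works on the main thread of Unix processes, which is presumably why it was dropped. If a load timeout is still wanted, a portable sketch (my suggestion, not code from this PR) runs the load in a worker thread:

```python
from concurrent.futures import ThreadPoolExecutor
from transformers import AutoModelForCausalLM, AutoTokenizer

def load_with_timeout(model_name: str, seconds: float = 60.0):
    """Load tokenizer and model, raising if either step exceeds `seconds`."""
    pool = ThreadPoolExecutor(max_workers=1)
    try:
        tokenizer = pool.submit(AutoTokenizer.from_pretrained, model_name).result(timeout=seconds)
        model = pool.submit(AutoModelForCausalLM.from_pretrained, model_name).result(timeout=seconds)
        return tokenizer, model
    finally:
        pool.shutdown(wait=False)  # don't block on a download that is already stuck
```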

```diff
@@ -354,21 +354,13 @@ class EmbeddingServerManager:
             self.server_process.terminate()

             try:
-                self.server_process.wait(timeout=3)
+                self.server_process.wait(timeout=5)
                 logger.info(f"Server process {self.server_process.pid} terminated.")
             except subprocess.TimeoutExpired:
                 logger.warning(
-                    f"Server process {self.server_process.pid} did not terminate gracefully within 3 seconds, killing it."
+                    f"Server process {self.server_process.pid} did not terminate gracefully, killing it."
                 )
                 self.server_process.kill()
-                try:
-                    self.server_process.wait(timeout=2)
-                    logger.info(f"Server process {self.server_process.pid} killed successfully.")
-                except subprocess.TimeoutExpired:
-                    logger.error(
-                        f"Failed to kill server process {self.server_process.pid} - it may be hung"
-                    )
-                # Don't hang indefinitely

             # Clean up process resources to prevent resource tracker warnings
             try:
```
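
The escalation this hunk trims down is the standard terminate-then-kill pattern; a generic sketch for any `subprocess.Popen` (not tied to this class):

```python
import subprocess

def stop_process(proc: subprocess.Popen, grace: float = 5.0) -> None:
    """Ask the process to exit, then force-kill it after a grace period."""
    proc.terminate()              # polite request (SIGTERM)
    try:
        proc.wait(timeout=grace)  # give it time to shut down cleanly
    except subprocess.TimeoutExpired:
        proc.kill()               # escalate (SIGKILL)
        proc.wait()               # reap so no zombie is left behind
```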

````diff
@@ -5,8 +5,11 @@ LEANN is a revolutionary vector database that democratizes personal AI. Transfor
 ## Installation

 ```bash
-# Default installation (includes both HNSW and DiskANN backends)
+# Default installation (HNSW backend, recommended)
 uv pip install leann
+
+# With DiskANN backend (for large-scale deployments)
+uv pip install leann[diskann]
 ```

 ## Quick Start
@@ -16,8 +19,8 @@ from leann import LeannBuilder, LeannSearcher, LeannChat
 from pathlib import Path
 INDEX_PATH = str(Path("./").resolve() / "demo.leann")

-# Build an index (choose backend: "hnsw" or "diskann")
-builder = LeannBuilder(backend_name="hnsw")  # or "diskann" for large-scale deployments
+# Build an index
+builder = LeannBuilder(backend_name="hnsw")
 builder.add_text("LEANN saves 97% storage compared to traditional vector databases.")
 builder.add_text("Tung Tung Tung Sahur called—they need their banana‑crocodile hybrid back")
 builder.build_index(INDEX_PATH)
````
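
For context, the Quick Start built above continues into search and chat; this sketch is assembled from the `test_readme_examples.py` file deleted later in this diff, not from new API claims:

```python
from leann import LeannChat, LeannSearcher

searcher = LeannSearcher(INDEX_PATH)
results = searcher.search("fantastical AI-generated creatures", top_k=1)

chat = LeannChat(INDEX_PATH, llm_config={"type": "simulated"})
response = chat.ask("How much storage does LEANN save?", top_k=1)
```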

```diff
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

 [project]
 name = "leann"
-version = "0.1.16"
+version = "0.1.15"
 description = "LEANN - The smallest vector index in the world. RAG Everything with LEANN!"
 readme = "README.md"
 requires-python = ">=3.9"
@@ -24,15 +24,16 @@ classifiers = [
     "Programming Language :: Python :: 3.12",
 ]

-# Default installation: core + hnsw + diskann
+# Default installation: core + hnsw
 dependencies = [
     "leann-core>=0.1.0",
     "leann-backend-hnsw>=0.1.0",
-    "leann-backend-diskann>=0.1.0",
 ]

 [project.optional-dependencies]
-# All backends now included by default
+diskann = [
+    "leann-backend-diskann>=0.1.0",
+]

 [project.urls]
 Repository = "https://github.com/yichuan-w/LEANN"
```

```diff
@@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "leann-workspace"
 version = "0.1.0"
-requires-python = ">=3.9"
+requires-python = ">=3.10"

 dependencies = [
     "leann-core",
@@ -33,8 +33,8 @@ dependencies = [
     # LlamaIndex core and readers - updated versions
     "llama-index>=0.12.44",
     "llama-index-readers-file>=0.4.0",  # Essential for PDF parsing
-    # "llama-index-readers-docling", # Requires Python >= 3.10
-    # "llama-index-node-parser-docling", # Requires Python >= 3.10
+    "llama-index-readers-docling",
+    "llama-index-node-parser-docling",
     "llama-index-vector-stores-faiss>=0.4.0",
     "llama-index-embeddings-huggingface>=0.5.5",
     # Other dependencies
@@ -49,7 +49,6 @@ dependencies = [
 dev = [
     "pytest>=7.0",
     "pytest-cov>=4.0",
-    "pytest-xdist>=3.0",  # For parallel test execution
     "black>=23.0",
     "ruff>=0.1.0",
     "matplotlib",
@@ -57,15 +56,6 @@ dev = [
     "pre-commit>=3.5.0",
 ]

-test = [
-    "pytest>=7.0",
-    "pytest-timeout>=2.0",
-    "llama-index-core>=0.12.0",
-    "llama-index-readers-file>=0.4.0",
-    "python-dotenv>=1.0.0",
-    "sentence-transformers>=2.2.0",
-]
-
 diskann = [
     "leann-backend-diskann",
 ]
@@ -133,24 +123,3 @@ line-ending = "auto"
 dev = [
     "ruff>=0.12.4",
 ]
-
-[tool.pytest.ini_options]
-testpaths = ["tests"]
-python_files = ["test_*.py"]
-python_classes = ["Test*"]
-python_functions = ["test_*"]
-markers = [
-    "slow: marks tests as slow (deselect with '-m \"not slow\"')",
-    "openai: marks tests that require OpenAI API key",
-]
-timeout = 600
-addopts = [
-    "-v",
-    "--tb=short",
-    "--strict-markers",
-    "--disable-warnings",
-]
-env = [
-    "HF_HUB_DISABLE_SYMLINKS=1",
-    "TOKENIZERS_PARALLELISM=false",
-]
```

(deleted file: the tests directory README)

````diff
@@ -1,87 +0,0 @@
-# LEANN Tests
-
-This directory contains automated tests for the LEANN project using pytest.
-
-## Test Files
-
-### `test_readme_examples.py`
-Tests the examples shown in README.md:
-- The basic example code that users see first
-- Import statements work correctly
-- Different backend options (HNSW, DiskANN)
-- Different LLM configuration options
-
-### `test_basic.py`
-Basic functionality tests that verify:
-- All packages can be imported correctly
-- C++ extensions (FAISS, DiskANN) load properly
-- Basic index building and searching works for both HNSW and DiskANN backends
-- Uses parametrized tests to test both backends
-
-### `test_main_cli.py`
-Tests the main CLI example functionality:
-- Tests with facebook/contriever embeddings
-- Tests with OpenAI embeddings (if API key is available)
-- Tests error handling with invalid parameters
-- Verifies that normalized embeddings are detected and cosine distance is used
-
-## Running Tests
-
-### Install test dependencies:
-```bash
-# Using extras
-uv pip install -e ".[test]"
-```
-
-### Run all tests:
-```bash
-pytest tests/
-
-# Or with coverage
-pytest tests/ --cov=leann --cov-report=html
-
-# Run in parallel (faster)
-pytest tests/ -n auto
-```
-
-### Run specific tests:
-```bash
-# Only basic tests
-pytest tests/test_basic.py
-
-# Only tests that don't require OpenAI
-pytest tests/ -m "not openai"
-
-# Skip slow tests
-pytest tests/ -m "not slow"
-```
-
-### Run with specific backend:
-```bash
-# Test only HNSW backend
-pytest tests/test_basic.py::test_backend_basic[hnsw]
-
-# Test only DiskANN backend
-pytest tests/test_basic.py::test_backend_basic[diskann]
-```
-
-## CI/CD Integration
-
-Tests are automatically run in GitHub Actions:
-1. After building wheel packages
-2. On multiple Python versions (3.9 - 3.13)
-3. On both Ubuntu and macOS
-4. Using pytest with appropriate markers and flags
-
-### pytest.ini Configuration
-
-The `pytest.ini` file configures:
-- Test discovery paths
-- Default timeout (600 seconds)
-- Environment variables (HF_HUB_DISABLE_SYMLINKS, TOKENIZERS_PARALLELISM)
-- Custom markers for slow and OpenAI tests
-- Verbose output with short tracebacks
-
-### Known Issues
-
-- OpenAI tests are automatically skipped if no API key is provided
````

(deleted file: `tests/test_basic.py`, per the tests README above; the body of `test_imports` appears truncated in this view)

```diff
@@ -1,92 +0,0 @@
-"""
-Basic functionality tests for CI pipeline using pytest.
-"""
-
-import os
-import tempfile
-from pathlib import Path
-
-import pytest
-
-
-def test_imports():
-    """Test that all packages can be imported."""
-
-    # Test C++ extensions
-
-
-@pytest.mark.skipif(
-    os.environ.get("CI") == "true", reason="Skip model tests in CI to avoid MPS memory issues"
-)
-@pytest.mark.parametrize("backend_name", ["hnsw", "diskann"])
-def test_backend_basic(backend_name):
-    """Test basic functionality for each backend."""
-    from leann.api import LeannBuilder, LeannSearcher, SearchResult
-
-    # Create temporary directory for index
-    with tempfile.TemporaryDirectory() as temp_dir:
-        index_path = str(Path(temp_dir) / f"test.{backend_name}")
-
-        # Test with small data
-        texts = [f"This is document {i} about topic {i % 5}" for i in range(100)]
-
-        # Configure builder based on backend
-        if backend_name == "hnsw":
-            builder = LeannBuilder(
-                backend_name="hnsw",
-                embedding_model="facebook/contriever",
-                embedding_mode="sentence-transformers",
-                M=16,
-                efConstruction=200,
-            )
-        else:  # diskann
-            builder = LeannBuilder(
-                backend_name="diskann",
-                embedding_model="facebook/contriever",
-                embedding_mode="sentence-transformers",
-                num_neighbors=32,
-                search_list_size=50,
-            )
-
-        # Add texts
-        for text in texts:
-            builder.add_text(text)
-
-        # Build index
-        builder.build_index(index_path)
-
-        # Test search
-        searcher = LeannSearcher(index_path)
-        results = searcher.search("document about topic 2", top_k=5)
-
-        # Verify results
-        assert len(results) > 0
-        assert isinstance(results[0], SearchResult)
-        assert "topic 2" in results[0].text or "document" in results[0].text
-
-
-@pytest.mark.skipif(
-    os.environ.get("CI") == "true", reason="Skip model tests in CI to avoid MPS memory issues"
-)
-def test_large_index():
-    """Test with larger dataset."""
-    from leann.api import LeannBuilder, LeannSearcher
-
-    with tempfile.TemporaryDirectory() as temp_dir:
-        index_path = str(Path(temp_dir) / "test_large.hnsw")
-        texts = [f"Document {i}: {' '.join([f'word{j}' for j in range(50)])}" for i in range(1000)]
-
-        builder = LeannBuilder(
-            backend_name="hnsw",
-            embedding_model="facebook/contriever",
-            embedding_mode="sentence-transformers",
-        )
-
-        for text in texts:
-            builder.add_text(text)
-
-        builder.build_index(index_path)
-
-        searcher = LeannSearcher(index_path)
-        results = searcher.search(["word10 word20"], top_k=10)
-        assert len(results[0]) == 10
```

(deleted file: a minimal CI test module; path not shown, and the import statements in `test_package_imports` appear truncated in this view)

```diff
@@ -1,49 +0,0 @@
-"""
-Minimal tests for CI that don't require model loading or significant memory.
-"""
-
-import subprocess
-import sys
-
-
-def test_package_imports():
-    """Test that all core packages can be imported."""
-    # Core package
-
-    # Backend packages
-
-    # Core modules
-
-    assert True  # If we get here, imports worked
-
-
-def test_cli_help():
-    """Test that CLI example shows help."""
-    result = subprocess.run(
-        [sys.executable, "examples/main_cli_example.py", "--help"], capture_output=True, text=True
-    )
-
-    assert result.returncode == 0
-    assert "usage:" in result.stdout.lower() or "usage:" in result.stderr.lower()
-    assert "--llm" in result.stdout or "--llm" in result.stderr
-
-
-def test_backend_registration():
-    """Test that backends are properly registered."""
-    from leann.api import get_registered_backends
-
-    backends = get_registered_backends()
-    assert "hnsw" in backends
-    assert "diskann" in backends
-
-
-def test_version_info():
-    """Test that packages have version information."""
-    import leann
-    import leann_backend_diskann
-    import leann_backend_hnsw
-
-    # Check that packages have __version__ or can be imported
-    assert hasattr(leann, "__version__") or True
-    assert hasattr(leann_backend_hnsw, "__version__") or True
-    assert hasattr(leann_backend_diskann, "__version__") or True
```

(deleted file: `tests/test_main_cli.py`, per the tests README above)

```diff
@@ -1,120 +0,0 @@
-"""
-Test main_cli_example functionality using pytest.
-"""
-
-import os
-import subprocess
-import sys
-import tempfile
-from pathlib import Path
-
-import pytest
-
-
-@pytest.fixture
-def test_data_dir():
-    """Return the path to test data directory."""
-    return Path("examples/data")
-
-
-@pytest.mark.skipif(
-    os.environ.get("CI") == "true", reason="Skip model tests in CI to avoid MPS memory issues"
-)
-def test_main_cli_simulated(test_data_dir):
-    """Test main_cli with simulated LLM."""
-    with tempfile.TemporaryDirectory() as temp_dir:
-        # Use a subdirectory that doesn't exist yet to force index creation
-        index_dir = Path(temp_dir) / "test_index"
-        cmd = [
-            sys.executable,
-            "examples/main_cli_example.py",
-            "--llm",
-            "simulated",
-            "--embedding-model",
-            "facebook/contriever",
-            "--embedding-mode",
-            "sentence-transformers",
-            "--index-dir",
-            str(index_dir),
-            "--data-dir",
-            str(test_data_dir),
-            "--query",
-            "What is Pride and Prejudice about?",
-        ]
-
-        env = os.environ.copy()
-        env["HF_HUB_DISABLE_SYMLINKS"] = "1"
-        env["TOKENIZERS_PARALLELISM"] = "false"
-
-        result = subprocess.run(cmd, capture_output=True, text=True, timeout=600, env=env)
-
-        # Check return code
-        assert result.returncode == 0, f"Command failed: {result.stderr}"
-
-        # Verify output
-        output = result.stdout + result.stderr
-        assert "Leann index built at" in output or "Using existing index" in output
-        assert "This is a simulated answer" in output
-
-
-@pytest.mark.skipif(not os.environ.get("OPENAI_API_KEY"), reason="OpenAI API key not available")
-def test_main_cli_openai(test_data_dir):
-    """Test main_cli with OpenAI embeddings."""
-    with tempfile.TemporaryDirectory() as temp_dir:
-        # Use a subdirectory that doesn't exist yet to force index creation
-        index_dir = Path(temp_dir) / "test_index_openai"
-        cmd = [
-            sys.executable,
-            "examples/main_cli_example.py",
-            "--llm",
-            "simulated",  # Use simulated LLM to avoid GPT-4 costs
-            "--embedding-model",
-            "text-embedding-3-small",
-            "--embedding-mode",
-            "openai",
-            "--index-dir",
-            str(index_dir),
-            "--data-dir",
-            str(test_data_dir),
-            "--query",
-            "What is Pride and Prejudice about?",
-        ]
-
-        env = os.environ.copy()
-        env["TOKENIZERS_PARALLELISM"] = "false"
-
-        result = subprocess.run(cmd, capture_output=True, text=True, timeout=600, env=env)
-
-        assert result.returncode == 0, f"Command failed: {result.stderr}"
-
-        # Verify cosine distance was used
-        output = result.stdout + result.stderr
-        assert any(
-            msg in output
-            for msg in [
-                "distance_metric='cosine'",
-                "Automatically setting distance_metric='cosine'",
-                "Using cosine distance",
-            ]
-        )
-
-
-def test_main_cli_error_handling(test_data_dir):
-    """Test main_cli with invalid parameters."""
-    with tempfile.TemporaryDirectory() as temp_dir:
-        cmd = [
-            sys.executable,
-            "examples/main_cli_example.py",
-            "--llm",
-            "invalid_llm_type",
-            "--index-dir",
-            temp_dir,
-            "--data-dir",
-            str(test_data_dir),
-        ]
-
-        result = subprocess.run(cmd, capture_output=True, text=True, timeout=60)
-
-        # Should fail with invalid LLM type
-        assert result.returncode != 0
-        assert "Unknown LLM type" in result.stderr or "invalid_llm_type" in result.stderr
```

(deleted file: `tests/test_readme_examples.py`, per the tests README above)

```diff
@@ -1,165 +0,0 @@
-"""
-Test examples from README.md to ensure documentation is accurate.
-"""
-
-import os
-import platform
-import tempfile
-from pathlib import Path
-
-import pytest
-
-
-def test_readme_basic_example():
-    """Test the basic example from README.md."""
-    # Skip on macOS CI due to MPS environment issues with all-MiniLM-L6-v2
-    if os.environ.get("CI") == "true" and platform.system() == "Darwin":
-        pytest.skip("Skipping on macOS CI due to MPS environment issues with all-MiniLM-L6-v2")
-
-    # This is the exact code from README (with smaller model for CI)
-    from leann import LeannBuilder, LeannChat, LeannSearcher
-    from leann.api import SearchResult
-
-    with tempfile.TemporaryDirectory() as temp_dir:
-        INDEX_PATH = str(Path(temp_dir) / "demo.leann")
-
-        # Build an index
-        # In CI, use a smaller model to avoid memory issues
-        if os.environ.get("CI") == "true":
-            builder = LeannBuilder(
-                backend_name="hnsw",
-                embedding_model="sentence-transformers/all-MiniLM-L6-v2",  # Smaller model
-                dimensions=384,  # Smaller dimensions
-            )
-        else:
-            builder = LeannBuilder(backend_name="hnsw")
-        builder.add_text("LEANN saves 97% storage compared to traditional vector databases.")
-        builder.add_text("Tung Tung Tung Sahur called—they need their banana-crocodile hybrid back")
-        builder.build_index(INDEX_PATH)
-
-        # Verify index was created
-        # The index path should be a directory containing index files
-        index_dir = Path(INDEX_PATH).parent
-        assert index_dir.exists()
-        # Check that index files were created
-        index_files = list(index_dir.glob(f"{Path(INDEX_PATH).stem}.*"))
-        assert len(index_files) > 0
-
-        # Search
-        searcher = LeannSearcher(INDEX_PATH)
-        results = searcher.search("fantastical AI-generated creatures", top_k=1)
-
-        # Verify search results
-        assert len(results) > 0
-        assert isinstance(results[0], SearchResult)
-        # The second text about banana-crocodile should be more relevant
-        assert "banana" in results[0].text or "crocodile" in results[0].text
-
-        # Chat with your data (using simulated LLM to avoid external dependencies)
-        chat = LeannChat(INDEX_PATH, llm_config={"type": "simulated"})
-        response = chat.ask("How much storage does LEANN save?", top_k=1)
-
-        # Verify chat works
-        assert isinstance(response, str)
-        assert len(response) > 0
-
-
-def test_readme_imports():
-    """Test that the imports shown in README work correctly."""
-    # These are the imports shown in README
-    from leann import LeannBuilder, LeannChat, LeannSearcher
-
-    # Verify they are the correct types
-    assert callable(LeannBuilder)
-    assert callable(LeannSearcher)
-    assert callable(LeannChat)
-
-
-def test_backend_options():
-    """Test different backend options mentioned in documentation."""
-    # Skip on macOS CI due to MPS environment issues with all-MiniLM-L6-v2
-    if os.environ.get("CI") == "true" and platform.system() == "Darwin":
-        pytest.skip("Skipping on macOS CI due to MPS environment issues with all-MiniLM-L6-v2")
-
-    from leann import LeannBuilder
-
-    with tempfile.TemporaryDirectory() as temp_dir:
-        # Use smaller model in CI to avoid memory issues
-        if os.environ.get("CI") == "true":
-            model_args = {
-                "embedding_model": "sentence-transformers/all-MiniLM-L6-v2",
-                "dimensions": 384,
-            }
-        else:
-            model_args = {}
-
-        # Test HNSW backend (as shown in README)
-        hnsw_path = str(Path(temp_dir) / "test_hnsw.leann")
-        builder_hnsw = LeannBuilder(backend_name="hnsw", **model_args)
-        builder_hnsw.add_text("Test document for HNSW backend")
-        builder_hnsw.build_index(hnsw_path)
-        assert Path(hnsw_path).parent.exists()
-        assert len(list(Path(hnsw_path).parent.glob(f"{Path(hnsw_path).stem}.*"))) > 0
-
-        # Test DiskANN backend (mentioned as available option)
-        diskann_path = str(Path(temp_dir) / "test_diskann.leann")
-        builder_diskann = LeannBuilder(backend_name="diskann", **model_args)
-        builder_diskann.add_text("Test document for DiskANN backend")
-        builder_diskann.build_index(diskann_path)
-        assert Path(diskann_path).parent.exists()
-        assert len(list(Path(diskann_path).parent.glob(f"{Path(diskann_path).stem}.*"))) > 0
-
-
-def test_llm_config_simulated():
-    """Test simulated LLM configuration option."""
-    # Skip on macOS CI due to MPS environment issues with all-MiniLM-L6-v2
-    if os.environ.get("CI") == "true" and platform.system() == "Darwin":
-        pytest.skip("Skipping on macOS CI due to MPS environment issues with all-MiniLM-L6-v2")
-
-    from leann import LeannBuilder, LeannChat
-
-    with tempfile.TemporaryDirectory() as temp_dir:
-        # Build a simple index
-        index_path = str(Path(temp_dir) / "test.leann")
-        # Use smaller model in CI to avoid memory issues
-        if os.environ.get("CI") == "true":
-            builder = LeannBuilder(
-                backend_name="hnsw",
-                embedding_model="sentence-transformers/all-MiniLM-L6-v2",
-                dimensions=384,
-            )
-        else:
-            builder = LeannBuilder(backend_name="hnsw")
-        builder.add_text("Test document for LLM testing")
-        builder.build_index(index_path)
-
-        # Test simulated LLM config
-        llm_config = {"type": "simulated"}
-        chat = LeannChat(index_path, llm_config=llm_config)
-        response = chat.ask("What is this document about?", top_k=1)
-
-        assert isinstance(response, str)
-        assert len(response) > 0
-
-
-@pytest.mark.skip(reason="Requires HF model download and may timeout")
-def test_llm_config_hf():
-    """Test HuggingFace LLM configuration option."""
-    from leann import LeannBuilder, LeannChat
-
-    pytest.importorskip("transformers")  # Skip if transformers not installed
-
-    with tempfile.TemporaryDirectory() as temp_dir:
-        # Build a simple index
-        index_path = str(Path(temp_dir) / "test.leann")
-        builder = LeannBuilder(backend_name="hnsw")
-        builder.add_text("Test document for LLM testing")
-        builder.build_index(index_path)
-
-        # Test HF LLM config
-        llm_config = {"type": "hf", "model": "Qwen/Qwen3-0.6B"}
-        chat = LeannChat(index_path, llm_config=llm_config)
-        response = chat.ask("What is this document about?", top_k=1)
-
-        assert isinstance(response, str)
-        assert len(response) > 0
```