Compare commits
6 Commits
feat/diska ... fix/openai

| Author | SHA1 | Date |
|---|---|---|
|  | 4b4b825fec |  |
|  | 34ef0db42f |  |
|  | 41812c7d22 |  |
|  | 2047a1a128 |  |
|  | 402e8f97ad |  |
|  | 9a5c197acd |  |

61 .github/workflows/build-reusable.yml (vendored)

```diff
@@ -97,8 +97,7 @@ jobs:
       - name: Install system dependencies (macOS)
         if: runner.os == 'macOS'
         run: |
-          # Don't install LLVM, use system clang for better compatibility
-          brew install libomp boost protobuf zeromq
+          brew install llvm libomp boost protobuf zeromq

       - name: Install build dependencies
         run: |
@@ -121,11 +120,7 @@ jobs:
           # Build HNSW backend
           cd packages/leann-backend-hnsw
           if [ "${{ matrix.os }}" == "macos-latest" ]; then
-            # Use system clang instead of homebrew LLVM for better compatibility
-            export CC=clang
-            export CXX=clang++
-            export MACOSX_DEPLOYMENT_TARGET=11.0
-            uv build --wheel --python python
+            CC=$(brew --prefix llvm)/bin/clang CXX=$(brew --prefix llvm)/bin/clang++ uv build --wheel --python python
           else
             uv build --wheel --python python
           fi
@@ -134,12 +129,7 @@ jobs:
           # Build DiskANN backend
           cd packages/leann-backend-diskann
           if [ "${{ matrix.os }}" == "macos-latest" ]; then
-            # Use system clang instead of homebrew LLVM for better compatibility
-            export CC=clang
-            export CXX=clang++
-            # DiskANN requires macOS 13.3+ for sgesdd_ LAPACK function
-            export MACOSX_DEPLOYMENT_TARGET=13.3
-            uv build --wheel --python python
+            CC=$(brew --prefix llvm)/bin/clang CXX=$(brew --prefix llvm)/bin/clang++ uv build --wheel --python python
           else
             uv build --wheel --python python
           fi
@@ -199,51 +189,6 @@ jobs:
           echo "📦 Built packages:"
           find packages/*/dist -name "*.whl" -o -name "*.tar.gz" | sort

-      - name: Install built packages for testing
-        run: |
-          # Create a virtual environment
-          uv venv
-          source .venv/bin/activate || source .venv/Scripts/activate
-
-          # Install the built wheels
-          # Use --find-links to let uv choose the correct wheel for the platform
-          if [[ "${{ matrix.os }}" == ubuntu-* ]]; then
-            uv pip install leann-core --find-links packages/leann-core/dist
-            uv pip install leann --find-links packages/leann/dist
-          fi
-          uv pip install leann-backend-hnsw --find-links packages/leann-backend-hnsw/dist
-          uv pip install leann-backend-diskann --find-links packages/leann-backend-diskann/dist
-
-          # Install test dependencies using extras
-          uv pip install -e ".[test]"
-
-      - name: Run tests with pytest
-        env:
-          CI: true # Mark as CI environment to skip memory-intensive tests
-          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
-          HF_HUB_DISABLE_SYMLINKS: 1
-          TOKENIZERS_PARALLELISM: false
-          PYTORCH_ENABLE_MPS_FALLBACK: 0 # Disable MPS on macOS CI to avoid memory issues
-          OMP_NUM_THREADS: 1 # Disable OpenMP parallelism to avoid libomp crashes
-          MKL_NUM_THREADS: 1 # Single thread for MKL operations
-        run: |
-          # Activate virtual environment
-          source .venv/bin/activate || source .venv/Scripts/activate
-
-          # Run all tests
-          pytest tests/
-
-      - name: Run sanity checks (optional)
-        run: |
-          # Activate virtual environment
-          source .venv/bin/activate || source .venv/Scripts/activate
-
-          # Run distance function tests if available
-          if [ -f test/sanity_checks/test_distance_functions.py ]; then
-            echo "Running distance function sanity checks..."
-            python test/sanity_checks/test_distance_functions.py || echo "⚠️ Distance function test failed, continuing..."
-          fi
-
       - name: Upload artifacts
         uses: actions/upload-artifact@v4
         with:
```

2 .gitignore (vendored)

```diff
@@ -86,5 +86,3 @@ packages/leann-backend-diskann/third_party/DiskANN/_deps/
 *.passages.json

 batchtest.py
-tests/__pytest_cache__/
-tests/__pycache__/
```

29 README.md

````diff
@@ -174,28 +174,15 @@ Ask questions directly about your personal PDFs, documents, and any directory co
 <img src="videos/paper_clear.gif" alt="LEANN Document Search Demo" width="600">
 </p>

-The example below asks a question about summarizing two papers (uses default data in `examples/data`) and this is the easiest example to run here:
+The example below asks a question about summarizing two papers (uses default data in `examples/data`):

-```bash
+```
+# Or use python directly
 source .venv/bin/activate
 python ./examples/main_cli_example.py
 ```

-<details>
-<summary><strong>📋 Click to expand: User Configurable Arguments</strong></summary>
-
-```bash
-# Use custom index directory
-python examples/main_cli_example.py --index-dir "./my_custom_index"
-
-# Use custom data directory
-python examples/main_cli_example.py --data-dir "./my_documents"
-
-# Ask a specific question
-python examples/main_cli_example.py --query "What are the main findings in these papers?"
-```
-
-</details>
-
 ### 📧 Your Personal Email Secretary: RAG on Apple Mail!

@@ -208,12 +195,12 @@ python examples/main_cli_example.py --query "What are the main findings in these

 **Note:** You need to grant full disk access to your terminal/VS Code in System Preferences → Privacy & Security → Full Disk Access.
 ```bash
-python examples/mail_reader_leann.py --query "What's the food I ordered by DoorDash or Uber Eats mostly?"
+python examples/mail_reader_leann.py --query "What's the food I ordered by doordash or Uber eat mostly?"
 ```
-**780K email chunks → 78MB storage.** Finally, search your email like you search Google.
+**780K email chunks → 78MB storage** Finally, search your email like you search Google.

 <details>
-<summary><strong>📋 Click to expand: User Configurable Arguments</strong></summary>
+<summary><strong>📋 Click to expand: Command Examples</strong></summary>

 ```bash
 # Use default mail path (works for most macOS setups)
@@ -255,7 +242,7 @@ python examples/google_history_reader_leann.py --query "Tell me my browser histo
 **38K browser entries → 6MB storage.** Your browser history becomes your personal search engine.

 <details>
-<summary><strong>📋 Click to expand: User Configurable Arguments</strong></summary>
+<summary><strong>📋 Click to expand: Command Examples</strong></summary>

 ```bash
 # Use default Chrome profile (auto-finds all profiles)
@@ -332,7 +319,7 @@ Failed to find or export WeChat data. Exiting.
 </details>

 <details>
-<summary><strong>📋 Click to expand: User Configurable Arguments</strong></summary>
+<summary><strong>📋 Click to expand: Command Examples</strong></summary>

 ```bash
 # Use default settings (recommended for first run)
````

(deleted file: an embedding-comparison example; path not shown in this view)

```diff
@@ -1,98 +0,0 @@
-"""
-Comparison between Sentence Transformers and OpenAI embeddings
-
-This example shows how different embedding models handle complex queries
-and demonstrates the differences between local and API-based embeddings.
-"""
-
-import numpy as np
-from leann.embedding_compute import compute_embeddings
-
-# OpenAI API key should be set as environment variable
-# export OPENAI_API_KEY="your-api-key-here"
-
-# Test data
-conference_text = "[Title]: COLING 2025 Conference\n[URL]: https://coling2025.org/"
-browser_text = "[Title]: Browser Use Tool\n[URL]: https://github.com/browser-use"
-
-# Two queries with same intent but different wording
-query1 = "Tell me my browser history about some conference i often visit"
-query2 = "browser history about conference I often visit"
-
-texts = [query1, query2, conference_text, browser_text]
-
-
-def cosine_similarity(a, b):
-    return np.dot(a, b)  # Already normalized
-
-
-def analyze_embeddings(embeddings, model_name):
-    print(f"\n=== {model_name} Results ===")
-
-    # Results for Query 1
-    sim1_conf = cosine_similarity(embeddings[0], embeddings[2])
-    sim1_browser = cosine_similarity(embeddings[0], embeddings[3])
-
-    print(f"Query 1: '{query1}'")
-    print(f" → Conference similarity: {sim1_conf:.4f} {'✓' if sim1_conf > sim1_browser else ''}")
-    print(
-        f" → Browser similarity: {sim1_browser:.4f} {'✓' if sim1_browser > sim1_conf else ''}"
-    )
-    print(f" Winner: {'Conference' if sim1_conf > sim1_browser else 'Browser'}")
-
-    # Results for Query 2
-    sim2_conf = cosine_similarity(embeddings[1], embeddings[2])
-    sim2_browser = cosine_similarity(embeddings[1], embeddings[3])
-
-    print(f"\nQuery 2: '{query2}'")
-    print(f" → Conference similarity: {sim2_conf:.4f} {'✓' if sim2_conf > sim2_browser else ''}")
-    print(
-        f" → Browser similarity: {sim2_browser:.4f} {'✓' if sim2_browser > sim2_conf else ''}"
-    )
-    print(f" Winner: {'Conference' if sim2_conf > sim2_browser else 'Browser'}")
-
-    # Show the impact
-    print("\n=== Impact Analysis ===")
-    print(f"Conference similarity change: {sim2_conf - sim1_conf:+.4f}")
-    print(f"Browser similarity change: {sim2_browser - sim1_browser:+.4f}")
-
-    if sim1_conf > sim1_browser and sim2_browser > sim2_conf:
-        print("❌ FLIP: Adding 'browser history' flips winner from Conference to Browser!")
-    elif sim1_conf > sim1_browser and sim2_conf > sim2_browser:
-        print("✅ STABLE: Conference remains winner in both queries")
-    elif sim1_browser > sim1_conf and sim2_browser > sim2_conf:
-        print("✅ STABLE: Browser remains winner in both queries")
-    else:
-        print("🔄 MIXED: Results vary between queries")
-
-    return {
-        "query1_conf": sim1_conf,
-        "query1_browser": sim1_browser,
-        "query2_conf": sim2_conf,
-        "query2_browser": sim2_browser,
-    }
-
-
-# Test Sentence Transformers
-print("Testing Sentence Transformers (facebook/contriever)...")
-try:
-    st_embeddings = compute_embeddings(texts, "facebook/contriever", mode="sentence-transformers")
-    st_results = analyze_embeddings(st_embeddings, "Sentence Transformers (facebook/contriever)")
-except Exception as e:
-    print(f"❌ Sentence Transformers failed: {e}")
-    st_results = None
-
-# Test OpenAI
-print("\n" + "=" * 60)
-print("Testing OpenAI (text-embedding-3-small)...")
-try:
-    openai_embeddings = compute_embeddings(texts, "text-embedding-3-small", mode="openai")
-    openai_results = analyze_embeddings(openai_embeddings, "OpenAI (text-embedding-3-small)")
-except Exception as e:
-    print(f"❌ OpenAI failed: {e}")
-    openai_results = None
-
-# Compare results
-if st_results and openai_results:
-    print("\n" + "=" * 60)
-    print("=== COMPARISON SUMMARY ===")
```

```diff
@@ -64,19 +64,9 @@ async def main(args):
     print("\n[PHASE 2] Starting Leann chat session...")

-    # Build llm_config based on command line arguments
-    if args.llm == "simulated":
-        llm_config = {"type": "simulated"}
-    elif args.llm == "ollama":
-        llm_config = {"type": "ollama", "model": args.model, "host": args.host}
-    elif args.llm == "hf":
-        llm_config = {"type": "hf", "model": args.model}
-    elif args.llm == "openai":
-        llm_config = {"type": "openai", "model": args.model}
-    else:
-        raise ValueError(f"Unknown LLM type: {args.llm}")
-
-    print(f"Using LLM: {args.llm} with model: {args.model if args.llm != 'simulated' else 'N/A'}")
-
+    llm_config = {"type": "hf", "model": "Qwen/Qwen3-4B"}
+    llm_config = {"type": "ollama", "model": "qwen3:8b"}
+    llm_config = {"type": "openai", "model": "gpt-4o"}
     chat = LeannChat(index_path=INDEX_PATH, llm_config=llm_config)
     # query = (
@@ -94,14 +84,14 @@ if __name__ == "__main__":
     parser.add_argument(
         "--llm",
         type=str,
-        default="openai",
+        default="hf",
         choices=["simulated", "ollama", "hf", "openai"],
         help="The LLM backend to use.",
     )
     parser.add_argument(
         "--model",
         type=str,
-        default="gpt-4o",
+        default="Qwen/Qwen3-0.6B",
        help="The model name to use (e.g., 'llama3:8b' for ollama, 'deepseek-ai/deepseek-llm-7b-chat' for hf, 'gpt-4o' for openai).",
     )
     parser.add_argument(
```
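
If the replacement lines on the new side land exactly as rendered here (none commented out), only the final assignment survives; the first two are dead stores, so the chat is effectively pinned to OpenAI regardless of `--llm`. A minimal illustration:

```python
# Successive rebindings: each assignment replaces the previous dict.
llm_config = {"type": "hf", "model": "Qwen/Qwen3-4B"}
llm_config = {"type": "ollama", "model": "qwen3:8b"}
llm_config = {"type": "openai", "model": "gpt-4o"}
assert llm_config == {"type": "openai", "model": "gpt-4o"}
```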

```diff
@@ -7,7 +7,6 @@ from pathlib import Path
 from typing import Any, Literal

 import numpy as np
-import psutil
 from leann.interface import (
     LeannBackendBuilderInterface,
     LeannBackendFactoryInterface,
@@ -85,43 +84,6 @@ def _write_vectors_to_bin(data: np.ndarray, file_path: Path):
         f.write(data.tobytes())


-def _calculate_smart_memory_config(data: np.ndarray) -> tuple[float, float]:
-    """
-    Calculate smart memory configuration for DiskANN based on data size and system specs.
-
-    Args:
-        data: The embedding data array
-
-    Returns:
-        tuple: (search_memory_maximum, build_memory_maximum) in GB
-    """
-    num_vectors, dim = data.shape
-
-    # Calculate embedding storage size
-    embedding_size_bytes = num_vectors * dim * 4  # float32 = 4 bytes
-    embedding_size_gb = embedding_size_bytes / (1024**3)
-
-    # search_memory_maximum: 1/10 of embedding size for optimal PQ compression
-    # This controls Product Quantization size - smaller means more compression
-    search_memory_gb = max(0.1, embedding_size_gb / 10)  # At least 100MB
-
-    # build_memory_maximum: Based on available system RAM for sharding control
-    # This controls how much memory DiskANN uses during index construction
-    available_memory_gb = psutil.virtual_memory().available / (1024**3)
-    total_memory_gb = psutil.virtual_memory().total / (1024**3)
-
-    # Use 50% of available memory, but at least 2GB and at most 75% of total
-    build_memory_gb = max(2.0, min(available_memory_gb * 0.5, total_memory_gb * 0.75))
-
-    logger.info(
-        f"Smart memory config - Data: {embedding_size_gb:.2f}GB, "
-        f"Search mem: {search_memory_gb:.2f}GB (PQ control), "
-        f"Build mem: {build_memory_gb:.2f}GB (sharding control)"
-    )
-
-    return search_memory_gb, build_memory_gb
-
-
 @register_backend("diskann")
 class DiskannBackend(LeannBackendFactoryInterface):
     @staticmethod
```
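
For reference, the heuristic the deleted helper implemented, worked through on illustrative numbers (my example, not data from the repo):

```python
# 1M vectors of dimension 768 stored as float32 (4 bytes per value).
num_vectors, dim = 1_000_000, 768
embedding_size_gb = num_vectors * dim * 4 / (1024**3)  # ≈ 2.86 GB of raw embeddings
search_memory_gb = max(0.1, embedding_size_gb / 10)    # ≈ 0.29 GB PQ budget, 100 MB floor
```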

```diff
@@ -159,16 +121,6 @@ class DiskannBuilder(LeannBackendBuilderInterface):
                 f"Unsupported distance_metric '{build_kwargs.get('distance_metric', 'unknown')}'."
             )

-        # Calculate smart memory configuration if not explicitly provided
-        if (
-            "search_memory_maximum" not in build_kwargs
-            or "build_memory_maximum" not in build_kwargs
-        ):
-            smart_search_mem, smart_build_mem = _calculate_smart_memory_config(data)
-        else:
-            smart_search_mem = build_kwargs.get("search_memory_maximum", 4.0)
-            smart_build_mem = build_kwargs.get("build_memory_maximum", 8.0)
-
         try:
             from . import _diskannpy as diskannpy  # type: ignore

@@ -179,8 +131,8 @@ class DiskannBuilder(LeannBackendBuilderInterface):
             index_prefix,
             build_kwargs.get("complexity", 64),
             build_kwargs.get("graph_degree", 32),
-            build_kwargs.get("search_memory_maximum", smart_search_mem),
-            build_kwargs.get("build_memory_maximum", smart_build_mem),
+            build_kwargs.get("search_memory_maximum", 4.0),
+            build_kwargs.get("build_memory_maximum", 8.0),
             build_kwargs.get("num_threads", 8),
             build_kwargs.get("pq_disk_bytes", 0),
             "",
```
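
With the adaptive defaults gone, builds that need other limits must pass them explicitly. A sketch under the assumption (suggested by the test code later in this diff) that `LeannBuilder` forwards extra keyword arguments to the backend as `build_kwargs`:

```python
from leann.api import LeannBuilder

# Values are in GB, mirroring the fixed defaults in the hunk above.
builder = LeannBuilder(
    backend_name="diskann",
    search_memory_maximum=4.0,  # search-time (PQ) memory budget
    build_memory_maximum=8.0,   # construction-time memory budget
)
```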

```diff
@@ -36,7 +36,6 @@ def create_diskann_embedding_server(
     zmq_port: int = 5555,
     model_name: str = "sentence-transformers/all-mpnet-base-v2",
     embedding_mode: str = "sentence-transformers",
-    distance_metric: str = "l2",
 ):
     """
     Create and start a ZMQ-based embedding server for DiskANN backend.
@@ -264,13 +263,6 @@ if __name__ == "__main__":
         choices=["sentence-transformers", "openai", "mlx"],
         help="Embedding backend mode",
     )
-    parser.add_argument(
-        "--distance-metric",
-        type=str,
-        default="l2",
-        choices=["l2", "mips", "cosine"],
-        help="Distance metric for similarity computation",
-    )

     args = parser.parse_args()
@@ -280,5 +272,4 @@ if __name__ == "__main__":
         zmq_port=args.zmq_port,
         model_name=args.model_name,
         embedding_mode=args.embedding_mode,
-        distance_metric=args.distance_metric,
     )
```

```diff
@@ -4,8 +4,8 @@ build-backend = "scikit_build_core.build"

 [project]
 name = "leann-backend-diskann"
-version = "0.1.16"
-dependencies = ["leann-core==0.1.16", "numpy", "protobuf>=3.19.0"]
+version = "0.1.15"
+dependencies = ["leann-core==0.1.15", "numpy", "protobuf>=3.19.0"]

 [tool.scikit-build]
 # Key: simplified CMake path
```
Submodule packages/leann-backend-diskann/third_party/DiskANN updated: 67a2611ad1...af2a26481e
```diff
@@ -10,14 +10,6 @@ if(APPLE)
   set(OpenMP_C_LIB_NAMES "omp")
   set(OpenMP_CXX_LIB_NAMES "omp")
   set(OpenMP_omp_LIBRARY "/opt/homebrew/opt/libomp/lib/libomp.dylib")
-
-  # Force use of system libc++ to avoid version mismatch
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -stdlib=libc++")
-  set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -stdlib=libc++")
-  set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -stdlib=libc++")
-
-  # Set minimum macOS version for better compatibility
-  set(CMAKE_OSX_DEPLOYMENT_TARGET "11.0" CACHE STRING "Minimum macOS version")
 endif()

 # Use system ZeroMQ instead of building from source
```

```diff
@@ -6,10 +6,10 @@ build-backend = "scikit_build_core.build"

 [project]
 name = "leann-backend-hnsw"
-version = "0.1.16"
+version = "0.1.15"
 description = "Custom-built HNSW (Faiss) backend for the Leann toolkit."
 dependencies = [
-    "leann-core==0.1.16",
+    "leann-core==0.1.15",
     "numpy",
     "pyzmq>=23.0.0",
     "msgpack>=1.0.0",
```

```diff
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

 [project]
 name = "leann-core"
-version = "0.1.16"
+version = "0.1.15"
 description = "Core API and plugin system for LEANN"
 readme = "README.md"
 requires-python = ">=3.9"
```

```diff
@@ -8,10 +8,6 @@ if platform.system() == "Darwin":
     os.environ["MKL_NUM_THREADS"] = "1"
     os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
     os.environ["KMP_BLOCKTIME"] = "0"
-    # Additional fixes for PyTorch/sentence-transformers on macOS ARM64 only in CI
-    if os.environ.get("CI") == "true":
-        os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "0"
-        os.environ["TOKENIZERS_PARALLELISM"] = "false"

 from .api import LeannBuilder, LeannChat, LeannSearcher
 from .registry import BACKEND_REGISTRY, autodiscover_backends
```

```diff
@@ -23,11 +23,6 @@ from .registry import BACKEND_REGISTRY
 logger = logging.getLogger(__name__)


-def get_registered_backends() -> list[str]:
-    """Get list of registered backend names."""
-    return list(BACKEND_REGISTRY.keys())
-
-
 def compute_embeddings(
     chunks: list[str],
     model_name: str,
```
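
Callers that used the removed helper can query the registry directly; this one-liner is the deleted function's own body (the absolute import path is my assumption, inferred from the relative import in the hunk header):

```python
from leann.registry import BACKEND_REGISTRY

# Equivalent to the removed get_registered_backends() helper.
backends = list(BACKEND_REGISTRY.keys())  # e.g. ["hnsw", "diskann"] once backends register
```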

```diff
@@ -542,41 +542,14 @@ class HFChat(LLMInterface):
             self.device = "cpu"
             logger.info("No GPU detected. Using CPU.")

-        # Load tokenizer and model with timeout protection
-        try:
-            import signal
-
-            def timeout_handler(signum, frame):
-                raise TimeoutError("Model download/loading timed out")
-
-            # Set timeout for model loading (60 seconds)
-            old_handler = signal.signal(signal.SIGALRM, timeout_handler)
-            signal.alarm(60)
-
-            try:
-                logger.info(f"Loading tokenizer for {model_name}...")
-                self.tokenizer = AutoTokenizer.from_pretrained(model_name)
-
-                logger.info(f"Loading model {model_name}...")
-                self.model = AutoModelForCausalLM.from_pretrained(
-                    model_name,
-                    torch_dtype=torch.float16 if self.device != "cpu" else torch.float32,
-                    device_map="auto" if self.device != "cpu" else None,
-                    trust_remote_code=True,
-                )
-                logger.info(f"Successfully loaded {model_name}")
-            finally:
-                signal.alarm(0)  # Cancel the alarm
-                signal.signal(signal.SIGALRM, old_handler)  # Restore old handler
-
-        except TimeoutError:
-            logger.error(f"Model loading timed out for {model_name}")
-            raise RuntimeError(
-                f"Model loading timed out for {model_name}. Please check your internet connection or try a smaller model."
-            )
-        except Exception as e:
-            logger.error(f"Failed to load model {model_name}: {e}")
-            raise
-
+        # Load tokenizer and model
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+        self.model = AutoModelForCausalLM.from_pretrained(
+            model_name,
+            torch_dtype=torch.float16 if self.device != "cpu" else torch.float32,
+            device_map="auto" if self.device != "cpu" else None,
+            trust_remote_code=True,
+        )
         # Move model to device if not using device_map
         if self.device != "cpu" and "device_map" not in str(self.model):
```
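
The removed `SIGALRM` guard only works on the main thread of Unix processes, which is presumably why it was dropped. If a load timeout is still wanted, a portable sketch (my suggestion, not code from this PR) runs the load in a worker thread:

```python
from concurrent.futures import ThreadPoolExecutor
from transformers import AutoModelForCausalLM, AutoTokenizer

def load_with_timeout(model_name: str, seconds: float = 60.0):
    """Load tokenizer and model, raising if either step exceeds `seconds`."""
    pool = ThreadPoolExecutor(max_workers=1)
    try:
        tokenizer = pool.submit(AutoTokenizer.from_pretrained, model_name).result(timeout=seconds)
        model = pool.submit(AutoModelForCausalLM.from_pretrained, model_name).result(timeout=seconds)
        return tokenizer, model
    finally:
        pool.shutdown(wait=False)  # don't block on a download that is already stuck
```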

```diff
@@ -354,21 +354,13 @@ class EmbeddingServerManager:
             self.server_process.terminate()

             try:
-                self.server_process.wait(timeout=3)
+                self.server_process.wait(timeout=5)
                 logger.info(f"Server process {self.server_process.pid} terminated.")
             except subprocess.TimeoutExpired:
                 logger.warning(
-                    f"Server process {self.server_process.pid} did not terminate gracefully within 3 seconds, killing it."
+                    f"Server process {self.server_process.pid} did not terminate gracefully, killing it."
                 )
                 self.server_process.kill()
-                try:
-                    self.server_process.wait(timeout=2)
-                    logger.info(f"Server process {self.server_process.pid} killed successfully.")
-                except subprocess.TimeoutExpired:
-                    logger.error(
-                        f"Failed to kill server process {self.server_process.pid} - it may be hung"
-                    )
-                # Don't hang indefinitely

             # Clean up process resources to prevent resource tracker warnings
             try:
```
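
The escalation this hunk trims down is the standard terminate-then-kill pattern; a generic sketch for any `subprocess.Popen` (not tied to this class):

```python
import subprocess

def stop_process(proc: subprocess.Popen, grace: float = 5.0) -> None:
    """Ask the process to exit, then force-kill it after a grace period."""
    proc.terminate()              # polite request (SIGTERM)
    try:
        proc.wait(timeout=grace)  # give it time to shut down cleanly
    except subprocess.TimeoutExpired:
        proc.kill()               # escalate (SIGKILL)
        proc.wait()               # reap so no zombie is left behind
```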

````diff
@@ -5,8 +5,11 @@ LEANN is a revolutionary vector database that democratizes personal AI. Transfor
 ## Installation

 ```bash
-# Default installation (includes both HNSW and DiskANN backends)
+# Default installation (HNSW backend, recommended)
 uv pip install leann
+
+# With DiskANN backend (for large-scale deployments)
+uv pip install leann[diskann]
 ```

 ## Quick Start
@@ -16,8 +19,8 @@ from leann import LeannBuilder, LeannSearcher, LeannChat
 from pathlib import Path
 INDEX_PATH = str(Path("./").resolve() / "demo.leann")

-# Build an index (choose backend: "hnsw" or "diskann")
-builder = LeannBuilder(backend_name="hnsw")  # or "diskann" for large-scale deployments
+# Build an index
+builder = LeannBuilder(backend_name="hnsw")
 builder.add_text("LEANN saves 97% storage compared to traditional vector databases.")
 builder.add_text("Tung Tung Tung Sahur called—they need their banana‑crocodile hybrid back")
 builder.build_index(INDEX_PATH)
````
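
For context, the Quick Start built above continues into search and chat; this sketch is assembled from the `test_readme_examples.py` file deleted later in this diff, not from new API claims:

```python
from leann import LeannChat, LeannSearcher

searcher = LeannSearcher(INDEX_PATH)
results = searcher.search("fantastical AI-generated creatures", top_k=1)

chat = LeannChat(INDEX_PATH, llm_config={"type": "simulated"})
response = chat.ask("How much storage does LEANN save?", top_k=1)
```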

```diff
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

 [project]
 name = "leann"
-version = "0.1.16"
+version = "0.1.15"
 description = "LEANN - The smallest vector index in the world. RAG Everything with LEANN!"
 readme = "README.md"
 requires-python = ">=3.9"
@@ -24,15 +24,16 @@ classifiers = [
     "Programming Language :: Python :: 3.12",
 ]

-# Default installation: core + hnsw + diskann
+# Default installation: core + hnsw
 dependencies = [
     "leann-core>=0.1.0",
     "leann-backend-hnsw>=0.1.0",
-    "leann-backend-diskann>=0.1.0",
 ]

 [project.optional-dependencies]
-# All backends now included by default
+diskann = [
+    "leann-backend-diskann>=0.1.0",
+]

 [project.urls]
 Repository = "https://github.com/yichuan-w/LEANN"
```

```diff
@@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "leann-workspace"
 version = "0.1.0"
-requires-python = ">=3.9"
+requires-python = ">=3.10"

 dependencies = [
     "leann-core",
@@ -33,8 +33,8 @@ dependencies = [
     # LlamaIndex core and readers - updated versions
     "llama-index>=0.12.44",
     "llama-index-readers-file>=0.4.0",  # Essential for PDF parsing
-    # "llama-index-readers-docling", # Requires Python >= 3.10
-    # "llama-index-node-parser-docling", # Requires Python >= 3.10
+    "llama-index-readers-docling",
+    "llama-index-node-parser-docling",
     "llama-index-vector-stores-faiss>=0.4.0",
     "llama-index-embeddings-huggingface>=0.5.5",
     # Other dependencies
@@ -49,7 +49,6 @@ dependencies = [
 dev = [
     "pytest>=7.0",
     "pytest-cov>=4.0",
-    "pytest-xdist>=3.0",  # For parallel test execution
     "black>=23.0",
     "ruff>=0.1.0",
     "matplotlib",
@@ -57,15 +56,6 @@ dev = [
     "pre-commit>=3.5.0",
 ]

-test = [
-    "pytest>=7.0",
-    "pytest-timeout>=2.0",
-    "llama-index-core>=0.12.0",
-    "llama-index-readers-file>=0.4.0",
-    "python-dotenv>=1.0.0",
-    "sentence-transformers>=2.2.0",
-]
-
 diskann = [
     "leann-backend-diskann",
 ]
@@ -133,24 +123,3 @@ line-ending = "auto"
 dev = [
     "ruff>=0.12.4",
 ]
-
-[tool.pytest.ini_options]
-testpaths = ["tests"]
-python_files = ["test_*.py"]
-python_classes = ["Test*"]
-python_functions = ["test_*"]
-markers = [
-    "slow: marks tests as slow (deselect with '-m \"not slow\"')",
-    "openai: marks tests that require OpenAI API key",
-]
-timeout = 600
-addopts = [
-    "-v",
-    "--tb=short",
-    "--strict-markers",
-    "--disable-warnings",
-]
-env = [
-    "HF_HUB_DISABLE_SYMLINKS=1",
-    "TOKENIZERS_PARALLELISM=false",
-]
```

(deleted file: the tests directory README)

````diff
@@ -1,87 +0,0 @@
-# LEANN Tests
-
-This directory contains automated tests for the LEANN project using pytest.
-
-## Test Files
-
-### `test_readme_examples.py`
-Tests the examples shown in README.md:
-- The basic example code that users see first
-- Import statements work correctly
-- Different backend options (HNSW, DiskANN)
-- Different LLM configuration options
-
-### `test_basic.py`
-Basic functionality tests that verify:
-- All packages can be imported correctly
-- C++ extensions (FAISS, DiskANN) load properly
-- Basic index building and searching works for both HNSW and DiskANN backends
-- Uses parametrized tests to test both backends
-
-### `test_main_cli.py`
-Tests the main CLI example functionality:
-- Tests with facebook/contriever embeddings
-- Tests with OpenAI embeddings (if API key is available)
-- Tests error handling with invalid parameters
-- Verifies that normalized embeddings are detected and cosine distance is used
-
-## Running Tests
-
-### Install test dependencies:
-```bash
-# Using extras
-uv pip install -e ".[test]"
-```
-
-### Run all tests:
-```bash
-pytest tests/
-
-# Or with coverage
-pytest tests/ --cov=leann --cov-report=html
-
-# Run in parallel (faster)
-pytest tests/ -n auto
-```
-
-### Run specific tests:
-```bash
-# Only basic tests
-pytest tests/test_basic.py
-
-# Only tests that don't require OpenAI
-pytest tests/ -m "not openai"
-
-# Skip slow tests
-pytest tests/ -m "not slow"
-```
-
-### Run with specific backend:
-```bash
-# Test only HNSW backend
-pytest tests/test_basic.py::test_backend_basic[hnsw]
-
-# Test only DiskANN backend
-pytest tests/test_basic.py::test_backend_basic[diskann]
-```
-
-## CI/CD Integration
-
-Tests are automatically run in GitHub Actions:
-1. After building wheel packages
-2. On multiple Python versions (3.9 - 3.13)
-3. On both Ubuntu and macOS
-4. Using pytest with appropriate markers and flags
-
-### pytest.ini Configuration
-
-The `pytest.ini` file configures:
-- Test discovery paths
-- Default timeout (600 seconds)
-- Environment variables (HF_HUB_DISABLE_SYMLINKS, TOKENIZERS_PARALLELISM)
-- Custom markers for slow and OpenAI tests
-- Verbose output with short tracebacks
-
-### Known Issues
-
-- OpenAI tests are automatically skipped if no API key is provided
````

(deleted file: `tests/test_basic.py`, per the tests README above; the body of `test_imports` appears truncated in this view)

```diff
@@ -1,92 +0,0 @@
-"""
-Basic functionality tests for CI pipeline using pytest.
-"""
-
-import os
-import tempfile
-from pathlib import Path
-
-import pytest
-
-
-def test_imports():
-    """Test that all packages can be imported."""
-
-    # Test C++ extensions
-
-
-@pytest.mark.skipif(
-    os.environ.get("CI") == "true", reason="Skip model tests in CI to avoid MPS memory issues"
-)
-@pytest.mark.parametrize("backend_name", ["hnsw", "diskann"])
-def test_backend_basic(backend_name):
-    """Test basic functionality for each backend."""
-    from leann.api import LeannBuilder, LeannSearcher, SearchResult
-
-    # Create temporary directory for index
-    with tempfile.TemporaryDirectory() as temp_dir:
-        index_path = str(Path(temp_dir) / f"test.{backend_name}")
-
-        # Test with small data
-        texts = [f"This is document {i} about topic {i % 5}" for i in range(100)]
-
-        # Configure builder based on backend
-        if backend_name == "hnsw":
-            builder = LeannBuilder(
-                backend_name="hnsw",
-                embedding_model="facebook/contriever",
-                embedding_mode="sentence-transformers",
-                M=16,
-                efConstruction=200,
-            )
-        else:  # diskann
-            builder = LeannBuilder(
-                backend_name="diskann",
-                embedding_model="facebook/contriever",
-                embedding_mode="sentence-transformers",
-                num_neighbors=32,
-                search_list_size=50,
-            )
-
-        # Add texts
-        for text in texts:
-            builder.add_text(text)
-
-        # Build index
-        builder.build_index(index_path)
-
-        # Test search
-        searcher = LeannSearcher(index_path)
-        results = searcher.search("document about topic 2", top_k=5)
-
-        # Verify results
-        assert len(results) > 0
-        assert isinstance(results[0], SearchResult)
-        assert "topic 2" in results[0].text or "document" in results[0].text
-
-
-@pytest.mark.skipif(
-    os.environ.get("CI") == "true", reason="Skip model tests in CI to avoid MPS memory issues"
-)
-def test_large_index():
-    """Test with larger dataset."""
-    from leann.api import LeannBuilder, LeannSearcher
-
-    with tempfile.TemporaryDirectory() as temp_dir:
-        index_path = str(Path(temp_dir) / "test_large.hnsw")
-        texts = [f"Document {i}: {' '.join([f'word{j}' for j in range(50)])}" for i in range(1000)]
-
-        builder = LeannBuilder(
-            backend_name="hnsw",
-            embedding_model="facebook/contriever",
-            embedding_mode="sentence-transformers",
-        )
-
-        for text in texts:
-            builder.add_text(text)
-
-        builder.build_index(index_path)
-
-        searcher = LeannSearcher(index_path)
-        results = searcher.search(["word10 word20"], top_k=10)
-        assert len(results[0]) == 10
```

(deleted file: a minimal CI test module; path not shown, and the import statements in `test_package_imports` appear truncated in this view)

```diff
@@ -1,49 +0,0 @@
-"""
-Minimal tests for CI that don't require model loading or significant memory.
-"""
-
-import subprocess
-import sys
-
-
-def test_package_imports():
-    """Test that all core packages can be imported."""
-    # Core package
-
-    # Backend packages
-
-    # Core modules
-
-    assert True  # If we get here, imports worked
-
-
-def test_cli_help():
-    """Test that CLI example shows help."""
-    result = subprocess.run(
-        [sys.executable, "examples/main_cli_example.py", "--help"], capture_output=True, text=True
-    )
-
-    assert result.returncode == 0
-    assert "usage:" in result.stdout.lower() or "usage:" in result.stderr.lower()
-    assert "--llm" in result.stdout or "--llm" in result.stderr
-
-
-def test_backend_registration():
-    """Test that backends are properly registered."""
-    from leann.api import get_registered_backends
-
-    backends = get_registered_backends()
-    assert "hnsw" in backends
-    assert "diskann" in backends
-
-
-def test_version_info():
-    """Test that packages have version information."""
-    import leann
-    import leann_backend_diskann
-    import leann_backend_hnsw
-
-    # Check that packages have __version__ or can be imported
-    assert hasattr(leann, "__version__") or True
-    assert hasattr(leann_backend_hnsw, "__version__") or True
-    assert hasattr(leann_backend_diskann, "__version__") or True
```

(deleted file: `tests/test_main_cli.py`, per the tests README above)

```diff
@@ -1,120 +0,0 @@
-"""
-Test main_cli_example functionality using pytest.
-"""
-
-import os
-import subprocess
-import sys
-import tempfile
-from pathlib import Path
-
-import pytest
-
-
-@pytest.fixture
-def test_data_dir():
-    """Return the path to test data directory."""
-    return Path("examples/data")
-
-
-@pytest.mark.skipif(
-    os.environ.get("CI") == "true", reason="Skip model tests in CI to avoid MPS memory issues"
-)
-def test_main_cli_simulated(test_data_dir):
-    """Test main_cli with simulated LLM."""
-    with tempfile.TemporaryDirectory() as temp_dir:
-        # Use a subdirectory that doesn't exist yet to force index creation
-        index_dir = Path(temp_dir) / "test_index"
-        cmd = [
-            sys.executable,
-            "examples/main_cli_example.py",
-            "--llm",
-            "simulated",
-            "--embedding-model",
-            "facebook/contriever",
-            "--embedding-mode",
-            "sentence-transformers",
-            "--index-dir",
-            str(index_dir),
-            "--data-dir",
-            str(test_data_dir),
-            "--query",
-            "What is Pride and Prejudice about?",
-        ]
-
-        env = os.environ.copy()
-        env["HF_HUB_DISABLE_SYMLINKS"] = "1"
-        env["TOKENIZERS_PARALLELISM"] = "false"
-
-        result = subprocess.run(cmd, capture_output=True, text=True, timeout=600, env=env)
-
-        # Check return code
-        assert result.returncode == 0, f"Command failed: {result.stderr}"
-
-        # Verify output
-        output = result.stdout + result.stderr
-        assert "Leann index built at" in output or "Using existing index" in output
-        assert "This is a simulated answer" in output
-
-
-@pytest.mark.skipif(not os.environ.get("OPENAI_API_KEY"), reason="OpenAI API key not available")
-def test_main_cli_openai(test_data_dir):
-    """Test main_cli with OpenAI embeddings."""
-    with tempfile.TemporaryDirectory() as temp_dir:
-        # Use a subdirectory that doesn't exist yet to force index creation
-        index_dir = Path(temp_dir) / "test_index_openai"
-        cmd = [
-            sys.executable,
-            "examples/main_cli_example.py",
-            "--llm",
-            "simulated",  # Use simulated LLM to avoid GPT-4 costs
-            "--embedding-model",
-            "text-embedding-3-small",
-            "--embedding-mode",
-            "openai",
-            "--index-dir",
-            str(index_dir),
-            "--data-dir",
-            str(test_data_dir),
-            "--query",
-            "What is Pride and Prejudice about?",
-        ]
-
-        env = os.environ.copy()
-        env["TOKENIZERS_PARALLELISM"] = "false"
-
-        result = subprocess.run(cmd, capture_output=True, text=True, timeout=600, env=env)
-
-        assert result.returncode == 0, f"Command failed: {result.stderr}"
-
-        # Verify cosine distance was used
-        output = result.stdout + result.stderr
-        assert any(
-            msg in output
-            for msg in [
-                "distance_metric='cosine'",
-                "Automatically setting distance_metric='cosine'",
-                "Using cosine distance",
-            ]
-        )
-
-
-def test_main_cli_error_handling(test_data_dir):
-    """Test main_cli with invalid parameters."""
-    with tempfile.TemporaryDirectory() as temp_dir:
-        cmd = [
-            sys.executable,
-            "examples/main_cli_example.py",
-            "--llm",
-            "invalid_llm_type",
-            "--index-dir",
-            temp_dir,
-            "--data-dir",
-            str(test_data_dir),
-        ]
-
-        result = subprocess.run(cmd, capture_output=True, text=True, timeout=60)
-
-        # Should fail with invalid LLM type
-        assert result.returncode != 0
-        assert "Unknown LLM type" in result.stderr or "invalid_llm_type" in result.stderr
```

(deleted file: `tests/test_readme_examples.py`, per the tests README above)

```diff
@@ -1,165 +0,0 @@
-"""
-Test examples from README.md to ensure documentation is accurate.
-"""
-
-import os
-import platform
-import tempfile
-from pathlib import Path
-
-import pytest
-
-
-def test_readme_basic_example():
-    """Test the basic example from README.md."""
-    # Skip on macOS CI due to MPS environment issues with all-MiniLM-L6-v2
-    if os.environ.get("CI") == "true" and platform.system() == "Darwin":
-        pytest.skip("Skipping on macOS CI due to MPS environment issues with all-MiniLM-L6-v2")
-
-    # This is the exact code from README (with smaller model for CI)
-    from leann import LeannBuilder, LeannChat, LeannSearcher
-    from leann.api import SearchResult
-
-    with tempfile.TemporaryDirectory() as temp_dir:
-        INDEX_PATH = str(Path(temp_dir) / "demo.leann")
-
-        # Build an index
-        # In CI, use a smaller model to avoid memory issues
-        if os.environ.get("CI") == "true":
-            builder = LeannBuilder(
-                backend_name="hnsw",
-                embedding_model="sentence-transformers/all-MiniLM-L6-v2",  # Smaller model
-                dimensions=384,  # Smaller dimensions
-            )
-        else:
-            builder = LeannBuilder(backend_name="hnsw")
-        builder.add_text("LEANN saves 97% storage compared to traditional vector databases.")
-        builder.add_text("Tung Tung Tung Sahur called—they need their banana-crocodile hybrid back")
-        builder.build_index(INDEX_PATH)
-
-        # Verify index was created
-        # The index path should be a directory containing index files
-        index_dir = Path(INDEX_PATH).parent
-        assert index_dir.exists()
-        # Check that index files were created
-        index_files = list(index_dir.glob(f"{Path(INDEX_PATH).stem}.*"))
-        assert len(index_files) > 0
-
-        # Search
-        searcher = LeannSearcher(INDEX_PATH)
-        results = searcher.search("fantastical AI-generated creatures", top_k=1)
-
-        # Verify search results
-        assert len(results) > 0
-        assert isinstance(results[0], SearchResult)
-        # The second text about banana-crocodile should be more relevant
-        assert "banana" in results[0].text or "crocodile" in results[0].text
-
-        # Chat with your data (using simulated LLM to avoid external dependencies)
-        chat = LeannChat(INDEX_PATH, llm_config={"type": "simulated"})
-        response = chat.ask("How much storage does LEANN save?", top_k=1)
-
-        # Verify chat works
-        assert isinstance(response, str)
-        assert len(response) > 0
-
-
-def test_readme_imports():
-    """Test that the imports shown in README work correctly."""
-    # These are the imports shown in README
-    from leann import LeannBuilder, LeannChat, LeannSearcher
-
-    # Verify they are the correct types
-    assert callable(LeannBuilder)
-    assert callable(LeannSearcher)
-    assert callable(LeannChat)
-
-
-def test_backend_options():
-    """Test different backend options mentioned in documentation."""
-    # Skip on macOS CI due to MPS environment issues with all-MiniLM-L6-v2
-    if os.environ.get("CI") == "true" and platform.system() == "Darwin":
-        pytest.skip("Skipping on macOS CI due to MPS environment issues with all-MiniLM-L6-v2")
-
-    from leann import LeannBuilder
-
-    with tempfile.TemporaryDirectory() as temp_dir:
-        # Use smaller model in CI to avoid memory issues
-        if os.environ.get("CI") == "true":
-            model_args = {
-                "embedding_model": "sentence-transformers/all-MiniLM-L6-v2",
-                "dimensions": 384,
-            }
-        else:
-            model_args = {}
-
-        # Test HNSW backend (as shown in README)
-        hnsw_path = str(Path(temp_dir) / "test_hnsw.leann")
-        builder_hnsw = LeannBuilder(backend_name="hnsw", **model_args)
-        builder_hnsw.add_text("Test document for HNSW backend")
-        builder_hnsw.build_index(hnsw_path)
-        assert Path(hnsw_path).parent.exists()
-        assert len(list(Path(hnsw_path).parent.glob(f"{Path(hnsw_path).stem}.*"))) > 0
-
-        # Test DiskANN backend (mentioned as available option)
-        diskann_path = str(Path(temp_dir) / "test_diskann.leann")
-        builder_diskann = LeannBuilder(backend_name="diskann", **model_args)
-        builder_diskann.add_text("Test document for DiskANN backend")
-        builder_diskann.build_index(diskann_path)
-        assert Path(diskann_path).parent.exists()
-        assert len(list(Path(diskann_path).parent.glob(f"{Path(diskann_path).stem}.*"))) > 0
-
-
-def test_llm_config_simulated():
-    """Test simulated LLM configuration option."""
-    # Skip on macOS CI due to MPS environment issues with all-MiniLM-L6-v2
-    if os.environ.get("CI") == "true" and platform.system() == "Darwin":
-        pytest.skip("Skipping on macOS CI due to MPS environment issues with all-MiniLM-L6-v2")
-
-    from leann import LeannBuilder, LeannChat
-
-    with tempfile.TemporaryDirectory() as temp_dir:
-        # Build a simple index
-        index_path = str(Path(temp_dir) / "test.leann")
-        # Use smaller model in CI to avoid memory issues
-        if os.environ.get("CI") == "true":
-            builder = LeannBuilder(
-                backend_name="hnsw",
-                embedding_model="sentence-transformers/all-MiniLM-L6-v2",
-                dimensions=384,
-            )
-        else:
-            builder = LeannBuilder(backend_name="hnsw")
-        builder.add_text("Test document for LLM testing")
-        builder.build_index(index_path)
-
-        # Test simulated LLM config
-        llm_config = {"type": "simulated"}
-        chat = LeannChat(index_path, llm_config=llm_config)
-        response = chat.ask("What is this document about?", top_k=1)
-
-        assert isinstance(response, str)
-        assert len(response) > 0
-
-
-@pytest.mark.skip(reason="Requires HF model download and may timeout")
-def test_llm_config_hf():
-    """Test HuggingFace LLM configuration option."""
-    from leann import LeannBuilder, LeannChat
-
-    pytest.importorskip("transformers")  # Skip if transformers not installed
-
-    with tempfile.TemporaryDirectory() as temp_dir:
-        # Build a simple index
-        index_path = str(Path(temp_dir) / "test.leann")
-        builder = LeannBuilder(backend_name="hnsw")
-        builder.add_text("Test document for LLM testing")
-        builder.build_index(index_path)
-
-        # Test HF LLM config
-        llm_config = {"type": "hf", "model": "Qwen/Qwen3-0.6B"}
-        chat = LeannChat(index_path, llm_config=llm_config)
-        response = chat.ask("What is this document about?", top_k=1)
-
-        assert isinstance(response, str)
-        assert len(response) > 0
```