# LEANN/benchmarks/compare_faiss_vs_leann.py

#!/usr/bin/env python3
"""
Memory comparison between Faiss HNSW and LEANN HNSW backend
"""

import gc
import logging
import os
import subprocess
import sys
import time
from pathlib import Path

import psutil
from llama_index.core.node_parser import SentenceSplitter

# Setup logging: configure the root logger to write INFO-and-above records to stdout.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
# Module-level logger named after this module.
logger = logging.getLogger(__name__)


def get_memory_usage():
    """Return the resident set size (RSS) of the current process in MB.

    The byte count reported by psutil is divided by 1024 twice, so one "MB"
    here is 1024 * 1024 bytes.
    """
    rss_bytes = psutil.Process().memory_info().rss
    rss_kb = rss_bytes / 1024
    return rss_kb / 1024


def print_memory_stats(stage: str, start_mem: float) -> float:
    """Print the current memory usage and its change relative to a baseline.

    Args:
        stage: Label printed in square brackets at the start of the line.
        start_mem: Baseline memory in MB that the delta is computed against.

    Returns:
        The current memory usage in MB, as reported by ``get_memory_usage()``.
    """
    current_mem = get_memory_usage()
    diff = current_mem - start_mem
    # The ``+`` sign option makes the format emit the sign itself, so a drop in
    # memory prints as "(-3.2 MB)" rather than the malformed "(+-3.2 MB)".
    print(f"[{stage}] Memory: {current_mem:.1f} MB ({diff:+.1f} MB)")
    return current_mem


class MemoryTracker:
    """Record named memory checkpoints relative to a baseline taken at creation.

    Attributes are plain instance fields:
      * ``name`` - label used as a prefix in printed output.
      * ``start_mem`` - memory usage in MB when the tracker was created.
      * ``stages`` - list of ``(stage, memory_mb)`` tuples in checkpoint order.
    """

    def __init__(self, name: str) -> None:
        """Create a tracker labelled *name* and capture the baseline memory."""
        self.name = name
        self.start_mem = get_memory_usage()
        self.stages = []

    def checkpoint(self, stage: str) -> float:
        """Print and record the current memory usage under the label *stage*.

        Returns:
            The current memory usage in MB.
        """
        current_mem = print_memory_stats(f"{self.name} - {stage}", self.start_mem)
        self.stages.append((stage, current_mem))
        return current_mem

    def summary(self) -> float:
        """Print every recorded checkpoint plus the peak, and return the peak.

        Returns:
            The highest memory value (MB) among the recorded checkpoints. When
            no checkpoint has been recorded, the baseline ``start_mem`` is
            used, so the reported increase is 0.
        """
        print(f"\n=== {self.name} Memory Summary ===")
        for stage, mem in self.stages:
            print(f"{stage}: {mem:.1f} MB")
        # ``default`` avoids a ValueError from max() on an empty sequence when
        # summary() is called before any checkpoint().
        peak_mem = max((mem for _, mem in self.stages), default=self.start_mem)
        print(f"Peak Memory: {peak_mem:.1f} MB")
        print(f"Total Memory Increase: {peak_mem - self.start_mem:.1f} MB")
        return peak_mem


def _parse_peak_memory(output):
    """Extract the reported peak memory (MB) from benchmark output.

    Looks at every line containing ``Peak Memory:`` and parses the text between
    that marker and ``MB`` as a float. If several lines match, the last one
    that parses wins. Returns ``None`` when no line yields a number.
    """
    peak = None
    for line in output.splitlines():
        if "Peak Memory:" not in line:
            continue
        try:
            peak = float(line.split("Peak Memory:")[1].split("MB")[0].strip())
        except ValueError:
            # Malformed line; keep any value parsed from an earlier line.
            continue
    return peak


def test_faiss_hnsw():
    """Run the Faiss HNSW benchmark in a subprocess and report its peak memory.

    Executes ``benchmarks/faiss_only.py`` (resolved against the current working
    directory) with the running interpreter, echoes its stdout/stderr, and
    parses the ``Peak Memory:`` line from stdout.

    Returns:
        ``{"peak_memory": <float MB>}`` on success. On any failure (non-zero
        exit code, timeout after 300 seconds, launch error, or no parsable
        ``Peak Memory:`` line) returns
        ``{"peak_memory": float("inf"), "error": <message>}``.
    """
    print("\n" + "=" * 50)
    print("TESTING FAISS HNSW VECTOR STORE")
    print("=" * 50)

    try:
        result = subprocess.run(
            [sys.executable, "benchmarks/faiss_only.py"],
            capture_output=True,
            text=True,
            timeout=300,
        )

        print(result.stdout)
        if result.stderr:
            print("Stderr:", result.stderr)

        if result.returncode != 0:
            return {
                "peak_memory": float("inf"),
                "error": f"Process failed with code {result.returncode}",
            }

        peak_memory = _parse_peak_memory(result.stdout)
        if peak_memory is None:
            # Without this, a run that never printed its peak would be reported
            # as a successful 0.0 MB measurement.
            return {
                "peak_memory": float("inf"),
                "error": "Could not find 'Peak Memory:' in benchmark output",
            }

        return {"peak_memory": peak_memory}

    except Exception as e:
        # Deliberately broad: this is the boundary of the benchmark run, and a
        # failure here is reported in the result dict rather than raised.
        return {
            "peak_memory": float("inf"),
            "error": str(e),
        }


def _leann_index_size_mb(index_dir):
    """Return the total size in MB of index-structure files under *index_dir*.

    Only files whose names end in ``.index``, ``.idx`` or ``.map`` are counted;
    everything else (e.g. ``.json``/``.jsonl`` data, ``.bak``/``.old``/``.tmp``
    files) is ignored. Returns 0 when the directory does not exist.
    """
    if not index_dir.exists():
        return 0
    total_bytes = sum(
        os.path.getsize(os.path.join(dirpath, filename))
        for dirpath, _, filenames in os.walk(str(index_dir))
        for filename in filenames
        if filename.endswith((".index", ".idx", ".map"))
    )
    return total_bytes / (1024 * 1024)  # Convert to MB


def _build_leann_index(all_texts, index_dir, index_path, tracker):
    """Build a fresh LEANN HNSW index from *all_texts* at *index_path*.

    Any existing *index_dir* is removed first. Memory checkpoints are recorded
    on *tracker* after the builder is created and after the index is built.
    """
    import shutil

    from leann.api import LeannBuilder

    # Clean up previous index
    if index_dir.exists():
        shutil.rmtree(index_dir)

    builder = LeannBuilder(
        backend_name="hnsw",
        embedding_model="facebook/contriever",
        graph_degree=32,
        complexity=64,
        is_compact=True,
        is_recompute=True,
        num_threads=1,
    )

    tracker.checkpoint("After builder setup")

    print("Building LEANN HNSW index...")

    for chunk_text in all_texts:
        builder.add_text(chunk_text)

    builder.build_index(index_path)
    # Drop the builder before the next checkpoint is taken.
    del builder
    gc.collect()

    tracker.checkpoint("After index building")


def _run_leann_hnsw_benchmark():
    """Load/chunk documents, ensure the index exists, then measure search memory.

    Returns the same result dict as ``test_leann_hnsw``; exceptions propagate
    to the caller.
    """
    tracker = MemoryTracker("LEANN HNSW Search")

    # The imports sit between checkpoints so their memory cost is visible.
    tracker.checkpoint("Initial")

    from leann.api import LeannSearcher

    tracker.checkpoint("After imports")

    from llama_index.core import SimpleDirectoryReader

    # NOTE(review): documents are loaded and chunked even when the index already
    # exists, so their memory is part of the tracked peak - confirm this is
    # intended for a "search memory" measurement.
    documents = SimpleDirectoryReader(
        "data",
        recursive=True,
        encoding="utf-8",
        required_exts=[".pdf", ".txt", ".md"],
    ).load_data()

    tracker.checkpoint("After document loading")

    # Parse into chunks
    node_parser = SentenceSplitter(
        chunk_size=256, chunk_overlap=20, separator=" ", paragraph_separator="\n\n"
    )

    all_texts = [
        node.get_content()
        for doc in documents
        for node in node_parser.get_nodes_from_documents([doc])
    ]
    print(f"Total number of chunks: {len(all_texts)}")

    tracker.checkpoint("After text chunking")

    # Single source of truth for the index location (build and search).
    index_dir = Path("./test_leann_comparison")
    index_path = str(index_dir / "comparison.leann")
    meta_path = index_path + ".meta.json"

    if os.path.exists(meta_path):
        print("Loading existing LEANN HNSW index...")
        tracker.checkpoint("After loading existing index")
    else:
        print("Building new LEANN HNSW index...")
        _build_leann_index(all_texts, index_dir, index_path, tracker)

    # Re-check after a possible build: the metadata file marks a usable index.
    if not os.path.exists(meta_path):
        print("❌ LEANN index not found. Please build it first")
        return {"peak_memory": float("inf"), "error": "Index not found"}

    # Measure runtime memory overhead
    print("\nMeasuring runtime memory overhead...")
    runtime_start_mem = get_memory_usage()
    print(f"Before load memory: {runtime_start_mem:.1f} MB")
    tracker.checkpoint("Before load memory")

    # Load searcher
    searcher = LeannSearcher(index_path)
    tracker.checkpoint("After searcher loading")

    print("Running search queries...")
    queries = [
        "什么是盘古大模型以及盘古开发过程中遇到了什么阴暗面,任务令一般在什么城市颁发",
        "What is LEANN and how does it work?",
        "华为诺亚方舟实验室的主要研究内容",
    ]

    for i, query in enumerate(queries, start=1):
        # perf_counter is monotonic, so the interval is unaffected by
        # wall-clock adjustments.
        started = time.perf_counter()
        # Use same parameters as Faiss: top_k=20, ef=120 (complexity parameter)
        _ = searcher.search(query, top_k=20, ef=120)
        query_time = time.perf_counter() - started
        print(f"Query {i} time: {query_time:.3f}s")
        tracker.checkpoint(f"After query {i}")

    runtime_end_mem = get_memory_usage()
    runtime_overhead = runtime_end_mem - runtime_start_mem

    peak_memory = tracker.summary()
    print(f"Runtime Memory Overhead: {runtime_overhead:.1f} MB")

    # Get storage size before cleanup
    storage_size = _leann_index_size_mb(Path(index_path).parent)

    # Clean up
    del searcher
    gc.collect()

    return {
        "peak_memory": peak_memory,
        "storage_size": storage_size,
    }


def test_leann_hnsw():
    """Test LEANN HNSW search memory, building the index first if it is missing.

    Loads and chunks the documents under ``data``, reuses the index at
    ``./test_leann_comparison/comparison.leann`` when its ``.meta.json`` file
    exists (otherwise builds it), then loads a searcher, runs three queries and
    records memory checkpoints throughout.

    Returns:
        ``{"peak_memory": <float MB>, "storage_size": <MB>}`` on success, or
        ``{"peak_memory": float("inf"), "error": <message>}`` when the index
        cannot be found or any step raises. Failures are returned rather than
        raised, matching ``test_faiss_hnsw``.
    """
    print("\n" + "=" * 50)
    print("TESTING LEANN HNSW SEARCH MEMORY")
    print("=" * 50)

    try:
        return _run_leann_hnsw_benchmark()
    except Exception as e:
        # Boundary of the benchmark run: log the traceback and report the
        # failure in the result dict instead of aborting the whole comparison.
        logger.exception("LEANN HNSW benchmark failed")
        return {"peak_memory": float("inf"), "error": str(e)}


def _directory_size_mb(directory):
    """Return the total size in MB of all files under *directory* (0 if missing)."""
    if not os.path.exists(directory):
        return 0
    total_bytes = sum(
        os.path.getsize(os.path.join(dirpath, filename))
        for dirpath, _, filenames in os.walk(directory)
        for filename in filenames
    )
    return total_bytes / (1024 * 1024)  # Convert to MB


def main():
    """Run both benchmarks and print a memory and storage comparison.

    Runs the Faiss test, then the LEANN test, then prints per-backend results.
    The relative comparison is printed only when neither result dict contains
    an ``"error"`` key; otherwise whichever backend succeeded is summarized.
    """
    print("Storage + Search Memory Comparison: Faiss HNSW vs LEANN HNSW")
    print("=" * 60)

    # Test Faiss HNSW
    faiss_results = test_faiss_hnsw()

    # Force garbage collection and pause before starting the second measurement
    gc.collect()
    time.sleep(2)

    # Test LEANN HNSW
    leann_results = test_leann_hnsw()

    # Final comparison
    print("\n" + "=" * 60)
    print("STORAGE + SEARCH MEMORY COMPARISON")
    print("=" * 60)

    # Get storage sizes
    faiss_storage_size = _directory_size_mb("./storage_faiss")
    leann_storage_size = leann_results.get("storage_size", 0)

    faiss_ok = "error" not in faiss_results
    leann_ok = "error" not in leann_results

    print("Faiss HNSW:")
    if faiss_ok:
        print(f"  Search Memory: {faiss_results['peak_memory']:.1f} MB")
        print(f"  Storage Size: {faiss_storage_size:.1f} MB")
    else:
        print(f"  ❌ Failed: {faiss_results['error']}")

    print("\nLEANN HNSW:")
    if leann_ok:
        print(f"  Search Memory: {leann_results['peak_memory']:.1f} MB")
        print(f"  Storage Size: {leann_storage_size:.1f} MB")
    else:
        print(f"  ❌ Failed: {leann_results['error']}")

    # Calculate improvements only if both tests succeeded
    if faiss_ok and leann_ok:
        memory_ratio = faiss_results["peak_memory"] / leann_results["peak_memory"]

        print("\nLEANN vs Faiss Performance:")
        memory_saving = faiss_results["peak_memory"] - leann_results["peak_memory"]
        print(f"  Search Memory: {memory_ratio:.1f}x less ({memory_saving:.1f} MB saved)")

        # Storage comparison
        if leann_storage_size == faiss_storage_size:
            print("  Storage Size: similar")
        elif min(leann_storage_size, faiss_storage_size) <= 0:
            # One side measured 0 MB (e.g. ./storage_faiss is missing), so a
            # ratio would divide by zero.
            print("  Storage Size: n/a (storage size unavailable for one backend)")
        elif leann_storage_size > faiss_storage_size:
            storage_ratio = leann_storage_size / faiss_storage_size
            print(f"  Storage Size: {storage_ratio:.1f}x larger (LEANN uses more storage)")
        else:
            storage_ratio = faiss_storage_size / leann_storage_size
            print(f"  Storage Size: {storage_ratio:.1f}x smaller (LEANN uses less storage)")
    else:
        if leann_ok:
            print("\n✅ LEANN HNSW completed successfully!")
            print(f"📊 Search Memory: {leann_results['peak_memory']:.1f} MB")
            print(f"📊 Storage Size: {leann_storage_size:.1f} MB")
        if faiss_ok:
            print("\n✅ Faiss HNSW completed successfully!")
            print(f"📊 Search Memory: {faiss_results['peak_memory']:.1f} MB")
            print(f"📊 Storage Size: {faiss_storage_size:.1f} MB")


# Run the comparison only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()