fix mem compare
@@ -196,6 +196,8 @@ LEANN can create a searchable index of your Chrome browser history, allowing you
<details>
<summary><strong>📋 Click to expand: Command Examples</strong></summary>

Note: you need to quit Google Chrome before running this; Chrome locks its history database while it is open.

```bash
# Use the default Chrome profile (auto-finds all profiles). This is the
# recommended way to run it, since the default profile is usually enough.
python examples/google_history_reader_leann.py
```
@@ -11,6 +11,7 @@ import psutil
import gc
import subprocess
from pathlib import Path
from llama_index.core.node_parser import SentenceSplitter

# Setup logging
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
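
The hunks below repeatedly call `tracker.checkpoint(...)` and `tracker.summary()`, but the tracker class itself is outside this diff. Here is a minimal sketch consistent with those call sites, assuming a psutil-based RSS tracker; the class name, fields, and printing are assumptions, not the repo's actual code:

```python
import os
import psutil

class MemoryTracker:
    """Hypothetical sketch of the tracker used below; not the repo's actual class."""

    def __init__(self):
        self.checkpoints = []  # (label, RSS in MB) pairs

    def checkpoint(self, label):
        # Record the current resident set size under a human-readable label
        rss_mb = psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024
        self.checkpoints.append((label, rss_mb))
        print(f"[{label}] RSS: {rss_mb:.1f} MB")

    def summary(self):
        # Report and return the peak RSS seen across all checkpoints,
        # matching `peak_memory = tracker.summary()` below
        peak = max(rss for _, rss in self.checkpoints)
        print(f"Peak RSS across checkpoints: {peak:.1f} MB")
        return peak
```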
@@ -110,6 +111,72 @@ def test_leann_hnsw():

tracker.checkpoint("After imports")

from llama_index.core import SimpleDirectoryReader
from leann.api import LeannBuilder, LeannSearcher

# Load and parse documents
documents = SimpleDirectoryReader(
    "examples/data",
    recursive=True,
    encoding="utf-8",
    required_exts=[".pdf", ".txt", ".md"],
).load_data()

tracker.checkpoint("After document loading")

# Parse into chunks
node_parser = SentenceSplitter(
    chunk_size=256, chunk_overlap=20, separator=" ", paragraph_separator="\n\n"
)

all_texts = []
for doc in documents:
    nodes = node_parser.get_nodes_from_documents([doc])
    for node in nodes:
        all_texts.append(node.get_content())

tracker.checkpoint("After text chunking")

# Build LEANN index
INDEX_DIR = Path("./test_leann_comparison")
INDEX_PATH = str(INDEX_DIR / "comparison.leann")

# Check if index already exists
if os.path.exists(INDEX_PATH + ".meta.json"):
    print("Loading existing LEANN HNSW index...")
    tracker.checkpoint("After loading existing index")
else:
    print("Building new LEANN HNSW index...")
    # Clean up previous index
    import shutil

    if INDEX_DIR.exists():
        shutil.rmtree(INDEX_DIR)

    builder = LeannBuilder(
        backend_name="hnsw",
        embedding_model="facebook/contriever",
        graph_degree=32,
        complexity=64,
        is_compact=True,
        is_recompute=True,
        num_threads=1,
    )

    tracker.checkpoint("After builder setup")

    print("Building LEANN HNSW index...")

    for chunk_text in all_texts:
        builder.add_text(chunk_text)

    builder.build_index(INDEX_PATH)
    del builder
    gc.collect()

    tracker.checkpoint("After index building")

# Find existing LEANN index
index_paths = [
    "./test_leann_comparison/comparison.leann",
@@ -124,10 +191,18 @@ def test_leann_hnsw():
print("❌ LEANN index not found. Please build it first")
return {"peak_memory": float("inf"), "error": "Index not found"}

# Measure runtime memory overhead
print("\nMeasuring runtime memory overhead...")
runtime_start_mem = get_memory_usage()
print(f"Before load memory: {runtime_start_mem:.1f} MB")
tracker.checkpoint("Before load memory")

# Load searcher
searcher = LeannSearcher(index_path)
tracker.checkpoint("After searcher loading")

print("Running search queries...")
queries = [
    # Translation: "What is the Pangu large model, what dark sides came up
    # during its development, and in which city are task orders usually issued?"
    "什么是盘古大模型以及盘古开发过程中遇到了什么阴暗面,任务令一般在什么城市颁发",
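
The query loop itself is cut off here; the next hunk only shows its prints. Here is a sketch of its likely shape — `searcher.search(query, top_k=...)` is assumed from LEANN's public API and may not match the file exactly:

```python
import time

# Assumed reconstruction of the truncated timing loop
for i, query in enumerate(queries):
    start = time.time()
    results = searcher.search(query, top_k=20)  # top_k value is a guess
    query_time = time.time() - start
    print(f"Query {i + 1} time: {query_time:.3f}s")
    tracker.checkpoint(f"After query {i + 1}")
```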
@@ -143,7 +218,11 @@ def test_leann_hnsw():
    print(f"Query {i + 1} time: {query_time:.3f}s")
    tracker.checkpoint(f"After query {i + 1}")

runtime_end_mem = get_memory_usage()
runtime_overhead = runtime_end_mem - runtime_start_mem

peak_memory = tracker.summary()
print(f"Runtime Memory Overhead: {runtime_overhead:.1f} MB")

# Get storage size before cleanup
storage_size = 0
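
The hunk ends just after `storage_size = 0`. A plausible continuation, purely illustrative — the real file may total the index size differently:

```python
# Hypothetical continuation: total the on-disk size of the index files, in MB
for f in INDEX_DIR.rglob("*"):
    if f.is_file():
        storage_size += f.stat().st_size / 1024 / 1024
print(f"Index storage size: {storage_size:.1f} MB")
```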
@@ -5,6 +5,7 @@ import sys
import time
import psutil
import gc
import os

def get_memory_usage():
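
The function body is cut off at the hunk boundary; a typical psutil implementation consistent with how it is used below (assumed, not necessarily the repo's exact code):

```python
import os
import psutil

def get_memory_usage():
    # Resident set size of the current process, in MB (assumed implementation)
    return psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024
```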
@@ -44,7 +45,10 @@ def main():
    VectorStoreIndex,
    StorageContext,
    Settings,
    node_parser,
    Document,
)
from llama_index.core.node_parser import SentenceSplitter
from llama_index.vector_stores.faiss import FaissVectorStore
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
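
For the comparison to be fair, the Faiss script presumably points llama_index at the same embedding model as the LEANN side. That wiring is not shown in this hunk; a sketch using llama_index's standard `Settings`, with the model choice assumed from the LEANN builder above:

```python
# Assumed wiring: same embedding model as the LEANN builder above,
# so both indexes embed chunks identically
Settings.embed_model = HuggingFaceEmbedding(model_name="facebook/contriever")
```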
@@ -68,15 +72,63 @@ def main():
).load_data()
tracker.checkpoint("After document loading")

print("Building Faiss HNSW index...")
vector_store = FaissVectorStore(faiss_index=faiss_index)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_documents(documents, storage_context=storage_context)
tracker.checkpoint("After index building")
# Parse into chunks using the same splitter as LEANN
node_parser = SentenceSplitter(
    chunk_size=256, chunk_overlap=20, separator=" ", paragraph_separator="\n\n"
)

index.storage_context.persist("./storage_faiss")
tracker.checkpoint("After index saving")
all_texts = []
for doc in documents:
    nodes = node_parser.get_nodes_from_documents([doc])
    for node in nodes:
        all_texts.append(node.get_content())

tracker.checkpoint("After text chunking")
# Check if index already exists and try to load it
index_loaded = False
if os.path.exists("./storage_faiss"):
    print("Loading existing Faiss HNSW index...")
    try:
        # Use the correct Faiss loading pattern from the example
        vector_store = FaissVectorStore.from_persist_dir("./storage_faiss")
        storage_context = StorageContext.from_defaults(
            vector_store=vector_store, persist_dir="./storage_faiss"
        )
        from llama_index.core import load_index_from_storage

        index = load_index_from_storage(storage_context=storage_context)
        print("Index loaded from ./storage_faiss")
        tracker.checkpoint("After loading existing index")
        index_loaded = True
    except Exception as e:
        print(f"Failed to load existing index: {e}")
        print("Cleaning up corrupted index and building new one...")
        # Clean up corrupted index
        import shutil

        if os.path.exists("./storage_faiss"):
            shutil.rmtree("./storage_faiss")

if not index_loaded:
    print("Building new Faiss HNSW index...")

    # Use the correct Faiss building pattern from the example
    vector_store = FaissVectorStore(faiss_index=faiss_index)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    index = VectorStoreIndex.from_documents(
        documents, storage_context=storage_context
    )
    tracker.checkpoint("After index building")

    # Save index to disk using the correct pattern
    index.storage_context.persist(persist_dir="./storage_faiss")
    tracker.checkpoint("After index saving")

# Measure runtime memory overhead
print("\nMeasuring runtime memory overhead...")
runtime_start_mem = get_memory_usage()
print(f"Before load memory: {runtime_start_mem:.1f} MB")
tracker.checkpoint("Before load memory")

query_engine = index.as_query_engine(similarity_top_k=20)
queries = [
    # Translation: "What is the Pangu large model, what dark sides came up
    # during its development, and in which city are task orders usually issued?"
    "什么是盘古大模型以及盘古开发过程中遇到了什么阴暗面,任务令一般在什么城市颁发",
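
As on the LEANN side, the actual query loop is truncated; a sketch of its likely shape using llama_index's standard `query_engine.query` (the loop variable names are assumed to mirror the prints in the next hunk):

```python
import time

# Assumed reconstruction of the truncated Faiss query loop
for i, query in enumerate(queries):
    start = time.time()
    response = query_engine.query(query)
    query_time = time.time() - start
    print(f"Query {i + 1} time: {query_time:.3f}s")
    tracker.checkpoint(f"After query {i + 1}")
```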
@@ -91,8 +143,12 @@ def main():
    print(f"Query {i + 1} time: {query_time:.3f}s")
    tracker.checkpoint(f"After query {i + 1}")

runtime_end_mem = get_memory_usage()
runtime_overhead = runtime_end_mem - runtime_start_mem

peak_memory = tracker.summary()
print(f"Peak Memory: {peak_memory:.1f} MB")
print(f"Runtime Memory Overhead: {runtime_overhead:.1f} MB")


if __name__ == "__main__":
@@ -199,7 +199,7 @@ async def query_leann_index(index_path: str, query: str):
    query: The query string
"""
print(f"\n[PHASE 2] Starting Leann chat session...")
chat = LeannChat(index_path=index_path, llm_config={"type": "hf", "model": "Qwen/Qwen3-0.6B"})
chat = LeannChat(index_path=index_path)

print(f"You: {query}")
chat_response = chat.ask(
@@ -270,8 +270,6 @@ async def main():
# Example queries
queries = [
    "What websites did I visit about machine learning?",
    "Show me my recent shopping history",
    "What news sites did I visit this week?",
    "Find my search history about programming"
]

@@ -680,7 +680,7 @@ def create_hnsw_embedding_server(
    f"ERROR: Passage ID {nid} not found in passages dict"
)
print(
    f"ERROR: Available passage IDs: {list(passages.keys())[:10]}..."
    f"ERROR: Available passage IDs: {list(passages.keys())}..."
)
raise RuntimeError(
    f"FATAL: Passage with ID {nid} not found"
@@ -290,7 +290,7 @@ class LeannSearcher:
    )
)
print(
    f" {i + 1}. passage_id='{string_id}' -> SUCCESS: {passage_data['text'][:60]}..."
    f" {i + 1}. passage_id='{string_id}' -> SUCCESS: {passage_data['text']}..."
)
except KeyError:
    print(