diff --git a/README.md b/README.md
index 59d488c..9685480 100755
--- a/README.md
+++ b/README.md
@@ -196,6 +196,8 @@ LEANN can create a searchable index of your Chrome browser history, allowing you
 📋 Click to expand: Command Examples
 
+Note: quit Google Chrome before running this; the script cannot open the history database while Chrome is running.
+
 ```bash
 # Use the default Chrome profile (auto-finds all profiles); this is the recommended way to run it, since the default profile is usually enough
 python examples/google_history_reader_leann.py
diff --git a/examples/compare_faiss_vs_leann.py b/examples/compare_faiss_vs_leann.py
index 8d23c37..2a2a55a 100644
--- a/examples/compare_faiss_vs_leann.py
+++ b/examples/compare_faiss_vs_leann.py
@@ -11,6 +11,7 @@ import psutil
 import gc
 import subprocess
 from pathlib import Path
+from llama_index.core.node_parser import SentenceSplitter
 
 # Setup logging
 logging.basicConfig(stream=sys.stdout, level=logging.INFO)
@@ -110,6 +111,72 @@ def test_leann_hnsw():
     tracker.checkpoint("After imports")
 
+    from llama_index.core import SimpleDirectoryReader
+    from leann.api import LeannBuilder, LeannSearcher
+
+    # Load and parse documents
+    documents = SimpleDirectoryReader(
+        "examples/data",
+        recursive=True,
+        encoding="utf-8",
+        required_exts=[".pdf", ".txt", ".md"],
+    ).load_data()
+
+    tracker.checkpoint("After document loading")
+
+    # Parse into chunks
+    node_parser = SentenceSplitter(
+        chunk_size=256, chunk_overlap=20, separator=" ", paragraph_separator="\n\n"
+    )
+
+    all_texts = []
+    for doc in documents:
+        nodes = node_parser.get_nodes_from_documents([doc])
+        for node in nodes:
+            all_texts.append(node.get_content())
+
+    tracker.checkpoint("After text chunking")
+
+    # Build LEANN index
+    INDEX_DIR = Path("./test_leann_comparison")
+    INDEX_PATH = str(INDEX_DIR / "comparison.leann")
+
+    # Check if index already exists
+    if os.path.exists(INDEX_PATH + ".meta.json"):
+        print("Loading existing LEANN HNSW index...")
+        tracker.checkpoint("After loading existing index")
+    else:
+        print("Building new LEANN HNSW index...")
+        # Clean up previous index
+        import shutil
+
+        if INDEX_DIR.exists():
+            shutil.rmtree(INDEX_DIR)
+
+        builder = LeannBuilder(
+            backend_name="hnsw",
+            embedding_model="facebook/contriever",
+            graph_degree=32,
+            complexity=64,
+            is_compact=True,
+            is_recompute=True,
+            num_threads=1,
+        )
+
+        tracker.checkpoint("After builder setup")
+
+        print("Building LEANN HNSW index...")
+
+        for chunk_text in all_texts:
+            builder.add_text(chunk_text)
+
+        builder.build_index(INDEX_PATH)
+        del builder
+        gc.collect()
+
+        tracker.checkpoint("After index building")
+
     # Find existing LEANN index
     index_paths = [
         "./test_leann_comparison/comparison.leann",
@@ -124,10 +191,18 @@ def test_leann_hnsw():
         print("❌ LEANN index not found. Please build it first")
         return {"peak_memory": float("inf"), "error": "Index not found"}
 
+    # Measure runtime memory overhead
+    print("\nMeasuring runtime memory overhead...")
+    runtime_start_mem = get_memory_usage()
+    print(f"Before load memory: {runtime_start_mem:.1f} MB")
+    tracker.checkpoint("Before load memory")
+
     # Load searcher
     searcher = LeannSearcher(index_path)
     tracker.checkpoint("After searcher loading")
+
+    print("Running search queries...")
 
     queries = [
         "什么是盘古大模型以及盘古开发过程中遇到了什么阴暗面,任务令一般在什么城市颁发",
@@ -143,7 +218,11 @@ def test_leann_hnsw():
         print(f"Query {i + 1} time: {query_time:.3f}s")
         tracker.checkpoint(f"After query {i + 1}")
 
+    runtime_end_mem = get_memory_usage()
+    runtime_overhead = runtime_end_mem - runtime_start_mem
+
     peak_memory = tracker.summary()
+    print(f"Runtime Memory Overhead: {runtime_overhead:.1f} MB")
 
     # Get storage size before cleanup
     storage_size = 0
diff --git a/examples/faiss_only.py b/examples/faiss_only.py
index 483f005..bd2822a 100644
--- a/examples/faiss_only.py
+++ b/examples/faiss_only.py
@@ -5,6 +5,7 @@ import sys
 import time
 import psutil
 import gc
+import os
 
 
 def get_memory_usage():
@@ -44,7 +45,10 @@ def main():
         VectorStoreIndex,
         StorageContext,
         Settings,
+        node_parser,
+        Document,
     )
+    from llama_index.core.node_parser import SentenceSplitter
     from llama_index.vector_stores.faiss import FaissVectorStore
     from llama_index.embeddings.huggingface import HuggingFaceEmbedding
@@ -68,15 +72,63 @@ def main():
     ).load_data()
     tracker.checkpoint("After document loading")
 
-    print("Building Faiss HNSW index...")
-    vector_store = FaissVectorStore(faiss_index=faiss_index)
-    storage_context = StorageContext.from_defaults(vector_store=vector_store)
-    index = VectorStoreIndex.from_documents(documents, storage_context=storage_context)
-    tracker.checkpoint("After index building")
+    # Parse into chunks using the same splitter as LEANN
+    node_parser = SentenceSplitter(
+        chunk_size=256, chunk_overlap=20, separator=" ", paragraph_separator="\n\n"
+    )
 
-    index.storage_context.persist("./storage_faiss")
-    tracker.checkpoint("After index saving")
+    all_texts = []
+    for doc in documents:
+        nodes = node_parser.get_nodes_from_documents([doc])
+        for node in nodes:
+            all_texts.append(node.get_content())
 
+    tracker.checkpoint("After text chunking")
+
+    # Check if index already exists and try to load it
+    index_loaded = False
+    if os.path.exists("./storage_faiss"):
+        print("Loading existing Faiss HNSW index...")
+        try:
+            # Use the correct Faiss loading pattern from the example
+            vector_store = FaissVectorStore.from_persist_dir("./storage_faiss")
+            storage_context = StorageContext.from_defaults(
+                vector_store=vector_store, persist_dir="./storage_faiss"
+            )
+            from llama_index.core import load_index_from_storage
+            index = load_index_from_storage(storage_context=storage_context)
+            print("Index loaded from ./storage_faiss")
+            tracker.checkpoint("After loading existing index")
+            index_loaded = True
+        except Exception as e:
+            print(f"Failed to load existing index: {e}")
+            print("Cleaning up corrupted index and building new one...")
+            # Clean up corrupted index
+            import shutil
+            if os.path.exists("./storage_faiss"):
+                shutil.rmtree("./storage_faiss")
+
+    if not index_loaded:
+        print("Building new Faiss HNSW index...")
+
+        # Use the correct Faiss building pattern from the example
+        vector_store = FaissVectorStore(faiss_index=faiss_index)
+        storage_context = StorageContext.from_defaults(vector_store=vector_store)
+        index = VectorStoreIndex.from_documents(
+            documents,
+            storage_context=storage_context
+        )
+        tracker.checkpoint("After index building")
+
+        # Save index to disk using the correct pattern
+        index.storage_context.persist(persist_dir="./storage_faiss")
+        tracker.checkpoint("After index saving")
+
+    # Measure runtime memory overhead
+    print("\nMeasuring runtime memory overhead...")
+    runtime_start_mem = get_memory_usage()
+    print(f"Before load memory: {runtime_start_mem:.1f} MB")
+    tracker.checkpoint("Before load memory")
+
     query_engine = index.as_query_engine(similarity_top_k=20)
     queries = [
         "什么是盘古大模型以及盘古开发过程中遇到了什么阴暗面,任务令一般在什么城市颁发",
@@ -91,8 +143,12 @@ def main():
         print(f"Query {i + 1} time: {query_time:.3f}s")
         tracker.checkpoint(f"After query {i + 1}")
 
+    runtime_end_mem = get_memory_usage()
+    runtime_overhead = runtime_end_mem - runtime_start_mem
+
     peak_memory = tracker.summary()
     print(f"Peak Memory: {peak_memory:.1f} MB")
+    print(f"Runtime Memory Overhead: {runtime_overhead:.1f} MB")
 
 
 if __name__ == "__main__":
diff --git a/examples/google_history_reader_leann.py b/examples/google_history_reader_leann.py
index 7342287..c07f2f0 100644
--- a/examples/google_history_reader_leann.py
+++ b/examples/google_history_reader_leann.py
@@ -199,7 +199,7 @@ async def query_leann_index(index_path: str, query: str):
         query: The query string
     """
     print(f"\n[PHASE 2] Starting Leann chat session...")
-    chat = LeannChat(index_path=index_path, llm_config={"type": "hf", "model": "Qwen/Qwen3-0.6B"})
+    chat = LeannChat(index_path=index_path)
 
     print(f"You: {query}")
     chat_response = chat.ask(
@@ -270,8 +270,6 @@ async def main():
     # Example queries
     queries = [
         "What websites did I visit about machine learning?",
-        "Show me my recent shopping history",
-        "What news sites did I visit this week?",
         "Find my search history about programming"
     ]
diff --git a/packages/leann-backend-hnsw/leann_backend_hnsw/hnsw_embedding_server.py b/packages/leann-backend-hnsw/leann_backend_hnsw/hnsw_embedding_server.py
index 06a29bb..624b8d3 100644
--- a/packages/leann-backend-hnsw/leann_backend_hnsw/hnsw_embedding_server.py
+++ b/packages/leann-backend-hnsw/leann_backend_hnsw/hnsw_embedding_server.py
@@ -680,7 +680,7 @@ def create_hnsw_embedding_server(
                         f"ERROR: Passage ID {nid} not found in passages dict"
                     )
                     print(
-                        f"ERROR: Available passage IDs: {list(passages.keys())[:10]}..."
+                        f"ERROR: Available passage IDs: {list(passages.keys())}"
                     )
                     raise RuntimeError(
                         f"FATAL: Passage with ID {nid} not found"
diff --git a/packages/leann-core/src/leann/api.py b/packages/leann-core/src/leann/api.py
index a195bf8..e8dca42 100644
--- a/packages/leann-core/src/leann/api.py
+++ b/packages/leann-core/src/leann/api.py
@@ -290,7 +290,7 @@ class LeannSearcher:
                 )
             )
             print(
-                f"  {i + 1}. passage_id='{string_id}' -> SUCCESS: {passage_data['text'][:60]}..."
+                f"  {i + 1}. passage_id='{string_id}' -> SUCCESS: {passage_data['text']}"
             )
         except KeyError:
             print(
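
Both example scripts now report a "Runtime Memory Overhead" figure alongside peak memory: they snapshot the process RSS just before the index/searcher is loaded, run the queries, and print the delta. A minimal self-contained sketch of that pattern, assuming the same psutil-based `get_memory_usage()` helper the examples define:

```python
import os

import psutil


def get_memory_usage() -> float:
    """Resident set size (RSS) of the current process, in MB."""
    return psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024


runtime_start_mem = get_memory_usage()  # snapshot before loading the index

# ... load the searcher / query engine and run the queries here ...

runtime_end_mem = get_memory_usage()  # snapshot after the queries complete
print(f"Runtime Memory Overhead: {runtime_end_mem - runtime_start_mem:.1f} MB")
```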