fix mem compare
@@ -196,6 +196,8 @@ LEANN can create a searchable index of your Chrome browser history, allowing you
<details>
<summary><strong>📋 Click to expand: Command Examples</strong></summary>

Note: you need to quit Google Chrome before running this; Chrome locks its history database while it is open.

```bash
# Use the default Chrome profile (auto-finds all profiles). This is the
# recommended way to run it, since the default profile is usually enough.
python examples/google_history_reader_leann.py
```
@@ -11,6 +11,7 @@ import psutil
import gc
import subprocess
from pathlib import Path
from llama_index.core.node_parser import SentenceSplitter

# Setup logging
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
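
The hunks below repeatedly call `tracker.checkpoint(...)` and `tracker.summary()`, but the tracker class itself is outside this diff. Here is a minimal sketch consistent with those call sites, assuming a psutil-based RSS tracker; the class name, fields, and printing are assumptions, not the repo's actual code:

```python
import os
import psutil

class MemoryTracker:
    """Hypothetical sketch of the tracker used below; not the repo's actual class."""

    def __init__(self):
        self.checkpoints = []  # (label, RSS in MB) pairs

    def checkpoint(self, label):
        # Record the current resident set size under a human-readable label
        rss_mb = psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024
        self.checkpoints.append((label, rss_mb))
        print(f"[{label}] RSS: {rss_mb:.1f} MB")

    def summary(self):
        # Report and return the peak RSS seen across all checkpoints,
        # matching `peak_memory = tracker.summary()` below
        peak = max(rss for _, rss in self.checkpoints)
        print(f"Peak RSS across checkpoints: {peak:.1f} MB")
        return peak
```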
@@ -110,6 +111,72 @@ def test_leann_hnsw():

tracker.checkpoint("After imports")

from llama_index.core import SimpleDirectoryReader
from leann.api import LeannBuilder, LeannSearcher

# Load and parse documents
documents = SimpleDirectoryReader(
    "examples/data",
    recursive=True,
    encoding="utf-8",
    required_exts=[".pdf", ".txt", ".md"],
).load_data()

tracker.checkpoint("After document loading")

# Parse into chunks
node_parser = SentenceSplitter(
    chunk_size=256, chunk_overlap=20, separator=" ", paragraph_separator="\n\n"
)

all_texts = []
for doc in documents:
    nodes = node_parser.get_nodes_from_documents([doc])
    for node in nodes:
        all_texts.append(node.get_content())

tracker.checkpoint("After text chunking")

# Build LEANN index
INDEX_DIR = Path("./test_leann_comparison")
INDEX_PATH = str(INDEX_DIR / "comparison.leann")

# Check if index already exists
if os.path.exists(INDEX_PATH + ".meta.json"):
    print("Loading existing LEANN HNSW index...")
    tracker.checkpoint("After loading existing index")
else:
    print("Building new LEANN HNSW index...")
    # Clean up previous index
    import shutil

    if INDEX_DIR.exists():
        shutil.rmtree(INDEX_DIR)

    builder = LeannBuilder(
        backend_name="hnsw",
        embedding_model="facebook/contriever",
        graph_degree=32,
        complexity=64,
        is_compact=True,
        is_recompute=True,
        num_threads=1,
    )

    tracker.checkpoint("After builder setup")

    print("Building LEANN HNSW index...")

    for chunk_text in all_texts:
        builder.add_text(chunk_text)

    builder.build_index(INDEX_PATH)
    del builder
    gc.collect()

    tracker.checkpoint("After index building")

# Find existing LEANN index
index_paths = [
    "./test_leann_comparison/comparison.leann",
@@ -124,10 +191,18 @@ def test_leann_hnsw():
print("❌ LEANN index not found. Please build it first")
return {"peak_memory": float("inf"), "error": "Index not found"}

# Measure runtime memory overhead
print("\nMeasuring runtime memory overhead...")
runtime_start_mem = get_memory_usage()
print(f"Before load memory: {runtime_start_mem:.1f} MB")
tracker.checkpoint("Before load memory")

# Load searcher
searcher = LeannSearcher(index_path)
tracker.checkpoint("After searcher loading")

print("Running search queries...")
queries = [
    # Translation: "What is the Pangu large model, what dark sides came up
    # during its development, and in which city are task orders usually issued?"
    "什么是盘古大模型以及盘古开发过程中遇到了什么阴暗面,任务令一般在什么城市颁发",
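
The query loop itself is cut off here; the next hunk only shows its prints. Here is a sketch of its likely shape — `searcher.search(query, top_k=...)` is assumed from LEANN's public API and may not match the file exactly:

```python
import time

# Assumed reconstruction of the truncated timing loop
for i, query in enumerate(queries):
    start = time.time()
    results = searcher.search(query, top_k=20)  # top_k value is a guess
    query_time = time.time() - start
    print(f"Query {i + 1} time: {query_time:.3f}s")
    tracker.checkpoint(f"After query {i + 1}")
```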
@@ -143,7 +218,11 @@ def test_leann_hnsw():
    print(f"Query {i + 1} time: {query_time:.3f}s")
    tracker.checkpoint(f"After query {i + 1}")

runtime_end_mem = get_memory_usage()
runtime_overhead = runtime_end_mem - runtime_start_mem

peak_memory = tracker.summary()
print(f"Runtime Memory Overhead: {runtime_overhead:.1f} MB")

# Get storage size before cleanup
storage_size = 0
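
The hunk ends just after `storage_size = 0`. A plausible continuation, purely illustrative — the real file may total the index size differently:

```python
# Hypothetical continuation: total the on-disk size of the index files, in MB
for f in INDEX_DIR.rglob("*"):
    if f.is_file():
        storage_size += f.stat().st_size / 1024 / 1024
print(f"Index storage size: {storage_size:.1f} MB")
```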
@@ -5,6 +5,7 @@ import sys
import time
import psutil
import gc
import os

def get_memory_usage():
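
The function body is cut off at the hunk boundary; a typical psutil implementation consistent with how it is used below (assumed, not necessarily the repo's exact code):

```python
import os
import psutil

def get_memory_usage():
    # Resident set size of the current process, in MB (assumed implementation)
    return psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024
```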
@@ -44,7 +45,10 @@ def main():
    VectorStoreIndex,
    StorageContext,
    Settings,
    node_parser,
    Document,
)
from llama_index.core.node_parser import SentenceSplitter
from llama_index.vector_stores.faiss import FaissVectorStore
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
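
For the comparison to be fair, the Faiss script presumably points llama_index at the same embedding model as the LEANN side. That wiring is not shown in this hunk; a sketch using llama_index's standard `Settings`, with the model choice assumed from the LEANN builder above:

```python
# Assumed wiring: same embedding model as the LEANN builder above,
# so both indexes embed chunks identically
Settings.embed_model = HuggingFaceEmbedding(model_name="facebook/contriever")
```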
@@ -68,15 +72,63 @@ def main():
).load_data()
tracker.checkpoint("After document loading")

print("Building Faiss HNSW index...")
vector_store = FaissVectorStore(faiss_index=faiss_index)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_documents(documents, storage_context=storage_context)
tracker.checkpoint("After index building")
# Parse into chunks using the same splitter as LEANN
node_parser = SentenceSplitter(
    chunk_size=256, chunk_overlap=20, separator=" ", paragraph_separator="\n\n"
)

index.storage_context.persist("./storage_faiss")
tracker.checkpoint("After index saving")
all_texts = []
for doc in documents:
    nodes = node_parser.get_nodes_from_documents([doc])
    for node in nodes:
        all_texts.append(node.get_content())

tracker.checkpoint("After text chunking")
# Check if index already exists and try to load it
index_loaded = False
if os.path.exists("./storage_faiss"):
    print("Loading existing Faiss HNSW index...")
    try:
        # Use the correct Faiss loading pattern from the example
        vector_store = FaissVectorStore.from_persist_dir("./storage_faiss")
        storage_context = StorageContext.from_defaults(
            vector_store=vector_store, persist_dir="./storage_faiss"
        )
        from llama_index.core import load_index_from_storage

        index = load_index_from_storage(storage_context=storage_context)
        print("Index loaded from ./storage_faiss")
        tracker.checkpoint("After loading existing index")
        index_loaded = True
    except Exception as e:
        print(f"Failed to load existing index: {e}")
        print("Cleaning up corrupted index and building new one...")
        # Clean up corrupted index
        import shutil

        if os.path.exists("./storage_faiss"):
            shutil.rmtree("./storage_faiss")

if not index_loaded:
    print("Building new Faiss HNSW index...")

    # Use the correct Faiss building pattern from the example
    vector_store = FaissVectorStore(faiss_index=faiss_index)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    index = VectorStoreIndex.from_documents(
        documents, storage_context=storage_context
    )
    tracker.checkpoint("After index building")

    # Save index to disk using the correct pattern
    index.storage_context.persist(persist_dir="./storage_faiss")
    tracker.checkpoint("After index saving")

# Measure runtime memory overhead
print("\nMeasuring runtime memory overhead...")
runtime_start_mem = get_memory_usage()
print(f"Before load memory: {runtime_start_mem:.1f} MB")
tracker.checkpoint("Before load memory")

query_engine = index.as_query_engine(similarity_top_k=20)
queries = [
    # Translation: "What is the Pangu large model, what dark sides came up
    # during its development, and in which city are task orders usually issued?"
    "什么是盘古大模型以及盘古开发过程中遇到了什么阴暗面,任务令一般在什么城市颁发",
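
As on the LEANN side, the actual query loop is truncated; a sketch of its likely shape using llama_index's standard `query_engine.query` (the loop variable names are assumed to mirror the prints in the next hunk):

```python
import time

# Assumed reconstruction of the truncated Faiss query loop
for i, query in enumerate(queries):
    start = time.time()
    response = query_engine.query(query)
    query_time = time.time() - start
    print(f"Query {i + 1} time: {query_time:.3f}s")
    tracker.checkpoint(f"After query {i + 1}")
```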
@@ -91,8 +143,12 @@ def main():
    print(f"Query {i + 1} time: {query_time:.3f}s")
    tracker.checkpoint(f"After query {i + 1}")

runtime_end_mem = get_memory_usage()
runtime_overhead = runtime_end_mem - runtime_start_mem

peak_memory = tracker.summary()
print(f"Peak Memory: {peak_memory:.1f} MB")
print(f"Runtime Memory Overhead: {runtime_overhead:.1f} MB")


if __name__ == "__main__":
@@ -199,7 +199,7 @@ async def query_leann_index(index_path: str, query: str):
    query: The query string
"""
print(f"\n[PHASE 2] Starting Leann chat session...")
chat = LeannChat(index_path=index_path, llm_config={"type": "hf", "model": "Qwen/Qwen3-0.6B"})
chat = LeannChat(index_path=index_path)

print(f"You: {query}")
chat_response = chat.ask(
@@ -270,8 +270,6 @@ async def main():
# Example queries
queries = [
    "What websites did I visit about machine learning?",
    "Show me my recent shopping history",
    "What news sites did I visit this week?",
    "Find my search history about programming"
]

@@ -680,7 +680,7 @@ def create_hnsw_embedding_server(
    f"ERROR: Passage ID {nid} not found in passages dict"
)
print(
    f"ERROR: Available passage IDs: {list(passages.keys())[:10]}..."
    f"ERROR: Available passage IDs: {list(passages.keys())}..."
)
raise RuntimeError(
    f"FATAL: Passage with ID {nid} not found"
@@ -290,7 +290,7 @@ class LeannSearcher:
    )
)
print(
    f" {i + 1}. passage_id='{string_id}' -> SUCCESS: {passage_data['text'][:60]}..."
    f" {i + 1}. passage_id='{string_id}' -> SUCCESS: {passage_data['text']}..."
)
except KeyError:
    print(