fix: faster embed

2025-11-24 05:30:11 +00:00
parent 66c6aad3e4
commit 36c44b8806
4 changed files with 110 additions and 95 deletions
--- a/issue_159.py
+++ b/issue_159.py
@@ -9,120 +9,125 @@ Configuration:
 - backend: hnsw
 """

-import time
 import os
+import time
 from pathlib import Path
+
 from leann.api import LeannBuilder, LeannSearcher

+os.environ["LEANN_LOG_LEVEL"] = "DEBUG"
+
 # Configuration matching the issue
 INDEX_PATH = "./test_issue_159.leann"
 EMBEDDING_MODEL = "BAAI/bge-large-zh-v1.5"
 BACKEND_NAME = "hnsw"
-BEAM_WIDTH = 10  # Note: beam_width is mainly for DiskANN, not HNSW
+

 def generate_test_data(num_chunks=90000, chunk_size=2000):
    """Generate test data similar to 180MB text (~90K chunks)"""
    # Each chunk is approximately 2000 characters
    # 90K chunks * 2000 chars ≈ 180MB
    chunks = []
-    base_text = "这是一个测试文档。LEANN是一个创新的向量数据库，通过图基选择性重计算实现97%的存储节省。"
-    
+    base_text = (
+        "这是一个测试文档。LEANN是一个创新的向量数据库，通过图基选择性重计算实现97%的存储节省。"
+    )
+
    for i in range(num_chunks):
        chunk = f"{base_text} 文档编号: {i}. " * (chunk_size // len(base_text) + 1)
        chunks.append(chunk[:chunk_size])
-    
+
    return chunks

+
 def test_search_performance():
    """Test search performance with different configurations"""
    print("=" * 80)
    print("Testing LEANN Search Performance (Issue #159)")
    print("=" * 80)
-    
-    # Check if index exists - skip build if it does
-    index_path = Path(INDEX_PATH)
-    if True:
+
+    meta_path = Path(f"{INDEX_PATH}.meta.json")
+    if meta_path.exists():
        print(f"\n✓ Index already exists at {INDEX_PATH}")
        print("  Skipping build phase. Delete the index to rebuild.")
    else:
-        print(f"\n📦 Building index...")
+        print("\n📦 Building index...")
        print(f"  Backend: {BACKEND_NAME}")
        print(f"  Embedding Model: {EMBEDDING_MODEL}")
-        print(f"  Generating test data (~90K chunks, ~180MB)...")
-        
+        print("  Generating test data (~90K chunks, ~180MB)...")
+
        chunks = generate_test_data(num_chunks=90000)
        print(f"  Generated {len(chunks)} chunks")
-        print(f"  Total text size: {sum(len(c) for c in chunks) / (1024*1024):.2f} MB")
-        
+        print(f"  Total text size: {sum(len(c) for c in chunks) / (1024 * 1024):.2f} MB")
+
        builder = LeannBuilder(
            backend_name=BACKEND_NAME,
            embedding_model=EMBEDDING_MODEL,
        )
-        
-        print(f"  Adding chunks to builder...")
+
+        print("  Adding chunks to builder...")
        start_time = time.time()
        for i, chunk in enumerate(chunks):
            builder.add_text(chunk)
            if (i + 1) % 10000 == 0:
                print(f"    Added {i + 1}/{len(chunks)} chunks...")
-        
-        print(f"  Building index...")
+
+        print("  Building index...")
        build_start = time.time()
        builder.build_index(INDEX_PATH)
        build_time = time.time() - build_start
        print(f"  ✓ Index built in {build_time:.2f} seconds")
-    
+
    # Test search with different complexity values
-    print(f"\n🔍 Testing search performance...")
+    print("\n🔍 Testing search performance...")
    searcher = LeannSearcher(INDEX_PATH)
-    
+
    test_query = "LEANN向量数据库存储优化"
-    
+
    # Test with default complexity (64)
-    print(f"\n  Test 1: Default complexity (64) `1 ")
+    print("\n  Test 1: Default complexity (64) `1 ")
    print(f"    Query: '{test_query}'")
    start_time = time.time()
-    results = searcher.search(test_query, top_k=10, complexity=64, beam_width=BEAM_WIDTH)
+    results = searcher.search(test_query, top_k=10, complexity=64)
    search_time = time.time() - start_time
    print(f"    ✓ Search completed in {search_time:.2f} seconds")
    print(f"    Results: {len(results)} items")
-    
+
    # Test with default complexity (64)
-    print(f"\n  Test 1: Default complexity (64)")
+    print("\n  Test 1: Default complexity (64)")
    print(f"    Query: '{test_query}'")
    start_time = time.time()
-    results = searcher.search(test_query, top_k=10, complexity=64, beam_width=BEAM_WIDTH)
+    results = searcher.search(test_query, top_k=10, complexity=64)
    search_time = time.time() - start_time
    print(f"    ✓ Search completed in {search_time:.2f} seconds")
    print(f"    Results: {len(results)} items")
-    
+
    # Test with lower complexity (32)
-    print(f"\n  Test 2: Lower complexity (32)")
+    print("\n  Test 2: Lower complexity (32)")
    print(f"    Query: '{test_query}'")
    start_time = time.time()
-    results = searcher.search(test_query, top_k=10, complexity=32, beam_width=BEAM_WIDTH)
+    results = searcher.search(test_query, top_k=10, complexity=32)
    search_time = time.time() - start_time
    print(f"    ✓ Search completed in {search_time:.2f} seconds")
    print(f"    Results: {len(results)} items")
-    
+
    # Test with even lower complexity (16)
-    print(f"\n  Test 3: Lower complexity (16)")
+    print("\n  Test 3: Lower complexity (16)")
    print(f"    Query: '{test_query}'")
    start_time = time.time()
-    results = searcher.search(test_query, top_k=10, complexity=16, beam_width=BEAM_WIDTH)
+    results = searcher.search(test_query, top_k=10, complexity=16)
    search_time = time.time() - start_time
    print(f"    ✓ Search completed in {search_time:.2f} seconds")
    print(f"    Results: {len(results)} items")
-    
+
    # Test with minimal complexity (8)
-    print(f"\n  Test 4: Minimal complexity (8)")
+    print("\n  Test 4: Minimal complexity (8)")
    print(f"    Query: '{test_query}'")
    start_time = time.time()
-    results = searcher.search(test_query, top_k=10, complexity=8, beam_width=BEAM_WIDTH)
+    results = searcher.search(test_query, top_k=10, complexity=8)
    search_time = time.time() - start_time
    print(f"    ✓ Search completed in {search_time:.2f} seconds")
    print(f"    Results: {len(results)} items")
-    
+
    print("\n" + "=" * 80)
    print("Performance Analysis:")
    print("=" * 80)
@@ -139,6 +144,6 @@ def test_search_performance():
    print("- Consider using DiskANN backend for better performance on large datasets")
    print("- Or use a smaller embedding model if speed is critical")

+
 if __name__ == "__main__":
    test_search_performance()
-