diff --git a/issue_159.py b/issue_159.py
new file mode 100644
index 0000000..f3d2bf2
--- /dev/null
+++ b/issue_159.py
@@ -0,0 +1,134 @@
+#!/usr/bin/env python3
+"""
+Test script to reproduce issue #159: slow search performance.
+
+Configuration from the issue:
+- GPU: 1x RTX 4090
+- embedding_model: BAAI/bge-large-zh-v1.5
+- data size: ~180 MB of text (~90K chunks)
+- beam_width: 10 (mainly a DiskANN parameter, not HNSW)
+- backend: hnsw
+"""
+
+import time
+from pathlib import Path
+
+from leann.api import LeannBuilder, LeannSearcher
+
+# Configuration matching the issue
+INDEX_PATH = "./test_issue_159.leann"
+EMBEDDING_MODEL = "BAAI/bge-large-zh-v1.5"
+BACKEND_NAME = "hnsw"
+BEAM_WIDTH = 10  # Note: beam_width is mainly for DiskANN, not HNSW
+
+
+def generate_test_data(num_chunks=90000, chunk_size=2000):
+    """Generate test data comparable to ~180 MB of text (~90K chunks)."""
+    # Each chunk is approximately 2000 characters:
+    # 90K chunks * 2000 chars ≈ 180 MB
+    chunks = []
+    # Chinese sample text (roughly: "This is a test document. LEANN is an
+    # innovative vector database that achieves 97% storage savings through
+    # graph-based selective recomputation.")
+    base_text = "这是一个测试文档。LEANN是一个创新的向量数据库,通过图基选择性重计算实现97%的存储节省。"
+
+    for i in range(num_chunks):
+        # "文档编号" = "document number"
+        chunk = f"{base_text} 文档编号: {i}. " * (chunk_size // len(base_text) + 1)
+        chunks.append(chunk[:chunk_size])
+
+    return chunks
+
+
+def test_search_performance():
+    """Test search performance with different configurations."""
+    print("=" * 80)
+    print("Testing LEANN Search Performance (Issue #159)")
+    print("=" * 80)
+
+    # Build the index only if it does not already exist
+    if Path(INDEX_PATH).exists():
+        print(f"\n✓ Index already exists at {INDEX_PATH}")
+        print("  Skipping build phase. Delete the index to rebuild.")
+    else:
+        print("\n📦 Building index...")
+        print(f"   Backend: {BACKEND_NAME}")
+        print(f"   Embedding Model: {EMBEDDING_MODEL}")
+        print("   Generating test data (~90K chunks, ~180MB)...")
+
+        chunks = generate_test_data(num_chunks=90000)
+        print(f"   Generated {len(chunks)} chunks")
+        print(f"   Total text size: {sum(len(c) for c in chunks) / (1024 * 1024):.2f} MB")
+
+        builder = LeannBuilder(
+            backend_name=BACKEND_NAME,
+            embedding_model=EMBEDDING_MODEL,
+        )
+
+        print("   Adding chunks to builder...")
+        add_start = time.time()
+        for i, chunk in enumerate(chunks):
+            builder.add_text(chunk)
+            if (i + 1) % 10000 == 0:
+                print(f"   Added {i + 1}/{len(chunks)} chunks...")
+        print(f"   ✓ Chunks added in {time.time() - add_start:.2f} seconds")
+
+        print("   Building index...")
+        build_start = time.time()
+        builder.build_index(INDEX_PATH)
+        build_time = time.time() - build_start
+        print(f"   ✓ Index built in {build_time:.2f} seconds")
+
+    # Test search with different complexity values
+    print("\n🔍 Testing search performance...")
+    searcher = LeannSearcher(INDEX_PATH)
+
+    # Chinese query (roughly: "LEANN vector database storage optimization")
+    test_query = "LEANN向量数据库存储优化"
+
+    # Test with the default complexity (64)
+    print("\n  Test 1: Default complexity (64)")
+    print(f"  Query: '{test_query}'")
+    start_time = time.time()
+    results = searcher.search(test_query, top_k=10, complexity=64, beam_width=BEAM_WIDTH)
+    search_time = time.time() - start_time
+    print(f"  ✓ Search completed in {search_time:.2f} seconds")
+    print(f"  Results: {len(results)} items")
+
+    # Test with lower complexity (32)
+    print("\n  Test 2: Lower complexity (32)")
+    print(f"  Query: '{test_query}'")
+    start_time = time.time()
+    results = searcher.search(test_query, top_k=10, complexity=32, beam_width=BEAM_WIDTH)
+    search_time = time.time() - start_time
+    print(f"  ✓ Search completed in {search_time:.2f} seconds")
+    print(f"  Results: {len(results)} items")
+
+    # Test with even lower complexity (16)
+    print("\n  Test 3: Lower complexity (16)")
+    print(f"  Query: '{test_query}'")
+    start_time = time.time()
+    results = searcher.search(test_query, top_k=10, complexity=16, beam_width=BEAM_WIDTH)
+    search_time = time.time() - start_time
+    print(f"  ✓ Search completed in {search_time:.2f} seconds")
+    print(f"  Results: {len(results)} items")
+
+    # Test with minimal complexity (8)
+    print("\n  Test 4: Minimal complexity (8)")
+    print(f"  Query: '{test_query}'")
+    start_time = time.time()
+    results = searcher.search(test_query, top_k=10, complexity=8, beam_width=BEAM_WIDTH)
+    search_time = time.time() - start_time
+    print(f"  ✓ Search completed in {search_time:.2f} seconds")
+    print(f"  Results: {len(results)} items")
+
+    print("\n" + "=" * 80)
+    print("Performance Analysis:")
+    print("=" * 80)
+    print("\nKey Findings:")
+    print("1. The beam_width parameter is mainly for the DiskANN backend, not HNSW")
+    print("2. For HNSW, the main parameter affecting search speed is 'complexity'")
+    print("3. Lower complexity values (16-32) should provide faster search")
+    print("4. The paper mentions ~2 seconds, which likely uses:")
+    print("   - A smaller embedding model (~100M params vs ~300M for bge-large)")
+    print("   - Lower complexity (16-32)")
+    print("   - Possibly the DiskANN backend for better performance")
+    print("\nRecommendations:")
+    print("- Try complexity=16 or complexity=32 for faster search")
+    print("- Consider using the DiskANN backend for better performance on large datasets")
+    print("- Or use a smaller embedding model if speed is critical")
+
+
+if __name__ == "__main__":
+    test_search_performance()
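+
+
+# ---------------------------------------------------------------------------
+# Optional sketch of the "Recommendations" printed above: lower complexity,
+# the DiskANN backend, and a smaller Chinese embedding model. This is a
+# minimal illustration only and is NOT exercised by the reproduction run;
+# it reuses the same LeannBuilder/LeannSearcher calls as the script, but the
+# backend name "diskann" and the model "BAAI/bge-base-zh-v1.5" (~100M params)
+# are assumptions, not values confirmed by issue #159. It is defined after
+# the __main__ guard so it never runs unless called manually.
+# ---------------------------------------------------------------------------
+def sketch_faster_configuration(index_path="./test_issue_159_diskann.leann"):
+    """Sketch: DiskANN backend + smaller embedding model + lower complexity."""
+    if not Path(index_path).exists():
+        builder = LeannBuilder(
+            backend_name="diskann",                    # assumed backend name
+            embedding_model="BAAI/bge-base-zh-v1.5",   # assumed smaller model
+        )
+        for chunk in generate_test_data(num_chunks=90000):
+            builder.add_text(chunk)
+        builder.build_index(index_path)
+
+    searcher = LeannSearcher(index_path)
+    start = time.time()
+    results = searcher.search(
+        "LEANN向量数据库存储优化",
+        top_k=10,
+        complexity=32,          # lower complexity, per the recommendations above
+        beam_width=BEAM_WIDTH,  # beam_width actually applies to DiskANN
+    )
+    print(f"DiskANN sketch: {time.time() - start:.2f} s, {len(results)} results")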