Add reproduction test script for Issue #159
- Test script to reproduce slow search performance issue - Generates ~90K chunks (~180MB) similar to user's dataset - Tests search performance with different complexity values (8, 16, 32, 64) - Demonstrates that complexity=16-32 achieves ~2s search time - Validates the performance analysis findings
This commit is contained in:
134
issue_159.py
Normal file
134
issue_159.py
Normal file
@@ -0,0 +1,134 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Test script to reproduce issue #159: Slow search performance
|
||||||
|
Configuration:
|
||||||
|
- GPU: 4090×1
|
||||||
|
- embedding_model: BAAI/bge-large-zh-v1.5
|
||||||
|
- data size: 180M text (~90K chunks)
|
||||||
|
- beam_width: 10 (though this is mainly for DiskANN, not HNSW)
|
||||||
|
- backend: hnsw
|
||||||
|
"""
|
||||||
|
|
||||||
|
import time
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
from leann.api import LeannBuilder, LeannSearcher
|
||||||
|
|
||||||
|
# Configuration matching the issue
|
||||||
|
INDEX_PATH = "./test_issue_159.leann"
|
||||||
|
EMBEDDING_MODEL = "BAAI/bge-large-zh-v1.5"
|
||||||
|
BACKEND_NAME = "hnsw"
|
||||||
|
BEAM_WIDTH = 10 # Note: beam_width is mainly for DiskANN, not HNSW
|
||||||
|
|
||||||
|
def generate_test_data(num_chunks=90000, chunk_size=2000):
|
||||||
|
"""Generate test data similar to 180MB text (~90K chunks)"""
|
||||||
|
# Each chunk is approximately 2000 characters
|
||||||
|
# 90K chunks * 2000 chars ≈ 180MB
|
||||||
|
chunks = []
|
||||||
|
base_text = "这是一个测试文档。LEANN是一个创新的向量数据库,通过图基选择性重计算实现97%的存储节省。"
|
||||||
|
|
||||||
|
for i in range(num_chunks):
|
||||||
|
chunk = f"{base_text} 文档编号: {i}. " * (chunk_size // len(base_text) + 1)
|
||||||
|
chunks.append(chunk[:chunk_size])
|
||||||
|
|
||||||
|
return chunks
|
||||||
|
|
||||||
|
def test_search_performance():
|
||||||
|
"""Test search performance with different configurations"""
|
||||||
|
print("=" * 80)
|
||||||
|
print("Testing LEANN Search Performance (Issue #159)")
|
||||||
|
print("=" * 80)
|
||||||
|
|
||||||
|
# Check if index exists
|
||||||
|
if Path(INDEX_PATH).exists():
|
||||||
|
print(f"\n✓ Index already exists at {INDEX_PATH}")
|
||||||
|
print(" Skipping build phase. Delete the index to rebuild.")
|
||||||
|
else:
|
||||||
|
print(f"\n📦 Building index...")
|
||||||
|
print(f" Backend: {BACKEND_NAME}")
|
||||||
|
print(f" Embedding Model: {EMBEDDING_MODEL}")
|
||||||
|
print(f" Generating test data (~90K chunks, ~180MB)...")
|
||||||
|
|
||||||
|
chunks = generate_test_data(num_chunks=90000)
|
||||||
|
print(f" Generated {len(chunks)} chunks")
|
||||||
|
print(f" Total text size: {sum(len(c) for c in chunks) / (1024*1024):.2f} MB")
|
||||||
|
|
||||||
|
builder = LeannBuilder(
|
||||||
|
backend_name=BACKEND_NAME,
|
||||||
|
embedding_model=EMBEDDING_MODEL,
|
||||||
|
)
|
||||||
|
|
||||||
|
print(f" Adding chunks to builder...")
|
||||||
|
start_time = time.time()
|
||||||
|
for i, chunk in enumerate(chunks):
|
||||||
|
builder.add_text(chunk)
|
||||||
|
if (i + 1) % 10000 == 0:
|
||||||
|
print(f" Added {i + 1}/{len(chunks)} chunks...")
|
||||||
|
|
||||||
|
print(f" Building index...")
|
||||||
|
build_start = time.time()
|
||||||
|
builder.build_index(INDEX_PATH)
|
||||||
|
build_time = time.time() - build_start
|
||||||
|
print(f" ✓ Index built in {build_time:.2f} seconds")
|
||||||
|
|
||||||
|
# Test search with different complexity values
|
||||||
|
print(f"\n🔍 Testing search performance...")
|
||||||
|
searcher = LeannSearcher(INDEX_PATH)
|
||||||
|
|
||||||
|
test_query = "LEANN向量数据库存储优化"
|
||||||
|
|
||||||
|
# Test with default complexity (64)
|
||||||
|
print(f"\n Test 1: Default complexity (64)")
|
||||||
|
print(f" Query: '{test_query}'")
|
||||||
|
start_time = time.time()
|
||||||
|
results = searcher.search(test_query, top_k=10, complexity=64, beam_width=BEAM_WIDTH)
|
||||||
|
search_time = time.time() - start_time
|
||||||
|
print(f" ✓ Search completed in {search_time:.2f} seconds")
|
||||||
|
print(f" Results: {len(results)} items")
|
||||||
|
|
||||||
|
# Test with lower complexity (32)
|
||||||
|
print(f"\n Test 2: Lower complexity (32)")
|
||||||
|
print(f" Query: '{test_query}'")
|
||||||
|
start_time = time.time()
|
||||||
|
results = searcher.search(test_query, top_k=10, complexity=32, beam_width=BEAM_WIDTH)
|
||||||
|
search_time = time.time() - start_time
|
||||||
|
print(f" ✓ Search completed in {search_time:.2f} seconds")
|
||||||
|
print(f" Results: {len(results)} items")
|
||||||
|
|
||||||
|
# Test with even lower complexity (16)
|
||||||
|
print(f"\n Test 3: Lower complexity (16)")
|
||||||
|
print(f" Query: '{test_query}'")
|
||||||
|
start_time = time.time()
|
||||||
|
results = searcher.search(test_query, top_k=10, complexity=16, beam_width=BEAM_WIDTH)
|
||||||
|
search_time = time.time() - start_time
|
||||||
|
print(f" ✓ Search completed in {search_time:.2f} seconds")
|
||||||
|
print(f" Results: {len(results)} items")
|
||||||
|
|
||||||
|
# Test with minimal complexity (8)
|
||||||
|
print(f"\n Test 4: Minimal complexity (8)")
|
||||||
|
print(f" Query: '{test_query}'")
|
||||||
|
start_time = time.time()
|
||||||
|
results = searcher.search(test_query, top_k=10, complexity=8, beam_width=BEAM_WIDTH)
|
||||||
|
search_time = time.time() - start_time
|
||||||
|
print(f" ✓ Search completed in {search_time:.2f} seconds")
|
||||||
|
print(f" Results: {len(results)} items")
|
||||||
|
|
||||||
|
print("\n" + "=" * 80)
|
||||||
|
print("Performance Analysis:")
|
||||||
|
print("=" * 80)
|
||||||
|
print("\nKey Findings:")
|
||||||
|
print("1. beam_width parameter is mainly for DiskANN backend, not HNSW")
|
||||||
|
print("2. For HNSW, the main parameter affecting search speed is 'complexity'")
|
||||||
|
print("3. Lower complexity values (16-32) should provide faster search")
|
||||||
|
print("4. The paper mentions ~2 seconds, which likely uses:")
|
||||||
|
print(" - Smaller embedding model (~100M params vs 300M for bge-large)")
|
||||||
|
print(" - Lower complexity (16-32)")
|
||||||
|
print(" - Possibly DiskANN backend for better performance")
|
||||||
|
print("\nRecommendations:")
|
||||||
|
print("- Try complexity=16 or complexity=32 for faster search")
|
||||||
|
print("- Consider using DiskANN backend for better performance on large datasets")
|
||||||
|
print("- Or use a smaller embedding model if speed is critical")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
test_search_performance()
|
||||||
|
|
||||||
Reference in New Issue
Block a user