clean dict
@@ -373,7 +373,7 @@ The script will print the recall and search time for each query, followed by the
 | System | DPR(2.1M docs) | RPJ-wiki(60M docs) | Chat history(5K messages) |
 | --------------------- | ---------------- | ---------------- | ---------------- |
-| Traditional Vector DB | 3.8 GB | 201 GB | **22.8 MB** |
+| Traditional Vector DB | 3.8 GB | 201 GB | 22.8 MB |
 | **LEANN** | **324 MB** | **6 GB** | **0.78 MB** |
 | **Reduction** | **91% smaller** | **97% smaller** | **97% smaller** |
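For context, the "Reduction" row follows directly from the sizes in the table above. A minimal, stand-alone sketch of that arithmetic (illustrative only, not part of the repository; assumes decimal units, 1 GB = 1000 MB):

```python
# Illustrative only: recompute the "Reduction" row as 1 - LEANN size / traditional size.
sizes_mb = {
    "DPR (2.1M docs)": (3.8 * 1000, 324),            # (traditional vector DB, LEANN) in MB
    "RPJ-wiki (60M docs)": (201 * 1000, 6 * 1000),
    "Chat history (5K messages)": (22.8, 0.78),
}

for name, (traditional_mb, leann_mb) in sizes_mb.items():
    reduction = 1 - leann_mb / traditional_mb
    print(f"{name}: {reduction:.0%} smaller")        # ≈ 91%, 97%, 97%
```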
BIN tests/.DS_Store (vendored)
Binary file not shown.
@@ -1,107 +0,0 @@
#!/usr/bin/env python3
"""
DiskANN distance function test
"""

import os
from pathlib import Path
import shutil
import time

# Import the backend packages to trigger plugin registration
try:
    import leann_backend_diskann
    import leann_backend_hnsw
    print("INFO: Backend packages imported successfully.")
except ImportError as e:
    print(f"WARNING: Could not import backend packages. Error: {e}")

# Import the high-level API from leann-core
from leann.api import LeannBuilder, LeannSearcher


def load_sample_documents():
    """Create sample documents for the demo."""
    docs = [
        {"title": "Intro to Python", "content": "Python is a programming language for machine learning"},
        {"title": "ML Basics", "content": "Machine learning algorithms build intelligent systems"},
        {"title": "Data Structures", "content": "Data structures like arrays and graphs organize information"},
    ]
    return docs


def test_distance_function(distance_func, test_name):
    """Test a specific distance function."""
    print(f"\n=== Testing {test_name} ({distance_func}) ===")

    INDEX_DIR = Path(f"./test_indices_{distance_func}")
    INDEX_PATH = str(INDEX_DIR / "documents.diskann")

    if INDEX_DIR.exists():
        shutil.rmtree(INDEX_DIR)

    # Build the index
    print(f"Building index (distance function: {distance_func})...")
    builder = LeannBuilder(
        backend_name="diskann",
        distance_metric=distance_func,
        graph_degree=16,
        complexity=32
    )

    documents = load_sample_documents()
    for doc in documents:
        builder.add_text(doc["content"], metadata=doc)

    try:
        builder.build_index(INDEX_PATH)
        print("✅ Index built successfully")

        # Test search
        searcher = LeannSearcher(INDEX_PATH, distance_metric=distance_func)
        results = searcher.search("machine learning programming", top_k=2)

        print("Search results:")
        for i, result in enumerate(results):
            print(f"  {i+1}. Score: {result['score']:.4f}")
            print(f"     Text: {result['text'][:50]}...")

        return True

    except Exception as e:
        print(f"❌ Test failed: {e}")
        return False


def main():
    print("🔍 DiskANN distance function test")
    print("=" * 50)

    # Test the different distance functions
    distance_tests = [
        ("mips", "Maximum Inner Product Search"),
        ("l2", "L2 Euclidean Distance"),
        ("cosine", "Cosine Similarity")
    ]

    results = {}
    for distance_func, test_name in distance_tests:
        try:
            success = test_distance_function(distance_func, test_name)
            results[distance_func] = success
        except Exception as e:
            print(f"❌ {distance_func} test raised an exception: {e}")
            results[distance_func] = False

    # Summary
    print("\n" + "=" * 50)
    print("📊 Test result summary:")
    for distance_func, success in results.items():
        status = "✅ passed" if success else "❌ failed"
        print(f"  {distance_func:10s}: {status}")

    print("\n🎉 Tests complete!")


if __name__ == "__main__":
    main()
@@ -1,127 +0,0 @@
#!/usr/bin/env python3
"""
Verify that DiskANN L2 distance actually works
"""

import numpy as np
from pathlib import Path
import shutil

# Import the backend package to trigger plugin registration
try:
    import leann_backend_diskann
    print("INFO: Backend packages imported successfully.")
except ImportError as e:
    print(f"WARNING: Could not import backend packages. Error: {e}")

from leann.api import LeannBuilder, LeannSearcher


def test_l2_verification():
    """Verify that the L2 distance is actually being used."""
    print("=== Verifying the DiskANN L2 distance implementation ===")

    INDEX_DIR = Path("./test_l2_verification")
    INDEX_PATH = str(INDEX_DIR / "documents.diskann")

    if INDEX_DIR.exists():
        shutil.rmtree(INDEX_DIR)

    # Craft test documents so that L2 and cosine can produce different results
    documents = [
        "machine learning artificial intelligence",   # document 0
        "computer programming software development",  # document 1
        "data science analytics statistics"           # document 2
    ]

    print("Building index...")
    builder = LeannBuilder(
        backend_name="diskann",
        distance_metric="l2",  # explicitly request L2
        graph_degree=16,
        complexity=32
    )

    for i, doc in enumerate(documents):
        builder.add_text(doc, metadata={"id": i, "text": doc})

    builder.build_index(INDEX_PATH)
    print("✅ Index built")

    # Test search
    searcher = LeannSearcher(INDEX_PATH, distance_metric="l2")

    # Use a query that is very similar to document 0
    query = "machine learning AI technology"
    results = searcher.search(query, top_k=3)

    print(f"\nQuery: '{query}'")
    print("L2 distance search results:")
    for i, result in enumerate(results):
        print(f"  {i+1}. ID:{result['id']}, Score:{result['score']:.6f}")
        print(f"     Text: {result['text']}")

    # Now re-run the same data with cosine
    print("\n--- Comparison run with cosine distance ---")

    INDEX_DIR_COS = Path("./test_cosine_verification")
    INDEX_PATH_COS = str(INDEX_DIR_COS / "documents.diskann")

    if INDEX_DIR_COS.exists():
        shutil.rmtree(INDEX_DIR_COS)

    builder_cos = LeannBuilder(
        backend_name="diskann",
        distance_metric="cosine",  # use cosine
        graph_degree=16,
        complexity=32
    )

    for i, doc in enumerate(documents):
        builder_cos.add_text(doc, metadata={"id": i, "text": doc})

    builder_cos.build_index(INDEX_PATH_COS)

    searcher_cos = LeannSearcher(INDEX_PATH_COS, distance_metric="cosine")
    results_cos = searcher_cos.search(query, top_k=3)

    print("Cosine distance search results:")
    for i, result in enumerate(results_cos):
        print(f"  {i+1}. ID:{result['id']}, Score:{result['score']:.6f}")
        print(f"     Text: {result['text']}")

    # Compare the two runs
    print("\n--- Result comparison ---")
    print("L2 scores are squared Euclidean distances: smaller means more similar")
    print("Cosine scores are negated cosine similarities: smaller means more similar")

    l2_top = results[0]
    cos_top = results_cos[0]

    print(f"Best L2 match: ID{l2_top['id']}, Score={l2_top['score']:.6f}")
    print(f"Best cosine match: ID{cos_top['id']}, Score={cos_top['score']:.6f}")

    if l2_top['id'] == cos_top['id']:
        print("✅ Both distance functions return the same best match")
    else:
        print("⚠️ The two distance functions return different best matches - evidence that they really use different distance computations")

    # Sanity-check the score ranges
    l2_scores = [r['score'] for r in results]
    cos_scores = [r['score'] for r in results_cos]

    print(f"L2 score range: {min(l2_scores):.6f} to {max(l2_scores):.6f}")
    print(f"Cosine score range: {min(cos_scores):.6f} to {max(cos_scores):.6f}")

    # L2 scores should be non-negative; cosine scores should lie between -1 and 0 (negated similarity)
    if all(score >= 0 for score in l2_scores):
        print("✅ All L2 scores are non-negative, as expected")
    else:
        print("❌ Some L2 scores are negative, which may indicate a problem")

    if all(-1 <= score <= 0 for score in cos_scores):
        print("✅ Cosine scores are within the expected range")
    else:
        print(f"⚠️ Cosine scores fall outside the expected range: {cos_scores}")


if __name__ == "__main__":
    test_l2_verification()
@@ -1,190 +0,0 @@
#!/usr/bin/env python3
"""
Sanity check script for Leann DiskANN backend
Tests different distance functions and embedding models
"""

import os
import numpy as np
from pathlib import Path
import shutil
import time

# Make the local packages importable, then import them to trigger plugin registration
import sys
sys.path.append('packages/leann-core/src')
sys.path.append('packages/leann-backend-diskann')
sys.path.append('packages/leann-backend-hnsw')

try:
    import leann_backend_diskann
    import leann_backend_hnsw
    print("INFO: Backend packages imported successfully.")
except ImportError as e:
    print(f"WARNING: Could not import backend packages. Error: {e}")

# Import the high-level API from leann-core
from leann.api import LeannBuilder, LeannSearcher


def test_distance_functions():
    """Test the different distance functions."""
    print("\n=== Testing distance functions ===")

    # Test data
    documents = [
        "Machine learning is a powerful technology",
        "Deep learning uses neural networks",
        "Artificial intelligence transforms industries"
    ]

    distance_functions = ["mips", "l2", "cosine"]

    for distance_func in distance_functions:
        print(f"\n[Testing the {distance_func} distance function]")
        try:
            index_path = f"test_indices/test_{distance_func}.diskann"
            if Path(index_path).parent.exists():
                shutil.rmtree(Path(index_path).parent)

            # Build the index
            builder = LeannBuilder(
                backend_name="diskann",
                distance_metric=distance_func,
                graph_degree=16,
                complexity=32
            )

            for doc in documents:
                builder.add_text(doc)

            builder.build_index(index_path)

            # Test search
            searcher = LeannSearcher(index_path, distance_metric=distance_func)
            results = searcher.search("neural network technology", top_k=2)

            print(f"✅ The {distance_func} distance function works correctly")
            for i, result in enumerate(results):
                print(f"  {i+1}. Score: {result['score']:.4f}, Text: {result['text'][:50]}...")

        except Exception as e:
            print(f"❌ The {distance_func} distance function failed: {e}")


def test_embedding_models():
    """Test different embedding models."""
    print("\n=== Testing embedding models ===")

    documents = ["AI is transforming the world", "Technology advances rapidly"]

    # Embedding models to test
    models_to_test = [
        "sentence-transformers/all-mpnet-base-v2",
        "sentence-transformers/all-MiniLM-L6-v2",
        # "sentence-transformers/distilbert-base-nli-mean-tokens",  # may not exist
    ]

    for model_name in models_to_test:
        print(f"\n[Testing {model_name}]")
        try:
            index_path = "test_indices/test_model.diskann"
            if Path(index_path).parent.exists():
                shutil.rmtree(Path(index_path).parent)

            # Build the index
            builder = LeannBuilder(
                backend_name="diskann",
                embedding_model=model_name,
                distance_metric="cosine"
            )

            for doc in documents:
                builder.add_text(doc)

            builder.build_index(index_path)

            # Test search
            searcher = LeannSearcher(index_path)
            results = searcher.search("artificial intelligence", top_k=1)

            print(f"✅ The {model_name} model works correctly")
            print(f"   Result: {results[0]['text'][:50]}...")

        except Exception as e:
            print(f"❌ The {model_name} model failed: {e}")


def test_search_correctness():
    """Verify the correctness of search results."""
    print("\n=== Verifying search result correctness ===")

    # Create test documents with clear topical relevance
    documents = [
        "Python is a programming language used for machine learning",  # programming-related
        "Dogs are loyal pets that love to play fetch",                  # animal-related
        "Machine learning algorithms can predict future trends",        # ML-related
        "Cats are independent animals that sleep a lot",                # animal-related
        "Deep learning neural networks process complex data"            # ML-related
    ]

    try:
        index_path = "test_indices/correctness_test.diskann"
        if Path(index_path).parent.exists():
            shutil.rmtree(Path(index_path).parent)

        # Build the index
        builder = LeannBuilder(
            backend_name="diskann",
            distance_metric="cosine"
        )

        for doc in documents:
            builder.add_text(doc)

        builder.build_index(index_path)

        # Run relevance queries
        searcher = LeannSearcher(index_path)

        test_queries = [
            ("machine learning programming", [0, 2, 4]),  # should return the ML-related documents
            ("pet animals behavior", [1, 3]),             # should return the animal-related documents
        ]

        for query, expected_topics in test_queries:
            print(f"\nQuery: '{query}'")
            results = searcher.search(query, top_k=3)

            print("Search results:")
            for i, result in enumerate(results):
                print(f"  {i+1}. ID:{result['id']}, Score:{result['score']:.4f}")
                print(f"     Text: {result['text'][:60]}...")

            # Simple check: are the top two results within the expected set?
            top_ids = [result['id'] for result in results[:2]]
            relevant_found = any(id in expected_topics for id in top_ids)

            if relevant_found:
                print("✅ Search results are topically relevant")
            else:
                print("⚠️ Search result relevance may be off")

    except Exception as e:
        print(f"❌ Correctness test failed: {e}")


def main():
    print("🔍 Leann DiskANN Sanity Check")
    print("=" * 50)

    # Clean up old test data
    if Path("test_indices").exists():
        shutil.rmtree("test_indices")

    # Run the tests
    test_distance_functions()
    test_embedding_models()
    test_search_correctness()

    print("\n" + "=" * 50)
    print("🎉 Sanity check complete!")


if __name__ == "__main__":
    main()