clean dict

This commit is contained in:
yichuan520030910320
2025-07-15 22:30:52 -07:00
parent b1c93fe178
commit 4a2cb914d7
8 changed files with 1 additions and 425 deletions

View File

@@ -373,7 +373,7 @@ The script will print the recall and search time for each query, followed by the
| System | DPR(2.1M docs) | RPJ-wiki(60M docs) | Chat history(5K messages) |
| --------------------- | ---------------- | ---------------- | ---------------- |
| Traditional Vector DB | 3.8 GB | 201 GB | **22.8 MB** |
| Traditional Vector DB | 3.8 GB | 201 GB | 22.8 MB |
| **LEANN** | **324 MB** | **6 GB** | **0.78 MB** |
| **Reduction** | **91% smaller** | **97% smaller** | **97% smaller** |

BIN
tests/.DS_Store vendored
View File

Binary file not shown.

View File

@@ -1,107 +0,0 @@
#!/usr/bin/env python3
"""
DiskANN 距离函数测试
"""
import os
from pathlib import Path
import shutil
import time
# 导入后端包以触发插件注册
try:
import leann_backend_diskann
import leann_backend_hnsw
print("INFO: Backend packages imported successfully.")
except ImportError as e:
print(f"WARNING: Could not import backend packages. Error: {e}")
# 从 leann-core 导入上层 API
from leann.api import LeannBuilder, LeannSearcher
def load_sample_documents():
    """Return the small fixed corpus used by the distance-function demo.

    Returns:
        A freshly built list of three dicts, each holding a ``"title"``
        and a ``"content"`` string.
    """
    return [
        {"title": "Intro to Python", "content": "Python is a programming language for machine learning"},
        {"title": "ML Basics", "content": "Machine learning algorithms build intelligent systems"},
        {"title": "Data Structures", "content": "Data structures like arrays and graphs organize information"},
    ]
def test_distance_function(distance_func, test_name):
    """Build a DiskANN index with one distance metric and run a sample query.

    Args:
        distance_func: Metric name passed as ``distance_metric`` to both
            ``LeannBuilder`` and ``LeannSearcher``; also used in the name
            of the on-disk index directory.
        test_name: Human-readable label, used only in the banner line.

    Returns:
        True if building and searching finished without raising,
        False if either step raised (the error is printed).
    """
    # NOTE(review): because of the ``test_`` prefix, pytest would treat the
    # two parameters as fixtures if it collected this function; it is
    # presumably meant to be driven from ``main()`` only.
    print(f"\n=== 测试 {test_name} ({distance_func}) ===")

    index_dir = Path(f"./test_indices_{distance_func}")
    index_path = str(index_dir / "documents.diskann")
    # Remove any directory left behind by a previous run.
    if index_dir.exists():
        shutil.rmtree(index_dir)

    # Build the index.
    print(f"构建索引 (距离函数: {distance_func})...")
    builder = LeannBuilder(
        backend_name="diskann",
        distance_metric=distance_func,
        graph_degree=16,
        complexity=32,
    )
    for doc in load_sample_documents():
        builder.add_text(doc["content"], metadata=doc)

    try:
        builder.build_index(index_path)
        print("✅ 索引构建成功")

        # Run a sample search against the freshly built index.
        searcher = LeannSearcher(index_path, distance_metric=distance_func)
        results = searcher.search("machine learning programming", top_k=2)
        print("搜索结果:")
        for rank, result in enumerate(results, start=1):
            print(f" {rank}. Score: {result['score']:.4f}")
            print(f" Text: {result['text'][:50]}...")
        return True
    except Exception as e:
        # Reporting boundary of the script: print the failure and let the
        # caller continue with the next metric.
        print(f"❌ 测试失败: {e}")
        return False
def main():
    """Run the distance-function check for each metric and print a summary.

    Every metric is tried even if an earlier one fails; a metric whose
    check raises is recorded as failed.
    """
    print("🔍 DiskANN 距离函数测试")
    print("=" * 50)

    # (metric name, human-readable label) pairs to exercise.
    distance_tests = [
        ("mips", "Maximum Inner Product Search"),
        ("l2", "L2 Euclidean Distance"),
        ("cosine", "Cosine Similarity"),
    ]

    results = {}
    for distance_func, test_name in distance_tests:
        try:
            results[distance_func] = test_distance_function(distance_func, test_name)
        except Exception as e:
            print(f"{distance_func} 测试异常: {e}")
            results[distance_func] = False

    # Summary.
    print("\n" + "=" * 50)
    print("📊 测试结果总结:")
    for distance_func, success in results.items():
        status = "✅ 通过" if success else "❌ 失败"
        print(f" {distance_func:10s}: {status}")

    print("\n🎉 测试完成!")


if __name__ == "__main__":
    main()

View File

@@ -1,127 +0,0 @@
#!/usr/bin/env python3
"""
验证DiskANN L2距离是否真正工作
"""
import numpy as np
from pathlib import Path
import shutil
# 导入后端包以触发插件注册
try:
import leann_backend_diskann
print("INFO: Backend packages imported successfully.")
except ImportError as e:
print(f"WARNING: Could not import backend packages. Error: {e}")
from leann.api import LeannBuilder, LeannSearcher
def _build_diskann_index(index_dir, distance_metric, documents):
    """Rebuild a DiskANN index for ``documents`` under ``index_dir``; return its path."""
    index_path = str(index_dir / "documents.diskann")
    # Remove any directory left behind by a previous run.
    if index_dir.exists():
        shutil.rmtree(index_dir)

    builder = LeannBuilder(
        backend_name="diskann",
        distance_metric=distance_metric,
        graph_degree=16,
        complexity=32,
    )
    for i, doc in enumerate(documents):
        builder.add_text(doc, metadata={"id": i, "text": doc})
    builder.build_index(index_path)
    return index_path


def _print_hits(results):
    """Print the id, score and text of each search hit, numbered from 1."""
    for rank, result in enumerate(results, start=1):
        print(f" {rank}. ID:{result['id']}, Score:{result['score']:.6f}")
        print(f" Text: {result['text']}")


def test_l2_verification():
    """Index the same corpus under L2 and cosine and compare the search output.

    Builds two DiskANN indices (one per metric), runs one query against
    each, prints both result lists, then prints whether the top hit agrees
    and whether the scores fall in the ranges the original author expected.
    Nothing is returned; the outcome is reported on stdout only.

    Raises:
        IndexError: if either search returns no results (the top hit is
            read with ``[0]``).
    """
    print("=== 验证DiskANN L2距离实现 ===")

    # Documents chosen by the original author so that L2 and cosine can
    # rank them differently.
    documents = [
        "machine learning artificial intelligence",  # document 0
        "computer programming software development",  # document 1
        "data science analytics statistics",  # document 2
    ]
    # A query that is very close to document 0.
    query = "machine learning AI technology"

    print("构建索引...")
    index_path = _build_diskann_index(Path("./test_l2_verification"), "l2", documents)
    print("✅ 索引构建完成")

    searcher = LeannSearcher(index_path, distance_metric="l2")
    results = searcher.search(query, top_k=3)
    print(f"\n查询: '{query}'")
    print("L2距离搜索结果:")
    _print_hits(results)

    # Repeat with cosine on exactly the same data.
    print("\n--- 用Cosine距离对比测试 ---")
    index_path_cos = _build_diskann_index(
        Path("./test_cosine_verification"), "cosine", documents
    )
    searcher_cos = LeannSearcher(index_path_cos, distance_metric="cosine")
    results_cos = searcher_cos.search(query, top_k=3)
    print("Cosine距离搜索结果:")
    _print_hits(results_cos)

    # Side-by-side comparison.
    print("\n--- 结果对比分析 ---")
    print("L2距离的分数是欧几里得距离平方越小越相似")
    print("Cosine距离的分数是余弦相似度的负值越小越相似")

    l2_top = results[0]
    cos_top = results_cos[0]
    print(f"L2最佳匹配: ID{l2_top['id']}, Score={l2_top['score']:.6f}")
    print(f"Cosine最佳匹配: ID{cos_top['id']}, Score={cos_top['score']:.6f}")

    if l2_top['id'] == cos_top['id']:
        print("✅ 两种距离函数返回相同的最佳匹配")
    else:
        print("⚠️ 两种距离函数返回不同的最佳匹配 - 这表明它们确实使用了不同的距离计算")

    # Sanity-check the score ranges.
    l2_scores = [r['score'] for r in results]
    cos_scores = [r['score'] for r in results_cos]
    # A separator between min and max keeps the two numbers from running
    # together in the output.
    print(f"L2分数范围: {min(l2_scores):.6f} ~ {max(l2_scores):.6f}")
    print(f"Cosine分数范围: {min(cos_scores):.6f} ~ {max(cos_scores):.6f}")

    # Original author's expectation: L2 scores are non-negative, cosine
    # scores lie in [-1, 0] because they are negated similarities.
    if all(score >= 0 for score in l2_scores):
        print("✅ L2分数都是正数符合预期")
    else:
        print("❌ L2分数有负数可能有问题")

    if all(-1 <= score <= 0 for score in cos_scores):
        print("✅ Cosine分数在合理范围内")
    else:
        print(f"⚠️ Cosine分数超出预期范围: {cos_scores}")


if __name__ == "__main__":
    test_l2_verification()

View File

@@ -1,190 +0,0 @@
#!/usr/bin/env python3
"""
Sanity check script for Leann DiskANN backend
Tests different distance functions and embedding models
"""
import os
import numpy as np
from pathlib import Path
import shutil
import time
# 导入后端包以触发插件注册
import sys
sys.path.append('packages/leann-core/src')
sys.path.append('packages/leann-backend-diskann')
sys.path.append('packages/leann-backend-hnsw')
try:
import leann_backend_diskann
import leann_backend_hnsw
print("INFO: Backend packages imported successfully.")
except ImportError as e:
print(f"WARNING: Could not import backend packages. Error: {e}")
# 从 leann-core 导入上层 API
from leann.api import LeannBuilder, LeannSearcher
def test_distance_functions():
    """Build and query a small DiskANN index once per distance metric.

    Tries ``"mips"``, ``"l2"`` and ``"cosine"`` in turn. A failure for one
    metric is printed and does not stop the remaining metrics. Nothing is
    returned.
    """
    print("\n=== 测试不同距离函数 ===")

    # Test data.
    documents = [
        "Machine learning is a powerful technology",
        "Deep learning uses neural networks",
        "Artificial intelligence transforms industries",
    ]
    distance_functions = ["mips", "l2", "cosine"]

    for distance_func in distance_functions:
        print(f"\n[测试 {distance_func} 距离函数]")
        try:
            index_path = f"test_indices/test_{distance_func}.diskann"
            # The parent is the shared ``test_indices`` directory, so this
            # also removes indices written by earlier iterations.
            index_dir = Path(index_path).parent
            if index_dir.exists():
                shutil.rmtree(index_dir)

            # Build the index.
            builder = LeannBuilder(
                backend_name="diskann",
                distance_metric=distance_func,
                graph_degree=16,
                complexity=32,
            )
            for doc in documents:
                builder.add_text(doc)
            builder.build_index(index_path)

            # Run a sample search.
            searcher = LeannSearcher(index_path, distance_metric=distance_func)
            results = searcher.search("neural network technology", top_k=2)

            print(f"{distance_func} 距离函数工作正常")
            for rank, result in enumerate(results, start=1):
                print(f" {rank}. Score: {result['score']:.4f}, Text: {result['text'][:50]}...")
        except Exception as e:
            # Report and move on to the next metric.
            print(f"{distance_func} 距离函数失败: {e}")
def test_embedding_models():
    """Build and query a DiskANN index once per embedding model name.

    Every model writes to the same index path, which is wiped before each
    build. A failure for one model is printed and does not stop the
    remaining models. Nothing is returned.
    """
    print("\n=== 测试不同Embedding模型 ===")

    documents = ["AI is transforming the world", "Technology advances rapidly"]

    # Embedding models to try.
    models_to_test = [
        "sentence-transformers/all-mpnet-base-v2",
        "sentence-transformers/all-MiniLM-L6-v2",
        # "sentence-transformers/distilbert-base-nli-mean-tokens",  # may not exist
    ]

    for model_name in models_to_test:
        print(f"\n[测试 {model_name}]")
        try:
            index_path = "test_indices/test_model.diskann"
            # Remove the whole ``test_indices`` directory before rebuilding.
            index_dir = Path(index_path).parent
            if index_dir.exists():
                shutil.rmtree(index_dir)

            # Build the index.
            builder = LeannBuilder(
                backend_name="diskann",
                embedding_model=model_name,
                distance_metric="cosine",
            )
            for doc in documents:
                builder.add_text(doc)
            builder.build_index(index_path)

            # Run a sample search.
            searcher = LeannSearcher(index_path)
            results = searcher.search("artificial intelligence", top_k=1)

            print(f"{model_name} 模型工作正常")
            # An empty result list raises IndexError here, which the
            # handler below reports as a failure for this model.
            print(f" 结果: {results[0]['text'][:50]}...")
        except Exception as e:
            # Report and move on to the next model.
            print(f"{model_name} 模型失败: {e}")
def test_search_correctness():
    """Check that topical queries surface at least one topical document.

    Indexes five documents on two topics (ML/programming and pets), runs
    one query per topic, prints the hits, and prints whether any of the
    top two hits has an id in that query's expected set. Any exception is
    printed rather than raised. Nothing is returned.
    """
    print("\n=== 验证搜索结果正确性 ===")

    # Documents with clear-cut topical relevance.
    documents = [
        "Python is a programming language used for machine learning",  # programming
        "Dogs are loyal pets that love to play fetch",  # animals
        "Machine learning algorithms can predict future trends",  # ML
        "Cats are independent animals that sleep a lot",  # animals
        "Deep learning neural networks process complex data",  # ML
    ]

    try:
        index_path = "test_indices/correctness_test.diskann"
        # Remove the whole ``test_indices`` directory before rebuilding.
        index_dir = Path(index_path).parent
        if index_dir.exists():
            shutil.rmtree(index_dir)

        # Build the index.
        builder = LeannBuilder(
            backend_name="diskann",
            distance_metric="cosine",
        )
        for doc in documents:
            builder.add_text(doc)
        builder.build_index(index_path)

        # (query, ids of the documents expected to be relevant)
        searcher = LeannSearcher(index_path)
        test_queries = [
            ("machine learning programming", [0, 2, 4]),  # should return ML documents
            ("pet animals behavior", [1, 3]),  # should return animal documents
        ]

        for query, expected_topics in test_queries:
            print(f"\n查询: '{query}'")
            results = searcher.search(query, top_k=3)

            print("搜索结果:")
            for rank, result in enumerate(results, start=1):
                print(f" {rank}. ID:{result['id']}, Score:{result['score']:.4f}")
                print(f" Text: {result['text'][:60]}...")

            # Loose check: at least one of the top two hits must be an
            # expected document.
            # NOTE(review): assumes result['id'] is the 0-based insertion
            # index as an int (no explicit id is passed to add_text) — confirm.
            top_ids = [result['id'] for result in results[:2]]
            relevant_found = any(doc_id in expected_topics for doc_id in top_ids)
            if relevant_found:
                print("✅ 搜索结果相关性正确")
            else:
                print("⚠️ 搜索结果相关性可能有问题")
    except Exception as e:
        print(f"❌ 正确性测试失败: {e}")
def main():
    """Run every sanity check in sequence, starting from a clean index directory."""
    print("🔍 Leann DiskANN Sanity Check")
    print("=" * 50)

    # Remove test data left over from a previous run.
    if Path("test_indices").exists():
        shutil.rmtree("test_indices")

    # Run the checks.
    test_distance_functions()
    test_embedding_models()
    test_search_correctness()

    print("\n" + "=" * 50)
    print("🎉 Sanity check 完成!")


if __name__ == "__main__":
    main()