diff --git a/README.md b/README.md index b6e6d2b..003b469 100755 --- a/README.md +++ b/README.md @@ -373,7 +373,7 @@ The script will print the recall and search time for each query, followed by the | System | DPR(2.1M docs) | RPJ-wiki(60M docs) | Chat history(5K messages) | | --------------------- | ---------------- | ---------------- | ---------------- | -| Traditional Vector DB | 3.8 GB | 201 GB | **22.8 MB** | +| Traditional Vector DB | 3.8 GB | 201 GB | 22.8 MB | | **LEANN** | **324 MB** | **6 GB** | **0.78 MB** | | **Reduction** | **91% smaller** | **97% smaller** | **97% smaller** | diff --git a/tests/sanity_checks/README.md b/test/sanity_checks/README.md similarity index 100% rename from tests/sanity_checks/README.md rename to test/sanity_checks/README.md diff --git a/tests/sanity_checks/benchmark_embeddings.py b/test/sanity_checks/benchmark_embeddings.py similarity index 100% rename from tests/sanity_checks/benchmark_embeddings.py rename to test/sanity_checks/benchmark_embeddings.py diff --git a/tests/sanity_checks/debug_zmq_issue.py b/test/sanity_checks/debug_zmq_issue.py similarity index 100% rename from tests/sanity_checks/debug_zmq_issue.py rename to test/sanity_checks/debug_zmq_issue.py diff --git a/tests/.DS_Store b/tests/.DS_Store deleted file mode 100644 index 18e12dd..0000000 Binary files a/tests/.DS_Store and /dev/null differ diff --git a/tests/sanity_checks/test_distance_functions.py b/tests/sanity_checks/test_distance_functions.py deleted file mode 100644 index 004619e..0000000 --- a/tests/sanity_checks/test_distance_functions.py +++ /dev/null @@ -1,107 +0,0 @@ -#!/usr/bin/env python3 -""" -DiskANN 距离函数测试 -""" - -import os -from pathlib import Path -import shutil -import time - -# 导入后端包以触发插件注册 -try: - import leann_backend_diskann - import leann_backend_hnsw - print("INFO: Backend packages imported successfully.") -except ImportError as e: - print(f"WARNING: Could not import backend packages. Error: {e}") - -# 从 leann-core 导入上层 API -from leann.api import LeannBuilder, LeannSearcher - - -def load_sample_documents(): - """创建用于演示的样本文档""" - docs = [ - {"title": "Intro to Python", "content": "Python is a programming language for machine learning"}, - {"title": "ML Basics", "content": "Machine learning algorithms build intelligent systems"}, - {"title": "Data Structures", "content": "Data structures like arrays and graphs organize information"}, - ] - return docs - - -def test_distance_function(distance_func, test_name): - """测试特定距离函数""" - print(f"\n=== 测试 {test_name} ({distance_func}) ===") - - INDEX_DIR = Path(f"./test_indices_{distance_func}") - INDEX_PATH = str(INDEX_DIR / "documents.diskann") - - if INDEX_DIR.exists(): - shutil.rmtree(INDEX_DIR) - - # 构建索引 - print(f"构建索引 (距离函数: {distance_func})...") - builder = LeannBuilder( - backend_name="diskann", - distance_metric=distance_func, - graph_degree=16, - complexity=32 - ) - - documents = load_sample_documents() - for doc in documents: - builder.add_text(doc["content"], metadata=doc) - - try: - builder.build_index(INDEX_PATH) - print(f"✅ 索引构建成功") - - # 测试搜索 - searcher = LeannSearcher(INDEX_PATH, distance_metric=distance_func) - results = searcher.search("machine learning programming", top_k=2) - - print(f"搜索结果:") - for i, result in enumerate(results): - print(f" {i+1}. Score: {result['score']:.4f}") - print(f" Text: {result['text'][:50]}...") - - return True - - except Exception as e: - print(f"❌ 测试失败: {e}") - return False - - -def main(): - print("🔍 DiskANN 距离函数测试") - print("=" * 50) - - # 测试不同距离函数 - distance_tests = [ - ("mips", "Maximum Inner Product Search"), - ("l2", "L2 Euclidean Distance"), - ("cosine", "Cosine Similarity") - ] - - results = {} - for distance_func, test_name in distance_tests: - try: - success = test_distance_function(distance_func, test_name) - results[distance_func] = success - except Exception as e: - print(f"❌ {distance_func} 测试异常: {e}") - results[distance_func] = False - - # 总结 - print("\n" + "=" * 50) - print("📊 测试结果总结:") - for distance_func, success in results.items(): - status = "✅ 通过" if success else "❌ 失败" - print(f" {distance_func:10s}: {status}") - - print("\n🎉 测试完成!") - - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/tests/sanity_checks/test_l2_verification.py b/tests/sanity_checks/test_l2_verification.py deleted file mode 100644 index 96c1b6c..0000000 --- a/tests/sanity_checks/test_l2_verification.py +++ /dev/null @@ -1,127 +0,0 @@ -#!/usr/bin/env python3 -""" -验证DiskANN L2距离是否真正工作 -""" - -import numpy as np -from pathlib import Path -import shutil - -# 导入后端包以触发插件注册 -try: - import leann_backend_diskann - print("INFO: Backend packages imported successfully.") -except ImportError as e: - print(f"WARNING: Could not import backend packages. Error: {e}") - -from leann.api import LeannBuilder, LeannSearcher - -def test_l2_verification(): - """验证L2距离是否真正被使用""" - print("=== 验证DiskANN L2距离实现 ===") - - INDEX_DIR = Path("./test_l2_verification") - INDEX_PATH = str(INDEX_DIR / "documents.diskann") - - if INDEX_DIR.exists(): - shutil.rmtree(INDEX_DIR) - - # 创建特殊的测试文档,使L2和cosine产生不同结果 - documents = [ - "machine learning artificial intelligence", # 文档0 - "computer programming software development", # 文档1 - "data science analytics statistics" # 文档2 - ] - - print("构建索引...") - builder = LeannBuilder( - backend_name="diskann", - distance_metric="l2", # 明确指定L2 - graph_degree=16, - complexity=32 - ) - - for i, doc in enumerate(documents): - builder.add_text(doc, metadata={"id": i, "text": doc}) - - builder.build_index(INDEX_PATH) - print("✅ 索引构建完成") - - # 测试搜索 - searcher = LeannSearcher(INDEX_PATH, distance_metric="l2") - - # 用一个与文档0非常相似的查询 - query = "machine learning AI technology" - results = searcher.search(query, top_k=3) - - print(f"\n查询: '{query}'") - print("L2距离搜索结果:") - for i, result in enumerate(results): - print(f" {i+1}. ID:{result['id']}, Score:{result['score']:.6f}") - print(f" Text: {result['text']}") - - # 现在用cosine重新测试同样的数据 - print(f"\n--- 用Cosine距离对比测试 ---") - - INDEX_DIR_COS = Path("./test_cosine_verification") - INDEX_PATH_COS = str(INDEX_DIR_COS / "documents.diskann") - - if INDEX_DIR_COS.exists(): - shutil.rmtree(INDEX_DIR_COS) - - builder_cos = LeannBuilder( - backend_name="diskann", - distance_metric="cosine", # 使用cosine - graph_degree=16, - complexity=32 - ) - - for i, doc in enumerate(documents): - builder_cos.add_text(doc, metadata={"id": i, "text": doc}) - - builder_cos.build_index(INDEX_PATH_COS) - - searcher_cos = LeannSearcher(INDEX_PATH_COS, distance_metric="cosine") - results_cos = searcher_cos.search(query, top_k=3) - - print("Cosine距离搜索结果:") - for i, result in enumerate(results_cos): - print(f" {i+1}. ID:{result['id']}, Score:{result['score']:.6f}") - print(f" Text: {result['text']}") - - # 对比分析 - print(f"\n--- 结果对比分析 ---") - print("L2距离的分数是欧几里得距离平方,越小越相似") - print("Cosine距离的分数是余弦相似度的负值,越小越相似") - - l2_top = results[0] - cos_top = results_cos[0] - - print(f"L2最佳匹配: ID{l2_top['id']}, Score={l2_top['score']:.6f}") - print(f"Cosine最佳匹配: ID{cos_top['id']}, Score={cos_top['score']:.6f}") - - if l2_top['id'] == cos_top['id']: - print("✅ 两种距离函数返回相同的最佳匹配") - else: - print("⚠️ 两种距离函数返回不同的最佳匹配 - 这表明它们确实使用了不同的距离计算") - - # 验证分数范围的合理性 - l2_scores = [r['score'] for r in results] - cos_scores = [r['score'] for r in results_cos] - - print(f"L2分数范围: {min(l2_scores):.6f} 到 {max(l2_scores):.6f}") - print(f"Cosine分数范围: {min(cos_scores):.6f} 到 {max(cos_scores):.6f}") - - # L2分数应该是正数,cosine分数应该在-1到0之间(因为是负的相似度) - if all(score >= 0 for score in l2_scores): - print("✅ L2分数都是正数,符合预期") - else: - print("❌ L2分数有负数,可能有问题") - - if all(-1 <= score <= 0 for score in cos_scores): - print("✅ Cosine分数在合理范围内") - else: - print(f"⚠️ Cosine分数超出预期范围: {cos_scores}") - -if __name__ == "__main__": - test_l2_verification() \ No newline at end of file diff --git a/tests/sanity_checks/test_sanity_check.py b/tests/sanity_checks/test_sanity_check.py deleted file mode 100644 index 1c95b8c..0000000 --- a/tests/sanity_checks/test_sanity_check.py +++ /dev/null @@ -1,190 +0,0 @@ -#!/usr/bin/env python3 -""" -Sanity check script for Leann DiskANN backend -Tests different distance functions and embedding models -""" - -import os -import numpy as np -from pathlib import Path -import shutil -import time - -# 导入后端包以触发插件注册 -import sys -sys.path.append('packages/leann-core/src') -sys.path.append('packages/leann-backend-diskann') -sys.path.append('packages/leann-backend-hnsw') - -try: - import leann_backend_diskann - import leann_backend_hnsw - print("INFO: Backend packages imported successfully.") -except ImportError as e: - print(f"WARNING: Could not import backend packages. Error: {e}") - -# 从 leann-core 导入上层 API -from leann.api import LeannBuilder, LeannSearcher - -def test_distance_functions(): - """测试不同的距离函数""" - print("\n=== 测试不同距离函数 ===") - - # 测试数据 - documents = [ - "Machine learning is a powerful technology", - "Deep learning uses neural networks", - "Artificial intelligence transforms industries" - ] - - distance_functions = ["mips", "l2", "cosine"] - - for distance_func in distance_functions: - print(f"\n[测试 {distance_func} 距离函数]") - try: - index_path = f"test_indices/test_{distance_func}.diskann" - if Path(index_path).parent.exists(): - shutil.rmtree(Path(index_path).parent) - - # 构建索引 - builder = LeannBuilder( - backend_name="diskann", - distance_metric=distance_func, - graph_degree=16, - complexity=32 - ) - - for doc in documents: - builder.add_text(doc) - - builder.build_index(index_path) - - # 测试搜索 - searcher = LeannSearcher(index_path, distance_metric=distance_func) - results = searcher.search("neural network technology", top_k=2) - - print(f"✅ {distance_func} 距离函数工作正常") - for i, result in enumerate(results): - print(f" {i+1}. Score: {result['score']:.4f}, Text: {result['text'][:50]}...") - - except Exception as e: - print(f"❌ {distance_func} 距离函数失败: {e}") - -def test_embedding_models(): - """测试不同的embedding模型""" - print("\n=== 测试不同Embedding模型 ===") - - documents = ["AI is transforming the world", "Technology advances rapidly"] - - # 测试不同的embedding模型 - models_to_test = [ - "sentence-transformers/all-mpnet-base-v2", - "sentence-transformers/all-MiniLM-L6-v2", - # "sentence-transformers/distilbert-base-nli-mean-tokens", # 可能不存在 - ] - - for model_name in models_to_test: - print(f"\n[测试 {model_name}]") - try: - index_path = f"test_indices/test_model.diskann" - if Path(index_path).parent.exists(): - shutil.rmtree(Path(index_path).parent) - - # 构建索引 - builder = LeannBuilder( - backend_name="diskann", - embedding_model=model_name, - distance_metric="cosine" - ) - - for doc in documents: - builder.add_text(doc) - - builder.build_index(index_path) - - # 测试搜索 - searcher = LeannSearcher(index_path) - results = searcher.search("artificial intelligence", top_k=1) - - print(f"✅ {model_name} 模型工作正常") - print(f" 结果: {results[0]['text'][:50]}...") - - except Exception as e: - print(f"❌ {model_name} 模型失败: {e}") - -def test_search_correctness(): - """验证搜索结果的正确性""" - print("\n=== 验证搜索结果正确性 ===") - - # 创建有明确相关性的测试文档 - documents = [ - "Python is a programming language used for machine learning", # 与编程相关 - "Dogs are loyal pets that love to play fetch", # 与动物相关 - "Machine learning algorithms can predict future trends", # 与ML相关 - "Cats are independent animals that sleep a lot", # 与动物相关 - "Deep learning neural networks process complex data" # 与ML相关 - ] - - try: - index_path = "test_indices/correctness_test.diskann" - if Path(index_path).parent.exists(): - shutil.rmtree(Path(index_path).parent) - - # 构建索引 - builder = LeannBuilder( - backend_name="diskann", - distance_metric="cosine" - ) - - for doc in documents: - builder.add_text(doc) - - builder.build_index(index_path) - - # 测试相关性查询 - searcher = LeannSearcher(index_path) - - test_queries = [ - ("machine learning programming", [0, 2, 4]), # 应该返回ML相关文档 - ("pet animals behavior", [1, 3]), # 应该返回动物相关文档 - ] - - for query, expected_topics in test_queries: - print(f"\n查询: '{query}'") - results = searcher.search(query, top_k=3) - - print("搜索结果:") - for i, result in enumerate(results): - print(f" {i+1}. ID:{result['id']}, Score:{result['score']:.4f}") - print(f" Text: {result['text'][:60]}...") - - # 简单验证:检查前两个结果是否在预期范围内 - top_ids = [result['id'] for result in results[:2]] - relevant_found = any(id in expected_topics for id in top_ids) - - if relevant_found: - print("✅ 搜索结果相关性正确") - else: - print("⚠️ 搜索结果相关性可能有问题") - - except Exception as e: - print(f"❌ 正确性测试失败: {e}") - -def main(): - print("🔍 Leann DiskANN Sanity Check") - print("=" * 50) - - # 清理旧的测试数据 - if Path("test_indices").exists(): - shutil.rmtree("test_indices") - - # 运行测试 - test_distance_functions() - test_embedding_models() - test_search_correctness() - - print("\n" + "=" * 50) - print("🎉 Sanity check 完成!") - -if __name__ == "__main__": - main() \ No newline at end of file