#!/usr/bin/env python3 """ This script runs a recall evaluation on a given LEANN index. It correctly compares results by fetching the text content for both the new search results and the golden standard results, making the comparison robust to ID changes. """ import json import argparse import time from pathlib import Path import sys import numpy as np from typing import List from leann.api import LeannSearcher def download_data_if_needed(data_root: Path): """Checks if the data directory exists, and if not, downloads it from HF Hub.""" if not data_root.exists(): print(f"Data directory '{data_root}' not found.") print( "Downloading evaluation data from Hugging Face Hub... (this may take a moment)" ) try: from huggingface_hub import snapshot_download snapshot_download( repo_id="LEANN-RAG/leann-rag-evaluation-data", repo_type="dataset", local_dir=data_root, local_dir_use_symlinks=False, # Recommended for Windows compatibility and simpler structure ) print("Data download complete!") except ImportError: print( "Error: huggingface_hub is not installed. Please install it to download the data:" ) print("uv pip install -e '.[dev]'") sys.exit(1) except Exception as e: print(f"An error occurred during data download: {e}") sys.exit(1) # --- Helper Function to get Golden Passages --- def get_golden_texts(searcher: LeannSearcher, golden_ids: List[int]) -> set: """ Retrieves the text for golden passage IDs directly from the LeannSearcher's passage manager. """ golden_texts = set() for gid in golden_ids: try: # PassageManager uses string IDs passage_data = searcher.passage_manager.get_passage(str(gid)) golden_texts.add(passage_data["text"]) except KeyError: print( f"Warning: Golden passage ID '{gid}' not found in the index's passage data." ) return golden_texts def load_queries(file_path: Path) -> List[str]: queries = [] with open(file_path, "r", encoding="utf-8") as f: for line in f: data = json.loads(line) queries.append(data["query"]) return queries def main(): parser = argparse.ArgumentParser( description="Run recall evaluation on a LEANN index." ) parser.add_argument( "index_path", type=str, help="Path to the LEANN index to evaluate." ) parser.add_argument( "--num-queries", type=int, default=10, help="Number of queries to evaluate." ) parser.add_argument( "--top-k", type=int, default=3, help="The 'k' value for recall@k." ) parser.add_argument( "--ef-search", type=int, default=120, help="The 'efSearch' parameter for HNSW." ) args = parser.parse_args() # --- Path Configuration --- # Assumes a project structure where the script is in 'examples/' # and data is in 'data/' at the project root. project_root = Path(__file__).resolve().parent.parent data_root = project_root / "data" # Automatically download data if it doesn't exist download_data_if_needed(data_root) # Detect dataset type from index path to select the correct ground truth index_path_str = str(args.index_path) if "rpj_wiki" in index_path_str: dataset_type = "rpj_wiki" elif "dpr" in index_path_str: dataset_type = "dpr" else: # Fallback: try to infer from the index directory name dataset_type = Path(args.index_path).name print( f"WARNING: Could not detect dataset type from path, inferred '{dataset_type}'." 
) queries_file = data_root / "queries" / "nq_open.jsonl" golden_results_file = ( data_root / "ground_truth" / dataset_type / "flat_results_nq_k3.json" ) print(f"INFO: Detected dataset type: {dataset_type}") print(f"INFO: Using queries file: {queries_file}") print(f"INFO: Using ground truth file: {golden_results_file}") try: searcher = LeannSearcher(args.index_path) queries = load_queries(queries_file) with open(golden_results_file, "r") as f: golden_results_data = json.load(f) num_eval_queries = min(args.num_queries, len(queries)) queries = queries[:num_eval_queries] print(f"\nRunning evaluation on {num_eval_queries} queries...") recall_scores = [] search_times = [] for i in range(num_eval_queries): start_time = time.time() new_results = searcher.search( queries[i], top_k=args.top_k, ef=args.ef_search ) search_times.append(time.time() - start_time) # Correct Recall Calculation: Based on TEXT content new_texts = {result.text for result in new_results} # Get golden texts directly from the searcher's passage manager golden_ids = golden_results_data["indices"][i][: args.top_k] golden_texts = get_golden_texts(searcher, golden_ids) overlap = len(new_texts & golden_texts) recall = overlap / len(golden_texts) if golden_texts else 0 recall_scores.append(recall) print("\n--- EVALUATION RESULTS ---") print(f"Query: {queries[i]}") print(f"New Results: {new_texts}") print(f"Golden Results: {golden_texts}") print(f"Overlap: {overlap}") print(f"Recall: {recall}") print(f"Search Time: {search_times[-1]:.4f}s") print("--------------------------------") avg_recall = np.mean(recall_scores) if recall_scores else 0 avg_time = np.mean(search_times) if search_times else 0 print("\nšŸŽ‰ --- Evaluation Complete ---") print(f"Avg. Recall@{args.top_k} (efSearch={args.ef_search}): {avg_recall:.4f}") print(f"Avg. Search Time: {avg_time:.4f}s") except Exception as e: print(f"\nāŒ An error occurred during evaluation: {e}") import traceback traceback.print_exc() if __name__ == "__main__": main()
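
# Example invocation (a sketch only: the script file name and the index path below
# are illustrative assumptions, not defined by this file; pass whatever LEANN index
# you built). The flags correspond to the argparse options declared in main(), and
# all of them are optional:
#
#   python examples/run_evaluation.py /path/to/your/dpr_index \
#       --num-queries 20 --top-k 3 --ef-search 120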