Update experiments for VLDB

This commit is contained in:
yichuan-w
2025-08-22 14:29:36 -07:00
parent 31b4973141
commit 14f096dfe3
12 changed files with 3400 additions and 42 deletions

View File

@@ -203,6 +203,15 @@ def main():
default=0,
help="Batch size for HNSW batched search (0 disables batching)",
)
parser.add_argument(
"--queries-file",
type=str,
default="nq_open.jsonl",
help=(
"Queries file to use. Provide a filename under benchmarks/data/queries "
"or an absolute path to a .jsonl file (default: nq_open.jsonl)."
),
)
parser.add_argument(
"--llm-type",
type=str,
@@ -314,8 +323,52 @@ def main():
dataset_type = Path(args.index_path).name
print(f"WARNING: Could not detect dataset type from path, inferred '{dataset_type}'.")
queries_file = data_root / "queries" / "nq_open.jsonl"
golden_results_file = data_root / "ground_truth" / dataset_type / "flat_results_nq_k3.json"
# Resolve queries file (supports absolute path or name under data/queries)
queries_file_candidate = Path(args.queries_file)
if queries_file_candidate.is_absolute():
queries_file = queries_file_candidate
else:
queries_file = data_root / "queries" / args.queries_file
if not queries_file.exists():
print(f"Error: Queries file not found: {queries_file}")
print("Tip: Use --queries-file with a filename under benchmarks/data/queries or an absolute path.")
sys.exit(1)
# Infer ground-truth file from the queries filename
qname = queries_file.name.lower()
if "hotpot" in qname:
task_key = "hotpot"
elif "trivia" in qname:
task_key = "trivia"
elif "gpqa" in qname:
task_key = "gpqa"
elif "nq" in qname:
task_key = "nq"
else:
print(
"Error: Could not infer task from queries filename. Supported names include 'nq', 'hotpot', 'trivia', 'gpqa'."
)
print(f"Filename was: {queries_file.name}")
sys.exit(1)
golden_results_file = data_root / "ground_truth" / dataset_type / f"flat_results_{task_key}_k3.json"
if not golden_results_file.exists():
gt_dir = data_root / "ground_truth" / dataset_type
try:
available = sorted(p.name for p in gt_dir.glob("flat_results_*_k3.json"))
except Exception:
available = []
print(
f"Error: Ground truth file not found for task '{task_key}' under dataset '{dataset_type}': {golden_results_file}"
)
if available:
print("Available ground truth files:")
for name in available:
print(f" - {name}")
else:
print(f"No ground truth files found in {gt_dir}")
sys.exit(1)
print(f"INFO: Detected dataset type: {dataset_type}")
print(f"INFO: Using queries file: {queries_file}")
@@ -346,15 +399,15 @@ def main():
search_times.append(time.time() - start_time)
# Optional: also call the LLM with configurable backend/model (does not affect recall)
llm_config = {"type": args.llm_type, "model": args.llm_model}
chat = LeannChat(args.index_path, llm_config=llm_config, searcher=searcher)
answer = chat.ask(
queries[i],
top_k=args.top_k,
complexity=args.ef_search,
batch_size=args.batch_size,
)
print(f"Answer: {answer}")
# llm_config = {"type": args.llm_type, "model": args.llm_model}
# chat = LeannChat(args.index_path, llm_config=llm_config, searcher=searcher)
# answer = chat.ask(
# queries[i],
# top_k=args.top_k,
# complexity=args.ef_search,
# batch_size=args.batch_size,
# )
# print(f"Answer: {answer}")
# Correct Recall Calculation: Based on TEXT content
new_texts = {result.text for result in new_results}
@@ -378,10 +431,16 @@ def main():
avg_recall = np.mean(recall_scores) if recall_scores else 0
avg_time = np.mean(search_times) if search_times else 0
print(f"search time: {search_times}")
print("\n🎉 --- Evaluation Complete ---")
print(f"Avg. Recall@{args.top_k} (efSearch={args.ef_search}): {avg_recall:.4f}")
print(f"Avg. Search Time: {avg_time:.4f}s")
# avg last 10 search times
avg_last_10_search_times = np.mean(search_times[-10:])
print(f"Avg. Last 10 Search Times: {avg_last_10_search_times:.4f}s")
except Exception as e:
print(f"\n❌ An error occurred during evaluation: {e}")
import traceback