diff --git a/benchmarks/run_evaluation.py b/benchmarks/run_evaluation.py
index a018d98..ab4e169 100644
--- a/benchmarks/run_evaluation.py
+++ b/benchmarks/run_evaluation.py
@@ -197,6 +197,12 @@ def main():
     parser.add_argument(
         "--ef-search", type=int, default=120, help="The 'efSearch' parameter for HNSW."
     )
+    parser.add_argument(
+        "--batch-size",
+        type=int,
+        default=0,
+        help="Batch size for HNSW batched search (0 disables batching)",
+    )
     parser.add_argument(
         "--llm-type",
         type=str,
@@ -331,13 +337,23 @@ def main():
     for i in range(num_eval_queries):
         start_time = time.time()
-        new_results = searcher.search(queries[i], top_k=args.top_k, complexity=args.ef_search)
+        new_results = searcher.search(
+            queries[i],
+            top_k=args.top_k,
+            complexity=args.ef_search,
+            batch_size=args.batch_size,
+        )
         search_times.append(time.time() - start_time)
 
         # Optional: also call the LLM with configurable backend/model (does not affect recall)
         llm_config = {"type": args.llm_type, "model": args.llm_model}
         chat = LeannChat(args.index_path, llm_config=llm_config, searcher=searcher)
-        answer = chat.ask(queries[i], top_k=args.top_k, complexity=args.ef_search)
+        answer = chat.ask(
+            queries[i],
+            top_k=args.top_k,
+            complexity=args.ef_search,
+            batch_size=args.batch_size,
+        )
         print(f"Answer: {answer}")
 
         # Correct Recall Calculation: Based on TEXT content
         new_texts = {result.text for result in new_results}