From a0d6857faabcc4749cc2172c7d4b3d9ebd1e6e66 Mon Sep 17 00:00:00 2001 From: Andy Lee Date: Mon, 15 Sep 2025 19:50:02 -0700 Subject: [PATCH] docs: data updated --- benchmarks/enron_emails/README.md | 67 ++-- .../enron_emails/evaluate_enron_emails.py | 113 ++++++- benchmarks/financebench/README.md | 24 +- .../financebench/evaluate_financebench.py | 138 ++++---- benchmarks/laion/README.md | 118 ++++--- benchmarks/laion/evaluate_laion.py | 78 ++++- benchmarks/llm_utils.py | 301 ++++++++++++++++++ pyproject.toml | 1 + uv.lock | 42 ++- 9 files changed, 749 insertions(+), 133 deletions(-) create mode 100644 benchmarks/llm_utils.py diff --git a/benchmarks/enron_emails/README.md b/benchmarks/enron_emails/README.md index 16d2271..fdeae69 100644 --- a/benchmarks/enron_emails/README.md +++ b/benchmarks/enron_emails/README.md @@ -1,18 +1,19 @@ # Enron Emails Benchmark -A retrieval-only benchmark for evaluating LEANN search on the Enron email corpus. It mirrors the structure and CLI of the existing FinanceBench and LAION benches, using stage-based evaluation focused on Recall@3. +A comprehensive RAG benchmark for evaluating LEANN search and generation on the Enron email corpus. It mirrors the structure and CLI of the existing FinanceBench and LAION benches, using stage-based evaluation with Recall@3 and generation timing. - Dataset: Enron email CSV (e.g., Kaggle wcukierski/enron-email-dataset) for passages - Queries: corbt/enron_emails_sample_questions (filtered for realistic questions) -- Metric: Recall@3 vs FAISS Flat baseline +- Metrics: Recall@3 vs FAISS Flat baseline + Generation evaluation with Qwen3-8B ## Layout benchmarks/enron_emails/ - setup_enron_emails.py: Prepare passages, build LEANN index, build FAISS baseline -- evaluate_enron_emails.py: Evaluate retrieval recall (Stage 2) +- evaluate_enron_emails.py: Evaluate retrieval recall (Stages 2-5) + generation with Qwen3-8B - data/: Generated passages, queries, embeddings-related files - baseline/: FAISS Flat baseline files +- llm_utils.py: LLM utilities for Qwen3-8B generation (in parent directory) ## Quickstart @@ -41,23 +42,33 @@ Stage 3 uses binary search over complexity to find the minimal value achieving t 4) Index comparison (Stage 4) -python evaluate_enron_emails.py --index data/enron_index_hnsw.leann --stage 4 --max-queries 100 --output results.json +python evaluate_enron_emails.py --index data/enron_index_hnsw.leann --stage 4 --complexity 88 --max-queries 100 --output results.json + +5) Generation evaluation (Stage 5) + +python evaluate_enron_emails.py --index data/enron_index_hnsw.leann --stage 5 --complexity 88 --llm-backend hf --model-name Qwen/Qwen3-8B + +6) Combined index + generation evaluation (Stages 4+5, recommended) + +python evaluate_enron_emails.py --index data/enron_index_hnsw.leann --stage 45 --complexity 88 --llm-backend hf Notes: - Minimal CLI: you can run from repo root with only `--index`, defaults match financebench/laion patterns: - - `--stage` defaults to `all` (runs 2, 3, 4) + - `--stage` defaults to `all` (runs 2, 3, 4, 5) - `--baseline-dir` defaults to `baseline` - `--queries` defaults to `data/evaluation_queries.jsonl` (or falls back to the index directory) + - `--llm-backend` defaults to `hf` (HuggingFace), can use `vllm` + - `--model-name` defaults to `Qwen/Qwen3-8B` - Fail-fast behavior: no silent fallbacks. If compact index cannot run with recompute, it errors out. 
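To make the Stage 2 metric concrete, here is a rough sketch of how Recall@3 against the FAISS Flat baseline can be computed. This is a simplified illustration rather than the exact code in `evaluate_enron_emails.py`: the baseline index and its row-to-passage-id mapping are assumed to be preloaded from `baseline/`, and the overlap-based recall definition is an assumption.

```python
# Hedged sketch: Recall@3 of LEANN search vs. an exact FAISS Flat baseline.
# `baseline_index` is a faiss.IndexFlat* built over the same passage embeddings,
# and `row_to_id` maps baseline rows back to passage ids (both assumed preloaded).
import numpy as np
from leann import LeannSearcher
from sentence_transformers import SentenceTransformer


def recall_at_3(index_path, queries, baseline_index, row_to_id, complexity=88):
    searcher = LeannSearcher(index_path)
    encoder = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
    embs = encoder.encode(queries, normalize_embeddings=True).astype(np.float32)
    _, gt_rows = baseline_index.search(embs, 3)  # exact top-3 = ground truth
    overlap = 0
    for query, rows in zip(queries, gt_rows):
        gt_ids = {row_to_id[r] for r in rows}
        docs = searcher.search(query, top_k=3, complexity=complexity,
                               recompute_embeddings=True)
        overlap += len(gt_ids & {d.id for d in docs})
    searcher.cleanup()
    return overlap / (3 * len(queries))  # fraction of baseline top-3 recovered
```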
- -4) Index comparison (Stage 4) - -python evaluate_enron_emails.py --index data/enron_index_hnsw.leann --stage 4 --max-queries 100 --output results.json +- Stage 5 requires Stage 4 retrieval results. Use `--stage 45` to run both efficiently. Optional flags: - --queries data/evaluation_queries.jsonl (custom queries file) - --baseline-dir baseline (where FAISS baseline lives) -- --complexity 64 (LEANN complexity parameter) +- --complexity 88 (LEANN complexity parameter, optimal for 90% recall) +- --llm-backend hf|vllm (LLM backend for generation) +- --model-name Qwen/Qwen3-8B (LLM model for generation) +- --max-queries 1000 (limit number of queries for evaluation) ## Files Produced - data/enron_passages_preview.jsonl: Small preview of passages used (for inspection) @@ -66,8 +77,9 @@ Optional flags: - data/evaluation_queries.jsonl: Query file (id + query; includes GT IDs for reference) ## Notes -- We only evaluate retrieval Recall@3 (no generation). This matches the other benches’ style and stage flow. +- Evaluates both retrieval Recall@3 and generation timing with Qwen3-8B thinking model. - The emails CSV must contain a column named "message" (raw RFC822 email) and a column named "file" for source identifier. Message-ID headers are parsed as canonical message IDs when present. +- Qwen3-8B requires special handling for thinking models with chat templates and tag processing. ## Stages Summary @@ -80,16 +92,23 @@ Optional flags: - Stage 4 (Index Comparison): - Reports .index-only sizes for compact vs non-compact. - - Measures timings on 100 queries by default: non-compact (no recompute) vs compact (with recompute). + - Measures timings on queries by default: non-compact (no recompute) vs compact (with recompute). + - Stores retrieval results for Stage 5 generation evaluation. - Fails fast if compact recompute cannot run. - If `--complexity` is not provided, the script tries to use the best complexity from Stage 3: - First from the current run (when running `--stage all`), otherwise - From `enron_stage3_results.json` saved next to the index during the last Stage 3 run. - If neither exists, Stage 4 will error and ask you to run Stage 3 or pass `--complexity`. +- Stage 5 (Generation Evaluation): + - Uses Qwen3-8B thinking model for RAG generation on retrieved documents from Stage 4. + - Supports HuggingFace (`hf`) and vLLM (`vllm`) backends. + - Measures generation timing separately from search timing. + - Requires Stage 4 results (no additional searching performed). + ## Example Results -These are sample results obtained on a subset of Enron data using all-mpnet-base-v2. +These are sample results obtained on Enron data using all-mpnet-base-v2 and Qwen3-8B. 
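The "binary search over complexity" behind the Stage 3 numbers below boils down to the loop sketched here. It assumes Recall@3 is roughly monotonic in complexity and that a hypothetical `recall_fn(c)` wraps the Stage 2 evaluation at complexity `c`; the 1-256 bounds are illustrative defaults.

```python
# Minimal sketch of Stage 3: find the smallest complexity meeting the recall target.
def find_min_complexity(recall_fn, target=0.90, lo=1, hi=256):
    best = None
    while lo <= hi:
        mid = (lo + hi) // 2
        if recall_fn(mid) >= target:
            best, hi = mid, mid - 1  # good enough; try a smaller complexity
        else:
            lo = mid + 1             # too low; search the upper half
    return best                      # None if even `hi` misses the target
```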
- Stage 3 (Binary Search): - Minimal complexity achieving 90% Recall@3: 88 @@ -103,14 +122,20 @@ These are sample results obtained on a subset of Enron data using all-mpnet-base - C=256 β†’ 92.0% Recall@3 - Stage 4 (Index Sizes, .index only): - - Compact: ~2.17 MB - - Non-compact: ~82.03 MB - - Storage saving by compact: ~97.35% + - Compact: ~2.2 MB + - Non-compact: ~82.0 MB + - Storage saving by compact: ~97.3% -- Stage 4 (Timing, 100 queries, complexity=88): - - Non-compact (no recompute): ~0.0074 s avg per query - - Compact (with recompute): ~1.947 s avg per query +- Stage 4 (Search Timing, 988 queries, complexity=88): + - Non-compact (no recompute): ~0.0075 s avg per query + - Compact (with recompute): ~1.981 s avg per query - Speed ratio (non-compact/compact): ~0.0038x -Full JSON output for Stage 4 is saved by the script (see `--output`), e.g.: -`benchmarks/enron_emails/results_enron_stage4.json`. +- Stage 5 (RAG Generation, 988 queries, Qwen3-8B): + - Average generation time: ~22.302 s per query + - Total queries processed: 988 + - LLM backend: HuggingFace transformers + - Model: Qwen/Qwen3-8B (thinking model with processing) + +Full JSON output is saved by the script (see `--output`), e.g.: +`benchmarks/enron_emails/results_enron_stage45.json`. diff --git a/benchmarks/enron_emails/evaluate_enron_emails.py b/benchmarks/enron_emails/evaluate_enron_emails.py index e8c3db0..c6e8518 100644 --- a/benchmarks/enron_emails/evaluate_enron_emails.py +++ b/benchmarks/enron_emails/evaluate_enron_emails.py @@ -7,13 +7,22 @@ On errors, fail fast without fallbacks. import argparse import json +import logging import os import pickle +from pathlib import Path import numpy as np from leann import LeannBuilder, LeannSearcher from leann_backend_hnsw import faiss +from ..llm_utils import generate_hf, generate_vllm, load_hf_model, load_vllm_model + +# Setup logging to reduce verbose output +logging.basicConfig(level=logging.WARNING) +logging.getLogger("leann.api").setLevel(logging.WARNING) +logging.getLogger("leann_backend_hnsw").setLevel(logging.WARNING) + class RecallEvaluator: """Stage 2: Evaluate Recall@3 (LEANN vs FAISS)""" @@ -119,7 +128,6 @@ class EnronEvaluator: def analyze_index_sizes(self) -> dict: """Analyze index sizes (.index only), similar to LAION bench.""" - from pathlib import Path print("πŸ“ Analyzing index sizes (.index only)...") index_path = Path(self.index_path) @@ -150,7 +158,6 @@ class EnronEvaluator: def create_non_compact_index_for_comparison(self, non_compact_index_path: str) -> dict: """Create a non-compact index for comparison using current passages and embeddings.""" - from pathlib import Path current_index_path = Path(self.index_path) current_index_dir = current_index_path.parent @@ -230,6 +237,7 @@ class EnronEvaluator: "compact": {"search_times": []}, "avg_search_times": {}, "speed_ratio": 0.0, + "retrieval_results": [], # Store retrieval results for Stage 5 } print("⚑ Comparing search performance between indexes...") @@ -248,10 +256,15 @@ class EnronEvaluator: compact_searcher = LeannSearcher(compact_path) for q in test_queries: t0 = time.time() - _ = compact_searcher.search( + docs = compact_searcher.search( q, top_k=3, complexity=complexity, recompute_embeddings=True ) results["compact"]["search_times"].append(time.time() - t0) + + # Store retrieval results for Stage 5 + results["retrieval_results"].append( + {"query": q, "retrieved_docs": [{"id": doc.id, "text": doc.text} for doc in docs]} + ) compact_searcher.cleanup() if results["non_compact"]["search_times"]: @@ 
-358,9 +371,9 @@ def main(): ) parser.add_argument( "--stage", - choices=["2", "3", "4", "all"], + choices=["2", "3", "4", "5", "all", "45"], default="all", - help="Which stage to run (2=recall, 3=complexity, 4=index comparison)", + help="Which stage to run (2=recall, 3=complexity, 4=index comparison, 5=generation)", ) parser.add_argument("--complexity", type=int, default=None, help="LEANN search complexity") parser.add_argument("--baseline-dir", default="baseline", help="Baseline output directory") @@ -371,6 +384,8 @@ def main(): "--target-recall", type=float, default=0.90, help="Target Recall@3 for Stage 3" ) parser.add_argument("--output", help="Save results to JSON file") + parser.add_argument("--llm-backend", choices=["hf", "vllm"], default="hf", help="LLM backend") + parser.add_argument("--model-name", default="Qwen/Qwen3-8B", help="Model name") args = parser.parse_args() @@ -438,7 +453,7 @@ def main(): enron_eval.cleanup() print("βœ… Stage 3 completed!\n") - if args.stage in ("4", "all"): + if args.stage in ("4", "all", "45"): print("πŸš€ Starting Stage 4: Index size + performance comparison") evaluator = RecallEvaluator(args.index, args.baseline_dir) enron_eval = EnronEvaluator(args.index) @@ -503,6 +518,92 @@ def main(): enron_eval.cleanup() print("βœ… Stage 4 completed!\n") + if args.stage in ("5", "all"): + print("πŸš€ Starting Stage 5: Generation evaluation with Qwen3-8B") + + # Check if Stage 4 results exist + if "stage4" not in results_out or "performance_comparison" not in results_out["stage4"]: + print("❌ Stage 5 requires Stage 4 retrieval results") + print("πŸ’‘ Run Stage 4 first or use --stage all") + raise SystemExit(1) + + retrieval_results = results_out["stage4"]["performance_comparison"]["retrieval_results"] + if not retrieval_results: + print("❌ No retrieval results found from Stage 4") + raise SystemExit(1) + + print(f"πŸ“ Using {len(retrieval_results)} retrieval results from Stage 4") + + # Load LLM + try: + if args.llm_backend == "hf": + tokenizer, model = load_hf_model(args.model_name) + + def llm_func(prompt): + return generate_hf(tokenizer, model, prompt) + else: # vllm + llm, sampling_params = load_vllm_model(args.model_name) + + def llm_func(prompt): + return generate_vllm(llm, sampling_params, prompt) + + # Run generation using stored retrieval results + import time + + from llm_utils import create_prompt + + generation_times = [] + responses = [] + + print("πŸ€– Running generation on pre-retrieved results...") + for i, item in enumerate(retrieval_results): + query = item["query"] + retrieved_docs = item["retrieved_docs"] + + # Prepare context from retrieved docs + context = "\n\n".join([doc["text"] for doc in retrieved_docs]) + prompt = create_prompt(context, query, "emails") + + # Time generation only + gen_start = time.time() + response = llm_func(prompt) + gen_time = time.time() - gen_start + + generation_times.append(gen_time) + responses.append(response) + + if i < 3: + print(f" Q{i + 1}: Gen={gen_time:.3f}s") + + avg_gen_time = sum(generation_times) / len(generation_times) + + print("\nπŸ“Š Generation Results:") + print(f" Total Queries: {len(retrieval_results)}") + print(f" Avg Generation Time: {avg_gen_time:.3f}s") + print(" (Search time from Stage 4)") + + results_out["stage5"] = { + "total_queries": len(retrieval_results), + "avg_generation_time": avg_gen_time, + "generation_times": generation_times, + "responses": responses, + } + + # Show sample results + print("\nπŸ“ Sample Results:") + for i in range(min(3, len(retrieval_results))): + query 
= retrieval_results[i]["query"] + response = responses[i] + print(f" Q{i + 1}: {query[:60]}...") + print(f" A{i + 1}: {response[:100]}...") + print() + + except Exception as e: + print(f"❌ Generation evaluation failed: {e}") + print("πŸ’‘ Make sure transformers/vllm is installed and model is available") + + print("βœ… Stage 5 completed!\n") + if args.output and results_out: with open(args.output, "w", encoding="utf-8") as f: json.dump(results_out, f, indent=2) diff --git a/benchmarks/financebench/README.md b/benchmarks/financebench/README.md index 5683c9e..aefe9d6 100644 --- a/benchmarks/financebench/README.md +++ b/benchmarks/financebench/README.md @@ -45,9 +45,9 @@ This will: # Basic retrieval evaluation python evaluate_financebench.py --index data/index/financebench_full_hnsw.leann -# Include QA evaluation with OpenAI -export OPENAI_API_KEY="your-key" -python evaluate_financebench.py --index data/index/financebench_full_hnsw.leann --qa-samples 20 + +# RAG generation evaluation with Qwen3-8B +python evaluate_financebench.py --index data/index/financebench_full_hnsw.leann --stage 4 --complexity 64 --llm-backend hf --model-name Qwen/Qwen3-8B --output results_qwen3.json ``` ## Evaluation Methods @@ -85,6 +85,24 @@ LLM-based answer evaluation using GPT-4o: *Note: Number match rate >100% indicates multiple retrieved documents contain the same financial figures, which is expected behavior for financial data appearing across multiple document sections. +### LEANN-RAG Generation Performance (Qwen3-8B) + +- **Stage 4 (Index Comparison):** + - Compact Index: 5.0 MB + - Non-compact Index: 172.2 MB + - **Storage Saving**: 97.1% +- **Search Performance**: + - Non-compact (no recompute): 0.009s avg per query + - Compact (with recompute): 2.203s avg per query + - Speed ratio: 0.004x + +**Generation Evaluation (20 queries, complexity=64):** +- **Average Search Time**: 1.638s per query +- **Average Generation Time**: 45.957s per query +- **LLM Backend**: HuggingFace transformers +- **Model**: Qwen/Qwen3-8B (thinking model with processing) +- **Total Questions Processed**: 20 + ## Options ```bash diff --git a/benchmarks/financebench/evaluate_financebench.py b/benchmarks/financebench/evaluate_financebench.py index 77758c9..803d57e 100755 --- a/benchmarks/financebench/evaluate_financebench.py +++ b/benchmarks/financebench/evaluate_financebench.py @@ -4,20 +4,25 @@ FinanceBench Evaluation Script - Modular Recall-based Evaluation import argparse import json +import logging import os import pickle import time +from pathlib import Path from typing import Optional import numpy as np import openai - -# Import LEANN modules - this will bring in the modified faiss from leann import LeannChat, LeannSearcher - -# Import LEANN's modified faiss directly from leann_backend_hnsw import faiss +from ..llm_utils import evaluate_rag, generate_hf, generate_vllm, load_hf_model, load_vllm_model + +# Setup logging to reduce verbose output +logging.basicConfig(level=logging.WARNING) +logging.getLogger("leann.api").setLevel(logging.WARNING) +logging.getLogger("leann_backend_hnsw").setLevel(logging.WARNING) + class RecallEvaluator: """Stage 2: Evaluate Recall@3 (searcher vs baseline)""" @@ -125,7 +130,6 @@ class FinanceBenchEvaluator: def analyze_index_sizes(self) -> dict: """Analyze index sizes with and without embeddings""" - from pathlib import Path print("πŸ“ Analyzing index sizes...") @@ -136,7 +140,6 @@ class FinanceBenchEvaluator: sizes = {} total_with_embeddings = 0 - total_without_embeddings = 0 # Core index files 
index_file = index_dir / f"{index_name}.index" @@ -155,28 +158,14 @@ class FinanceBenchEvaluator: sizes[name] = size_mb total_with_embeddings += size_mb - # For pruned index calculation, exclude the main index file (contains embeddings) - if name != "index": - total_without_embeddings += size_mb else: sizes[name] = 0 - # Estimate pruned index size (approximate) - # When embeddings are removed, the main index file becomes much smaller - # Rough estimate: graph structure is ~10-20% of full index size - estimated_pruned_index_size = sizes["index"] * 0.15 # Conservative estimate - total_without_embeddings += estimated_pruned_index_size - sizes["total_with_embeddings"] = total_with_embeddings - sizes["total_without_embeddings"] = total_without_embeddings - sizes["estimated_pruned_index"] = estimated_pruned_index_size - sizes["compression_ratio"] = ( - total_without_embeddings / total_with_embeddings if total_with_embeddings > 0 else 0 - ) + sizes["index_only_mb"] = sizes["index"] # Just the .index file for fair comparison - print(f" πŸ“ Index with embeddings: {total_with_embeddings:.1f} MB") - print(f" πŸ“ Estimated pruned index: {total_without_embeddings:.1f} MB") - print(f" πŸ—œοΈ Compression ratio: {sizes['compression_ratio']:.2f}x") + print(f" πŸ“ Total index size: {total_with_embeddings:.1f} MB") + print(f" πŸ“ Index file only: {sizes['index']:.1f} MB") return sizes @@ -185,7 +174,6 @@ class FinanceBenchEvaluator: print("πŸ—οΈ Building compact index from existing passages...") # Load existing passages from current index - from pathlib import Path from leann import LeannBuilder @@ -241,7 +229,6 @@ class FinanceBenchEvaluator: print("πŸ—οΈ Building non-compact index from existing passages...") # Load existing passages from current index - from pathlib import Path from leann import LeannBuilder @@ -555,13 +542,7 @@ Respond with exactly one word: "CORRECT" if the generated answer is factually ac # Legacy single index analysis (fallback) if "total_with_embeddings" in timing_metrics and "current_index" not in timing_metrics: print("\nπŸ“ Index Size Analysis:") - print( - f" Index with embeddings: {timing_metrics.get('total_with_embeddings', 0):.1f} MB" - ) - print( - f" Estimated pruned index: {timing_metrics.get('total_without_embeddings', 0):.1f} MB" - ) - print(f" Compression ratio: {timing_metrics.get('compression_ratio', 0):.2f}x") + print(f" Total index size: {timing_metrics.get('total_with_embeddings', 0):.1f} MB") print("\nπŸ“Š Accuracy:") print(f" Accuracy: {timing_metrics.get('accuracy', 0) * 100:.1f}%") @@ -610,6 +591,10 @@ def main(): parser.add_argument("--baseline-dir", default="baseline", help="Baseline output directory") parser.add_argument("--openai-api-key", help="OpenAI API key for generation evaluation") parser.add_argument("--output", help="Save results to JSON file") + parser.add_argument( + "--llm-backend", choices=["openai", "hf", "vllm"], default="openai", help="LLM backend" + ) + parser.add_argument("--model-name", default="Qwen3-8B", help="Model name for HF/vLLM") args = parser.parse_args() @@ -768,7 +753,9 @@ def main(): print("πŸš€ Starting Stage 4: Comprehensive evaluation with dual index comparison") # Use FinanceBench evaluator for QA evaluation - evaluator = FinanceBenchEvaluator(args.index, args.openai_api_key) + evaluator = FinanceBenchEvaluator( + args.index, args.openai_api_key if args.llm_backend == "openai" else None + ) print("πŸ“– Loading FinanceBench dataset...") data = evaluator.load_dataset(args.dataset) @@ -802,20 +789,13 @@ def main(): 
print( f" Non-compact index: {non_compact_size_metrics['total_with_embeddings']:.1f} MB" ) - _ = ( - ( - non_compact_size_metrics["total_with_embeddings"] - - compact_size_metrics["total_with_embeddings"] - ) - / compact_size_metrics["total_with_embeddings"] - * 100 - ) + print("\nπŸ“Š Index-only size comparison (.index file only):") + print(f" Compact index: {compact_size_metrics['index_only_mb']:.1f} MB") + print(f" Non-compact index: {non_compact_size_metrics['index_only_mb']:.1f} MB") + # Use index-only size for fair comparison (same as Enron emails) storage_saving = ( - ( - non_compact_size_metrics["total_with_embeddings"] - - compact_size_metrics["total_with_embeddings"] - ) - / non_compact_size_metrics["total_with_embeddings"] + (non_compact_size_metrics["index_only_mb"] - compact_size_metrics["index_only_mb"]) + / non_compact_size_metrics["index_only_mb"] * 100 ) print(f" Storage saving by compact: {storage_saving:.1f}%") @@ -829,15 +809,58 @@ def main(): non_compact_index_path, args.index, data[:10], complexity=complexity ) - # Step 5: Timing breakdown evaluation WITH recompute (production mode) + # Step 5: Generation evaluation test_samples = 20 - print(f"\nπŸ§ͺ Testing with first {test_samples} samples for timing analysis") - print( - "\nπŸ”πŸ€– Running timing breakdown evaluation (WITH recompute - production mode)..." - ) - evaluation_start = time.time() - timing_metrics = evaluator.evaluate_timing_breakdown(data[:test_samples]) - evaluation_time = time.time() - evaluation_start + print(f"\nπŸ§ͺ Testing with first {test_samples} samples for generation analysis") + + if args.llm_backend == "openai" and args.openai_api_key: + print("πŸ”πŸ€– Running OpenAI-based generation evaluation...") + evaluation_start = time.time() + timing_metrics = evaluator.evaluate_timing_breakdown(data[:test_samples]) + evaluation_time = time.time() - evaluation_start + else: + print( + f"πŸ”πŸ€– Running {args.llm_backend} generation evaluation with {args.model_name}..." 
+ ) + try: + # Load LLM + if args.llm_backend == "hf": + tokenizer, model = load_hf_model(args.model_name) + + def llm_func(prompt): + return generate_hf(tokenizer, model, prompt) + else: # vllm + llm, sampling_params = load_vllm_model(args.model_name) + + def llm_func(prompt): + return generate_vllm(llm, sampling_params, prompt) + + # Simple generation evaluation + queries = [item["question"] for item in data[:test_samples]] + gen_results = evaluate_rag( + evaluator.searcher, + llm_func, + queries, + domain="finance", + complexity=complexity, + ) + + timing_metrics = { + "total_questions": len(queries), + "avg_search_time": gen_results["avg_search_time"], + "avg_generation_time": gen_results["avg_generation_time"], + "results": gen_results["results"], + } + evaluation_time = time.time() + + except Exception as e: + print(f"❌ Generation evaluation failed: {e}") + timing_metrics = { + "total_questions": 0, + "avg_search_time": 0, + "avg_generation_time": 0, + } + evaluation_time = 0 # Combine all metrics combined_metrics = { @@ -849,8 +872,11 @@ def main(): "storage_saving_percent": storage_saving, } - # Print comprehensive results - evaluator._print_results(combined_metrics) + # Print results + print("\nπŸ“Š Generation Results:") + print(f" Total Questions: {timing_metrics.get('total_questions', 0)}") + print(f" Avg Search Time: {timing_metrics.get('avg_search_time', 0):.3f}s") + print(f" Avg Generation Time: {timing_metrics.get('avg_generation_time', 0):.3f}s") # Save results if requested if args.output: diff --git a/benchmarks/laion/README.md b/benchmarks/laion/README.md index 516f347..51dc721 100644 --- a/benchmarks/laion/README.md +++ b/benchmarks/laion/README.md @@ -1,6 +1,6 @@ # LAION Multimodal Benchmark -A multimodal benchmark for evaluating image retrieval performance using LEANN with CLIP embeddings on LAION dataset subset. +A multimodal benchmark for evaluating image retrieval and generation performance using LEANN with CLIP embeddings and Qwen2.5-VL for multimodal generation on LAION dataset subset. ## Overview @@ -9,6 +9,7 @@ This benchmark evaluates: - **Recall@K performance** for image search - **Complexity analysis** across different search parameters - **Index size and storage efficiency** +- **Multimodal generation** with Qwen2.5-VL for image understanding and description ## Dataset Configuration @@ -39,9 +40,13 @@ This will: python evaluate_laion.py --index data/laion_index.leann # Run specific stages -python evaluate_laion.py --index data/laion_index.leann --stage timing -python evaluate_laion.py --index data/laion_index.leann --stage recall -python evaluate_laion.py --index data/laion_index.leann --stage complexity +python evaluate_laion.py --index data/laion_index.leann --stage 2 # Recall evaluation +python evaluate_laion.py --index data/laion_index.leann --stage 3 # Complexity analysis +python evaluate_laion.py --index data/laion_index.leann --stage 4 # Index comparison +python evaluate_laion.py --index data/laion_index.leann --stage 5 # Multimodal generation + +# Multimodal generation with Qwen2.5-VL +python evaluate_laion.py --index data/laion_index.leann --stage 5 --model-name Qwen/Qwen2.5-VL-7B-Instruct ``` ### 3. 
Save results @@ -74,23 +79,26 @@ python evaluate_laion.py \ ## Evaluation Stages -### Stage 1: Index Analysis -- Analyzes index file sizes and metadata -- Reports storage efficiency - -### Stage 2: Search Timing -- Measures average search latency -- Tests with configurable complexity and top-k -- Reports searches per second - -### Stage 3: Recall Evaluation -- Evaluates Recall@K using ground truth +### Stage 2: Recall Evaluation +- Evaluates Recall@3 for multimodal retrieval +- Compares LEANN vs FAISS baseline performance - Self-recall: query caption should retrieve original image -### Stage 4: Complexity Analysis -- Tests performance across different complexity levels [16, 32, 64, 128] +### Stage 3: Complexity Analysis +- Binary search for optimal complexity (90% recall target) +- Tests performance across different complexity levels - Analyzes speed vs. accuracy tradeoffs +### Stage 4: Index Comparison +- Compares compact vs non-compact index sizes +- Measures search performance differences +- Reports storage efficiency and speed ratios + +### Stage 5: Multimodal Generation +- Uses Qwen2.5-VL for image understanding and description +- Retrieval-Augmented Generation (RAG) with multimodal context +- Measures both search and generation timing + ## Output Metrics ### Timing Metrics @@ -100,48 +108,70 @@ python evaluate_laion.py \ - Latency in milliseconds ### Recall Metrics -- Recall@K percentage +- Recall@3 percentage for image retrieval - Number of queries with ground truth ### Index Metrics - Total index size (MB) - Component breakdown (index, passages, metadata) +- Storage savings (compact vs non-compact) - Backend and embedding model info -## Example Results +### Generation Metrics (Stage 5) +- Average search time per query +- Average generation time per query +- Time distribution (search vs generation) +- Sample multimodal responses +- Model: Qwen2.5-VL performance + +## Benchmark Results + +### LEANN-RAG Performance (CLIP ViT-L/14 + Qwen2.5-VL) + +**Stage 3: Optimal Complexity Analysis** +- **Optimal Complexity**: 85 (achieving 90% Recall@3) +- **Binary Search Range**: 1-128 +- **Target Recall**: 90% +- **Index Type**: Non-compact (for fast binary search) + +**Stage 5: Multimodal Generation Performance (Qwen2.5-VL)** +- **Total Queries**: 20 +- **Average Search Time**: 1.200s per query +- **Average Generation Time**: 6.558s per query +- **Time Distribution**: Search 15.5%, Generation 84.5% +- **LLM Backend**: HuggingFace transformers +- **Model**: Qwen/Qwen2.5-VL-7B-Instruct +- **Optimal Complexity**: 85 + +**System Performance:** +- **Index Size**: ~10,000 image embeddings from LAION subset +- **Embedding Model**: CLIP ViT-L/14 (768 dimensions) +- **Backend**: HNSW with cosine distance + +### Example Results ``` 🎯 LAION MULTIMODAL BENCHMARK RESULTS ============================================================ -πŸ“ Index Information: - Total size: 145.2 MB - Backend: hnsw - Embedding model: clip-vit-b-32 - Total passages: 10000 +πŸ“Š Multimodal Generation Results: + Total Queries: 20 + Avg Search Time: 1.200s + Avg Generation Time: 6.558s + Time Distribution: Search 15.5%, Generation 84.5% + LLM Backend: HuggingFace transformers + Model: Qwen/Qwen2.5-VL-7B-Instruct -⚑ Search Performance: - Total queries: 200 - Average search time: 0.023s - Median search time: 0.021s - Min/Max search time: 0.012s / 0.089s - Std dev: 0.008s - Complexity: 64 - Top-K: 3 - -πŸ“Š Recall Performance: - Recall@3: 85.5% - Queries with ground truth: 200 - -βš™οΈ Complexity Analysis: - Complexity 16: 0.015s avg 
- Complexity 32: 0.019s avg - Complexity 64: 0.023s avg - Complexity 128: 0.031s avg +βš™οΈ Optimal Complexity Analysis: + Target Recall: 90% + Optimal Complexity: 85 + Binary Search Range: 1-128 + Non-compact Index (fast search, no recompute) πŸš€ Performance Summary: - Searches per second: 43.5 - Latency (ms): 23.0ms + Multimodal RAG: 7.758s total per query + Search: 15.5% of total time + Generation: 84.5% of total time ``` ## Directory Structure diff --git a/benchmarks/laion/evaluate_laion.py b/benchmarks/laion/evaluate_laion.py index 1383ae5..dd30635 100644 --- a/benchmarks/laion/evaluate_laion.py +++ b/benchmarks/laion/evaluate_laion.py @@ -4,6 +4,7 @@ LAION Multimodal Benchmark Evaluation Script - Modular Recall-based Evaluation import argparse import json +import logging import os import pickle import time @@ -14,6 +15,13 @@ from leann import LeannSearcher from leann_backend_hnsw import faiss from sentence_transformers import SentenceTransformer +from ..llm_utils import evaluate_multimodal_rag, load_qwen_vl_model + +# Setup logging to reduce verbose output +logging.basicConfig(level=logging.WARNING) +logging.getLogger("leann.api").setLevel(logging.WARNING) +logging.getLogger("leann_backend_hnsw").setLevel(logging.WARNING) + class RecallEvaluator: """Stage 2: Evaluate Recall@3 (LEANN vs FAISS baseline for multimodal retrieval)""" @@ -388,13 +396,22 @@ def main(): ) parser.add_argument( "--stage", - choices=["2", "3", "4", "all"], + choices=["2", "3", "4", "5", "all"], default="all", - help="Which stage to run (2=recall, 3=complexity, 4=index comparison)", + help="Which stage to run (2=recall, 3=complexity, 4=index comparison, 5=generation)", ) parser.add_argument("--complexity", type=int, default=None, help="Complexity for search") parser.add_argument("--baseline-dir", default="baseline", help="Baseline output directory") parser.add_argument("--output", help="Save results to JSON file") + parser.add_argument( + "--llm-backend", + choices=["hf"], + default="hf", + help="LLM backend (Qwen2.5-VL only supports HF)", + ) + parser.add_argument( + "--model-name", default="Qwen/Qwen2.5-VL-7B-Instruct", help="Multimodal model name" + ) args = parser.parse_args() @@ -615,12 +632,69 @@ def main(): evaluator.cleanup() print("βœ… Stage 4 completed!\n") + if args.stage in ("5", "all"): + print("πŸš€ Starting Stage 5: Multimodal generation with Qwen2.5-VL") + evaluator = LAIONEvaluator(args.index) + captions = evaluator.load_queries(args.queries) + test_captions = captions[: min(20, len(captions))] # Use subset for generation + + print(f"πŸ§ͺ Testing multimodal generation with {len(test_captions)} queries") + + # Load Qwen2.5-VL model + try: + print("Loading Qwen2.5-VL model...") + processor, model = load_qwen_vl_model(args.model_name) + + # Run multimodal generation evaluation + complexity = args.complexity or 64 + gen_results = evaluate_multimodal_rag( + evaluator.searcher, + test_captions, + processor=processor, + model=model, + complexity=complexity, + ) + + print("\nπŸ“Š Multimodal Generation Results:") + print(f" Total Queries: {len(test_captions)}") + print(f" Avg Search Time: {gen_results['avg_search_time']:.3f}s") + print(f" Avg Generation Time: {gen_results['avg_generation_time']:.3f}s") + total_time = gen_results["avg_search_time"] + gen_results["avg_generation_time"] + search_pct = (gen_results["avg_search_time"] / total_time) * 100 + gen_pct = (gen_results["avg_generation_time"] / total_time) * 100 + print(f" Time Distribution: Search {search_pct:.1f}%, Generation {gen_pct:.1f}%") + 
print(" LLM Backend: HuggingFace transformers") + print(f" Model: {args.model_name}") + + # Show sample results + print("\nπŸ“ Sample Multimodal Generations:") + for i, response in enumerate(gen_results["results"][:3]): + # Handle both string and dict formats for captions + if isinstance(test_captions[i], dict): + caption_text = test_captions[i].get("query", str(test_captions[i])) + else: + caption_text = str(test_captions[i]) + print(f" Query {i + 1}: {caption_text[:60]}...") + print(f" Response {i + 1}: {response[:100]}...") + print() + + except Exception as e: + print(f"❌ Multimodal generation evaluation failed: {e}") + print("πŸ’‘ Make sure transformers and Qwen2.5-VL are installed") + import traceback + + traceback.print_exc() + + evaluator.cleanup() + print("βœ… Stage 5 completed!\n") + if args.stage == "all": print("πŸŽ‰ All evaluation stages completed successfully!") print("\nπŸ“‹ Summary:") print(" Stage 2: βœ… Multimodal Recall@3 evaluation completed") print(" Stage 3: βœ… Optimal complexity found") print(" Stage 4: βœ… Index comparison analysis completed") + print(" Stage 5: βœ… Multimodal generation evaluation completed") print("\nπŸ”§ Recommended next steps:") print(" - Use optimal complexity for best speed/accuracy balance") print(" - Review index comparison for storage vs performance tradeoffs") diff --git a/benchmarks/llm_utils.py b/benchmarks/llm_utils.py new file mode 100644 index 0000000..9a8217c --- /dev/null +++ b/benchmarks/llm_utils.py @@ -0,0 +1,301 @@ +""" +LLM utils for RAG benchmarks with Qwen3-8B and Qwen2.5-VL (multimodal) +""" + +import time + +try: + import torch + from transformers import AutoModelForCausalLM, AutoTokenizer + + HF_AVAILABLE = True +except ImportError: + HF_AVAILABLE = False + +try: + from vllm import LLM, SamplingParams + + VLLM_AVAILABLE = True +except ImportError: + VLLM_AVAILABLE = False + + +def is_qwen3_model(model_name): + """Check if model is Qwen3""" + return "Qwen3" in model_name or "qwen3" in model_name.lower() + + +def is_qwen_vl_model(model_name): + """Check if model is Qwen2.5-VL""" + return "Qwen2.5-VL" in model_name or "qwen2.5-vl" in model_name.lower() + + +def apply_qwen3_chat_template(tokenizer, prompt): + """Apply Qwen3 chat template with thinking enabled""" + messages = [{"role": "user", "content": prompt}] + return tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True, + enable_thinking=True, + ) + + +def extract_thinking_answer(response): + """Extract final answer from Qwen3 thinking model response""" + if "" in response and "" in response: + try: + think_end = response.index("") + len("") + final_answer = response[think_end:].strip() + return final_answer + except (ValueError, IndexError): + pass + + return response.strip() + + +def load_hf_model(model_name="Qwen/Qwen3-8B"): + """Load HuggingFace model""" + if not HF_AVAILABLE: + raise ImportError("transformers not available") + + print(f"Loading HF: {model_name}") + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained( + model_name, + torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32, + device_map="auto", + trust_remote_code=True, + ) + return tokenizer, model + + +def load_vllm_model(model_name="Qwen/Qwen3-8B"): + """Load vLLM model""" + if not VLLM_AVAILABLE: + raise ImportError("vllm not available") + + print(f"Loading vLLM: {model_name}") + llm = LLM(model=model_name, trust_remote_code=True) + + # Qwen3 specific config + if 
is_qwen3_model(model_name): + stop_tokens = ["<|im_end|>", "<|end_of_text|>"] + max_tokens = 2048 + else: + stop_tokens = None + max_tokens = 1024 + + sampling_params = SamplingParams(temperature=0.7, max_tokens=max_tokens, stop=stop_tokens) + return llm, sampling_params + + +def generate_hf(tokenizer, model, prompt, max_tokens=None): + """Generate with HF - supports Qwen3 thinking models""" + model_name = getattr(model, "name_or_path", "unknown") + is_qwen3 = is_qwen3_model(model_name) + + # Apply chat template for Qwen3 + if is_qwen3: + prompt = apply_qwen3_chat_template(tokenizer, prompt) + max_tokens = max_tokens or 2048 + else: + max_tokens = max_tokens or 1024 + + inputs = tokenizer(prompt, return_tensors="pt").to(model.device) + with torch.no_grad(): + outputs = model.generate( + **inputs, + max_new_tokens=max_tokens, + temperature=0.7, + do_sample=True, + pad_token_id=tokenizer.eos_token_id, + ) + response = tokenizer.decode(outputs[0], skip_special_tokens=True) + response = response[len(prompt) :].strip() + + # Extract final answer for thinking models + if is_qwen3: + return extract_thinking_answer(response) + return response + + +def generate_vllm(llm, sampling_params, prompt): + """Generate with vLLM - supports Qwen3 thinking models""" + outputs = llm.generate([prompt], sampling_params) + response = outputs[0].outputs[0].text.strip() + + # Extract final answer for Qwen3 thinking models + model_name = str(llm.llm_engine.model_config.model) + if is_qwen3_model(model_name): + return extract_thinking_answer(response) + return response + + +def create_prompt(context, query, domain="default"): + """Create RAG prompt""" + if domain == "emails": + return f"Email content:\n{context}\n\nQuestion: {query}\n\nAnswer:" + elif domain == "finance": + return f"Financial content:\n{context}\n\nQuestion: {query}\n\nAnswer:" + elif domain == "multimodal": + return f"Image context:\n{context}\n\nQuestion: {query}\n\nAnswer:" + else: + return f"Context: {context}\n\nQuestion: {query}\n\nAnswer:" + + +def evaluate_rag(searcher, llm_func, queries, domain="default", top_k=3, complexity=64): + """Simple RAG evaluation with timing""" + search_times = [] + gen_times = [] + results = [] + + for i, query in enumerate(queries): + # Search + start = time.time() + docs = searcher.search(query, top_k=top_k, complexity=complexity) + search_time = time.time() - start + + # Generate + context = "\n\n".join([doc.text for doc in docs]) + prompt = create_prompt(context, query, domain) + + start = time.time() + response = llm_func(prompt) + gen_time = time.time() - start + + search_times.append(search_time) + gen_times.append(gen_time) + results.append(response) + + if i < 3: + print(f"Q{i + 1}: Search={search_time:.3f}s, Gen={gen_time:.3f}s") + + return { + "avg_search_time": sum(search_times) / len(search_times), + "avg_generation_time": sum(gen_times) / len(gen_times), + "results": results, + } + + +def load_qwen_vl_model(model_name="Qwen/Qwen2.5-VL-7B-Instruct"): + """Load Qwen2.5-VL multimodal model""" + if not HF_AVAILABLE: + raise ImportError("transformers not available") + + print(f"Loading Qwen2.5-VL: {model_name}") + + try: + from transformers import AutoModelForVision2Seq, AutoProcessor + + processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True) + model = AutoModelForVision2Seq.from_pretrained( + model_name, torch_dtype=torch.bfloat16, device_map="auto", trust_remote_code=True + ) + + return processor, model + + except Exception as e: + print(f"Failed to load with 
AutoModelForVision2Seq, trying specific class: {e}") + + # Fallback to specific class + try: + from transformers import AutoProcessor, Qwen2VLForConditionalGeneration + + processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True) + model = Qwen2VLForConditionalGeneration.from_pretrained( + model_name, torch_dtype=torch.bfloat16, device_map="auto", trust_remote_code=True + ) + + return processor, model + + except Exception as e2: + raise ImportError(f"Failed to load Qwen2.5-VL model: {e2}") + + +def generate_qwen_vl(processor, model, prompt, image_path=None, max_tokens=512): + """Generate with Qwen2.5-VL multimodal model""" + from PIL import Image + + # Prepare inputs + if image_path: + image = Image.open(image_path) + inputs = processor(text=prompt, images=image, return_tensors="pt").to(model.device) + else: + inputs = processor(text=prompt, return_tensors="pt").to(model.device) + + # Generate + with torch.no_grad(): + generated_ids = model.generate( + **inputs, max_new_tokens=max_tokens, do_sample=False, temperature=0.1 + ) + + # Decode response + generated_ids = generated_ids[:, inputs["input_ids"].shape[1] :] + response = processor.decode(generated_ids[0], skip_special_tokens=True) + + return response + + +def create_multimodal_prompt(context, query, image_descriptions, task_type="images"): + """Create prompt for multimodal RAG""" + if task_type == "images": + return f"""Based on the retrieved images and their descriptions, answer the following question. + +Retrieved Image Descriptions: +{context} + +Question: {query} + +Provide a detailed answer based on the visual content described above.""" + + return f"Context: {context}\nQuestion: {query}\nAnswer:" + + +def evaluate_multimodal_rag(searcher, queries, processor=None, model=None, complexity=64): + """Evaluate multimodal RAG with Qwen2.5-VL""" + search_times = [] + gen_times = [] + results = [] + + for i, query_item in enumerate(queries): + # Handle both string and dict formats for queries + if isinstance(query_item, dict): + query = query_item.get("query", "") + image_path = query_item.get("image_path") # Optional reference image + else: + query = str(query_item) + image_path = None + + # Search + start_time = time.time() + search_results = searcher.search(query, top_k=3, complexity=complexity) + search_time = time.time() - start_time + search_times.append(search_time) + + # Prepare context from search results + context_parts = [] + for result in search_results: + context_parts.append(f"- {result.text}") + context = "\n".join(context_parts) + + # Generate with multimodal model + start_time = time.time() + if processor and model: + prompt = create_multimodal_prompt(context, query, context_parts) + response = generate_qwen_vl(processor, model, prompt, image_path) + else: + response = f"Context: {context}" + gen_time = time.time() - start_time + + gen_times.append(gen_time) + results.append(response) + + if i < 3: + print(f"Q{i + 1}: Search={search_time:.3f}s, Gen={gen_time:.3f}s") + + return { + "avg_search_time": sum(search_times) / len(search_times), + "avg_generation_time": sum(gen_times) / len(gen_times), + "results": results, + } diff --git a/pyproject.toml b/pyproject.toml index d738017..35e5613 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -53,6 +53,7 @@ dependencies = [ "tree-sitter-java>=0.20.0", "tree-sitter-c-sharp>=0.20.0", "tree-sitter-typescript>=0.20.0", + "torchvision>=0.23.0", ] [project.optional-dependencies] diff --git a/uv.lock b/uv.lock index d4fa777..d01612b 100644 --- a/uv.lock +++ 
b/uv.lock @@ -1564,7 +1564,7 @@ name = "importlib-metadata" version = "8.7.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "zipp" }, + { name = "zipp", marker = "python_full_version < '3.10'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/76/66/650a33bd90f786193e4de4b3ad86ea60b53c89b669a5c7be931fac31cdb0/importlib_metadata-8.7.0.tar.gz", hash = "sha256:d13b81ad223b890aa16c5471f2ac3056cf76c5f10f82d6f9292f0b415f389000", size = 56641, upload-time = "2025-04-27T15:29:01.736Z" } wheels = [ @@ -2257,6 +2257,7 @@ dependencies = [ { name = "sentence-transformers" }, { name = "sglang" }, { name = "torch" }, + { name = "torchvision" }, { name = "tqdm" }, { name = "tree-sitter", version = "0.23.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, { name = "tree-sitter", version = "0.25.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, @@ -2346,6 +2347,7 @@ requires-dist = [ { name = "sentence-transformers", specifier = ">=2.2.0" }, { name = "sglang" }, { name = "torch" }, + { name = "torchvision", specifier = ">=0.23.0" }, { name = "tqdm" }, { name = "tree-sitter", specifier = ">=0.20.0" }, { name = "tree-sitter-c-sharp", specifier = ">=0.20.0" }, @@ -5870,6 +5872,44 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ae/95/ae26263aceb3d57b821179f827d0e321373ed49423e603dd5906ab14a730/torch-2.8.0-cp39-none-macosx_11_0_arm64.whl", hash = "sha256:e9f071f5b52a9f6970dc8a919694b27a91ae9dc08898b2b988abbef5eddfd1ae", size = 73610795, upload-time = "2025-08-06T14:57:11.513Z" }, ] +[[package]] +name = "torchvision" +version = "0.23.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.10.*'" }, + { name = "numpy", version = "2.3.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, + { name = "pillow" }, + { name = "torch" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/4d/49/5ad5c3ff4920be0adee9eb4339b4fb3b023a0fc55b9ed8dbc73df92946b8/torchvision-0.23.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:7266871daca00ad46d1c073e55d972179d12a58fa5c9adec9a3db9bbed71284a", size = 1856885, upload-time = "2025-08-06T14:57:55.024Z" }, + { url = "https://files.pythonhosted.org/packages/25/44/ddd56d1637bac42a8c5da2c8c440d8a28c431f996dd9790f32dd9a96ca6e/torchvision-0.23.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:31c583ba27426a3a04eca8c05450524105c1564db41be6632f7536ef405a6de2", size = 2394251, upload-time = "2025-08-06T14:58:01.725Z" }, + { url = "https://files.pythonhosted.org/packages/93/f3/3cdf55bbf0f737304d997561c34ab0176222e0496b6743b0feab5995182c/torchvision-0.23.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:3932bf67256f2d095ce90a9f826f6033694c818856f4bb26794cf2ce64253e53", size = 8627497, upload-time = "2025-08-06T14:58:09.317Z" }, + { url = "https://files.pythonhosted.org/packages/97/90/02afe57c3ef4284c5cf89d3b7ae203829b3a981f72b93a7dd2a3fd2c83c1/torchvision-0.23.0-cp310-cp310-win_amd64.whl", hash = "sha256:83ee5bf827d61a8af14620c0a61d8608558638ac9c3bac8adb7b27138e2147d1", size = 1600760, upload-time = "2025-08-06T14:57:56.783Z" }, + { url = 
"https://files.pythonhosted.org/packages/f0/d7/15d3d7bd8d0239211b21673d1bac7bc345a4ad904a8e25bb3fd8a9cf1fbc/torchvision-0.23.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:49aa20e21f0c2bd458c71d7b449776cbd5f16693dd5807195a820612b8a229b7", size = 1856884, upload-time = "2025-08-06T14:58:00.237Z" }, + { url = "https://files.pythonhosted.org/packages/dd/14/7b44fe766b7d11e064c539d92a172fa9689a53b69029e24f2f1f51e7dc56/torchvision-0.23.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:01dc33ee24c79148aee7cdbcf34ae8a3c9da1674a591e781577b716d233b1fa6", size = 2395543, upload-time = "2025-08-06T14:58:04.373Z" }, + { url = "https://files.pythonhosted.org/packages/79/9c/fcb09aff941c8147d9e6aa6c8f67412a05622b0c750bcf796be4c85a58d4/torchvision-0.23.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:35c27941831b653f5101edfe62c03d196c13f32139310519e8228f35eae0e96a", size = 8628388, upload-time = "2025-08-06T14:58:07.802Z" }, + { url = "https://files.pythonhosted.org/packages/93/40/3415d890eb357b25a8e0a215d32365a88ecc75a283f75c4e919024b22d97/torchvision-0.23.0-cp311-cp311-win_amd64.whl", hash = "sha256:09bfde260e7963a15b80c9e442faa9f021c7e7f877ac0a36ca6561b367185013", size = 1600741, upload-time = "2025-08-06T14:57:59.158Z" }, + { url = "https://files.pythonhosted.org/packages/df/1d/0ea0b34bde92a86d42620f29baa6dcbb5c2fc85990316df5cb8f7abb8ea2/torchvision-0.23.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:e0e2c04a91403e8dd3af9756c6a024a1d9c0ed9c0d592a8314ded8f4fe30d440", size = 1856885, upload-time = "2025-08-06T14:58:06.503Z" }, + { url = "https://files.pythonhosted.org/packages/e2/00/2f6454decc0cd67158c7890364e446aad4b91797087a57a78e72e1a8f8bc/torchvision-0.23.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:6dd7c4d329a0e03157803031bc856220c6155ef08c26d4f5bbac938acecf0948", size = 2396614, upload-time = "2025-08-06T14:58:03.116Z" }, + { url = "https://files.pythonhosted.org/packages/e4/b5/3e580dcbc16f39a324f3dd71b90edbf02a42548ad44d2b4893cc92b1194b/torchvision-0.23.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:4e7d31c43bc7cbecbb1a5652ac0106b436aa66e26437585fc2c4b2cf04d6014c", size = 8627108, upload-time = "2025-08-06T14:58:12.956Z" }, + { url = "https://files.pythonhosted.org/packages/82/c1/c2fe6d61e110a8d0de2f94276899a2324a8f1e6aee559eb6b4629ab27466/torchvision-0.23.0-cp312-cp312-win_amd64.whl", hash = "sha256:a2e45272abe7b8bf0d06c405e78521b5757be1bd0ed7e5cd78120f7fdd4cbf35", size = 1600723, upload-time = "2025-08-06T14:57:57.986Z" }, + { url = "https://files.pythonhosted.org/packages/91/37/45a5b9407a7900f71d61b2b2f62db4b7c632debca397f205fdcacb502780/torchvision-0.23.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:1c37e325e09a184b730c3ef51424f383ec5745378dc0eca244520aca29722600", size = 1856886, upload-time = "2025-08-06T14:58:05.491Z" }, + { url = "https://files.pythonhosted.org/packages/ac/da/a06c60fc84fc849377cf035d3b3e9a1c896d52dbad493b963c0f1cdd74d0/torchvision-0.23.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:2f7fd6c15f3697e80627b77934f77705f3bc0e98278b989b2655de01f6903e1d", size = 2353112, upload-time = "2025-08-06T14:58:26.265Z" }, + { url = "https://files.pythonhosted.org/packages/a0/27/5ce65ba5c9d3b7d2ccdd79892ab86a2f87ac2ca6638f04bb0280321f1a9c/torchvision-0.23.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:a76fafe113b2977be3a21bf78f115438c1f88631d7a87203acb3dd6ae55889e6", size = 8627658, upload-time = "2025-08-06T14:58:15.999Z" }, + { url = 
"https://files.pythonhosted.org/packages/1f/e4/028a27b60aa578a2fa99d9d7334ff1871bb17008693ea055a2fdee96da0d/torchvision-0.23.0-cp313-cp313-win_amd64.whl", hash = "sha256:07d069cb29691ff566e3b7f11f20d91044f079e1dbdc9d72e0655899a9b06938", size = 1600749, upload-time = "2025-08-06T14:58:10.719Z" }, + { url = "https://files.pythonhosted.org/packages/05/35/72f91ad9ac7c19a849dedf083d347dc1123f0adeb401f53974f84f1d04c8/torchvision-0.23.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:2df618e1143805a7673aaf82cb5720dd9112d4e771983156aaf2ffff692eebf9", size = 2047192, upload-time = "2025-08-06T14:58:11.813Z" }, + { url = "https://files.pythonhosted.org/packages/1d/9d/406cea60a9eb9882145bcd62a184ee61e823e8e1d550cdc3c3ea866a9445/torchvision-0.23.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:2a3299d2b1d5a7aed2d3b6ffb69c672ca8830671967eb1cee1497bacd82fe47b", size = 2359295, upload-time = "2025-08-06T14:58:17.469Z" }, + { url = "https://files.pythonhosted.org/packages/2b/f4/34662f71a70fa1e59de99772142f22257ca750de05ccb400b8d2e3809c1d/torchvision-0.23.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:76bc4c0b63d5114aa81281390f8472a12a6a35ce9906e67ea6044e5af4cab60c", size = 8800474, upload-time = "2025-08-06T14:58:22.53Z" }, + { url = "https://files.pythonhosted.org/packages/6e/f5/b5a2d841a8d228b5dbda6d524704408e19e7ca6b7bb0f24490e081da1fa1/torchvision-0.23.0-cp313-cp313t-win_amd64.whl", hash = "sha256:b9e2dabf0da9c8aa9ea241afb63a8f3e98489e706b22ac3f30416a1be377153b", size = 1527667, upload-time = "2025-08-06T14:58:14.446Z" }, + { url = "https://files.pythonhosted.org/packages/d5/3e/f1f3bb3dd452b98ec2eba4820d777440abceb3d3a428a6c8243006fe47e5/torchvision-0.23.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:b190db205f90206c230fc2f91cbdfd5733334babc0e0d19bddb90a40b8cf26c2", size = 1856927, upload-time = "2025-08-06T14:58:18.919Z" }, + { url = "https://files.pythonhosted.org/packages/f4/e2/aafc6af854e792d212ff58e459f8d5d807568dc3f2b49ec41b677275e5a9/torchvision-0.23.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:6c74cbc1cbee26dd4f35f989cd80dccc40411f258dee476b29871dee4b483af0", size = 2392870, upload-time = "2025-08-06T14:58:21.303Z" }, + { url = "https://files.pythonhosted.org/packages/5d/06/09b6a917b3759ef000428af0aa2597f983e20d9fbbcfeb826750f778fe6d/torchvision-0.23.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:a9e9d7552d34547b80843eaf64ab0737b19b2e8bec2514286b8cfd30861ca8b5", size = 8630400, upload-time = "2025-08-06T14:58:24.139Z" }, + { url = "https://files.pythonhosted.org/packages/08/07/ae46106efbf4bbc0090078aa4c406c38282cbe4e637bdb4b7f2e984140af/torchvision-0.23.0-cp39-cp39-win_amd64.whl", hash = "sha256:dc7ce5accbbb8c9df9a79f8cef6a6df042f28e2250a6ae0d2ca70b06473fa03b", size = 1600751, upload-time = "2025-08-06T14:58:20.027Z" }, +] + [[package]] name = "tornado" version = "6.5.2"