From a0d6857faabcc4749cc2172c7d4b3d9ebd1e6e66 Mon Sep 17 00:00:00 2001 From: Andy Lee Date: Mon, 15 Sep 2025 19:50:02 -0700 Subject: [PATCH] docs: data updated --- benchmarks/enron_emails/README.md | 67 ++-- .../enron_emails/evaluate_enron_emails.py | 113 ++++++- benchmarks/financebench/README.md | 24 +- .../financebench/evaluate_financebench.py | 138 ++++---- benchmarks/laion/README.md | 118 ++++--- benchmarks/laion/evaluate_laion.py | 78 ++++- benchmarks/llm_utils.py | 301 ++++++++++++++++++ pyproject.toml | 1 + uv.lock | 42 ++- 9 files changed, 749 insertions(+), 133 deletions(-) create mode 100644 benchmarks/llm_utils.py diff --git a/benchmarks/enron_emails/README.md b/benchmarks/enron_emails/README.md index 16d2271..fdeae69 100644 --- a/benchmarks/enron_emails/README.md +++ b/benchmarks/enron_emails/README.md @@ -1,18 +1,19 @@ # Enron Emails Benchmark -A retrieval-only benchmark for evaluating LEANN search on the Enron email corpus. It mirrors the structure and CLI of the existing FinanceBench and LAION benches, using stage-based evaluation focused on Recall@3. +A comprehensive RAG benchmark for evaluating LEANN search and generation on the Enron email corpus. It mirrors the structure and CLI of the existing FinanceBench and LAION benches, using stage-based evaluation with Recall@3 and generation timing. - Dataset: Enron email CSV (e.g., Kaggle wcukierski/enron-email-dataset) for passages - Queries: corbt/enron_emails_sample_questions (filtered for realistic questions) -- Metric: Recall@3 vs FAISS Flat baseline +- Metrics: Recall@3 vs FAISS Flat baseline + Generation evaluation with Qwen3-8B ## Layout benchmarks/enron_emails/ - setup_enron_emails.py: Prepare passages, build LEANN index, build FAISS baseline -- evaluate_enron_emails.py: Evaluate retrieval recall (Stage 2) +- evaluate_enron_emails.py: Evaluate retrieval recall (Stages 2-5) + generation with Qwen3-8B - data/: Generated passages, queries, embeddings-related files - baseline/: FAISS Flat baseline files +- llm_utils.py: LLM utilities for Qwen3-8B generation (in parent directory) ## Quickstart @@ -41,23 +42,33 @@ Stage 3 uses binary search over complexity to find the minimal value achieving t 4) Index comparison (Stage 4) -python evaluate_enron_emails.py --index data/enron_index_hnsw.leann --stage 4 --max-queries 100 --output results.json +python evaluate_enron_emails.py --index data/enron_index_hnsw.leann --stage 4 --complexity 88 --max-queries 100 --output results.json + +5) Generation evaluation (Stage 5) + +python evaluate_enron_emails.py --index data/enron_index_hnsw.leann --stage 5 --complexity 88 --llm-backend hf --model-name Qwen/Qwen3-8B + +6) Combined index + generation evaluation (Stages 4+5, recommended) + +python evaluate_enron_emails.py --index data/enron_index_hnsw.leann --stage 45 --complexity 88 --llm-backend hf Notes: - Minimal CLI: you can run from repo root with only `--index`, defaults match financebench/laion patterns: - - `--stage` defaults to `all` (runs 2, 3, 4) + - `--stage` defaults to `all` (runs 2, 3, 4, 5) - `--baseline-dir` defaults to `baseline` - `--queries` defaults to `data/evaluation_queries.jsonl` (or falls back to the index directory) + - `--llm-backend` defaults to `hf` (HuggingFace), can use `vllm` + - `--model-name` defaults to `Qwen/Qwen3-8B` - Fail-fast behavior: no silent fallbacks. If compact index cannot run with recompute, it errors out. 
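To make the Stage 2 metric concrete, here is a rough sketch of how Recall@3 against the FAISS Flat baseline can be computed. This is a simplified illustration rather than the exact code in `evaluate_enron_emails.py`: the baseline index and its row-to-passage-id mapping are assumed to be preloaded from `baseline/`, and the overlap-based recall definition is an assumption.

```python
# Hedged sketch: Recall@3 of LEANN search vs. an exact FAISS Flat baseline.
# `baseline_index` is a faiss.IndexFlat* built over the same passage embeddings,
# and `row_to_id` maps baseline rows back to passage ids (both assumed preloaded).
import numpy as np
from leann import LeannSearcher
from sentence_transformers import SentenceTransformer


def recall_at_3(index_path, queries, baseline_index, row_to_id, complexity=88):
    searcher = LeannSearcher(index_path)
    encoder = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
    embs = encoder.encode(queries, normalize_embeddings=True).astype(np.float32)
    _, gt_rows = baseline_index.search(embs, 3)  # exact top-3 = ground truth
    overlap = 0
    for query, rows in zip(queries, gt_rows):
        gt_ids = {row_to_id[r] for r in rows}
        docs = searcher.search(query, top_k=3, complexity=complexity,
                               recompute_embeddings=True)
        overlap += len(gt_ids & {d.id for d in docs})
    searcher.cleanup()
    return overlap / (3 * len(queries))  # fraction of baseline top-3 recovered
```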
- -4) Index comparison (Stage 4) - -python evaluate_enron_emails.py --index data/enron_index_hnsw.leann --stage 4 --max-queries 100 --output results.json +- Stage 5 requires Stage 4 retrieval results. Use `--stage 45` to run both efficiently. Optional flags: - --queries data/evaluation_queries.jsonl (custom queries file) - --baseline-dir baseline (where FAISS baseline lives) -- --complexity 64 (LEANN complexity parameter) +- --complexity 88 (LEANN complexity parameter, optimal for 90% recall) +- --llm-backend hf|vllm (LLM backend for generation) +- --model-name Qwen/Qwen3-8B (LLM model for generation) +- --max-queries 1000 (limit number of queries for evaluation) ## Files Produced - data/enron_passages_preview.jsonl: Small preview of passages used (for inspection) @@ -66,8 +77,9 @@ Optional flags: - data/evaluation_queries.jsonl: Query file (id + query; includes GT IDs for reference) ## Notes -- We only evaluate retrieval Recall@3 (no generation). This matches the other benches’ style and stage flow. +- Evaluates both retrieval Recall@3 and generation timing with Qwen3-8B thinking model. - The emails CSV must contain a column named "message" (raw RFC822 email) and a column named "file" for source identifier. Message-ID headers are parsed as canonical message IDs when present. +- Qwen3-8B requires special handling for thinking models with chat templates and tag processing. ## Stages Summary @@ -80,16 +92,23 @@ Optional flags: - Stage 4 (Index Comparison): - Reports .index-only sizes for compact vs non-compact. - - Measures timings on 100 queries by default: non-compact (no recompute) vs compact (with recompute). + - Measures timings on queries by default: non-compact (no recompute) vs compact (with recompute). + - Stores retrieval results for Stage 5 generation evaluation. - Fails fast if compact recompute cannot run. - If `--complexity` is not provided, the script tries to use the best complexity from Stage 3: - First from the current run (when running `--stage all`), otherwise - From `enron_stage3_results.json` saved next to the index during the last Stage 3 run. - If neither exists, Stage 4 will error and ask you to run Stage 3 or pass `--complexity`. +- Stage 5 (Generation Evaluation): + - Uses Qwen3-8B thinking model for RAG generation on retrieved documents from Stage 4. + - Supports HuggingFace (`hf`) and vLLM (`vllm`) backends. + - Measures generation timing separately from search timing. + - Requires Stage 4 results (no additional searching performed). + ## Example Results -These are sample results obtained on a subset of Enron data using all-mpnet-base-v2. +These are sample results obtained on Enron data using all-mpnet-base-v2 and Qwen3-8B. 
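The "binary search over complexity" behind the Stage 3 numbers below boils down to the loop sketched here. It assumes Recall@3 is roughly monotonic in complexity and that a hypothetical `recall_fn(c)` wraps the Stage 2 evaluation at complexity `c`; the 1-256 bounds are illustrative defaults.

```python
# Minimal sketch of Stage 3: find the smallest complexity meeting the recall target.
def find_min_complexity(recall_fn, target=0.90, lo=1, hi=256):
    best = None
    while lo <= hi:
        mid = (lo + hi) // 2
        if recall_fn(mid) >= target:
            best, hi = mid, mid - 1  # good enough; try a smaller complexity
        else:
            lo = mid + 1             # too low; search the upper half
    return best                      # None if even `hi` misses the target
```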
- Stage 3 (Binary Search): - Minimal complexity achieving 90% Recall@3: 88 @@ -103,14 +122,20 @@ These are sample results obtained on a subset of Enron data using all-mpnet-base - C=256 β†’ 92.0% Recall@3 - Stage 4 (Index Sizes, .index only): - - Compact: ~2.17 MB - - Non-compact: ~82.03 MB - - Storage saving by compact: ~97.35% + - Compact: ~2.2 MB + - Non-compact: ~82.0 MB + - Storage saving by compact: ~97.3% -- Stage 4 (Timing, 100 queries, complexity=88): - - Non-compact (no recompute): ~0.0074 s avg per query - - Compact (with recompute): ~1.947 s avg per query +- Stage 4 (Search Timing, 988 queries, complexity=88): + - Non-compact (no recompute): ~0.0075 s avg per query + - Compact (with recompute): ~1.981 s avg per query - Speed ratio (non-compact/compact): ~0.0038x -Full JSON output for Stage 4 is saved by the script (see `--output`), e.g.: -`benchmarks/enron_emails/results_enron_stage4.json`. +- Stage 5 (RAG Generation, 988 queries, Qwen3-8B): + - Average generation time: ~22.302 s per query + - Total queries processed: 988 + - LLM backend: HuggingFace transformers + - Model: Qwen/Qwen3-8B (thinking model with processing) + +Full JSON output is saved by the script (see `--output`), e.g.: +`benchmarks/enron_emails/results_enron_stage45.json`. diff --git a/benchmarks/enron_emails/evaluate_enron_emails.py b/benchmarks/enron_emails/evaluate_enron_emails.py index e8c3db0..c6e8518 100644 --- a/benchmarks/enron_emails/evaluate_enron_emails.py +++ b/benchmarks/enron_emails/evaluate_enron_emails.py @@ -7,13 +7,22 @@ On errors, fail fast without fallbacks. import argparse import json +import logging import os import pickle +from pathlib import Path import numpy as np from leann import LeannBuilder, LeannSearcher from leann_backend_hnsw import faiss +from ..llm_utils import generate_hf, generate_vllm, load_hf_model, load_vllm_model + +# Setup logging to reduce verbose output +logging.basicConfig(level=logging.WARNING) +logging.getLogger("leann.api").setLevel(logging.WARNING) +logging.getLogger("leann_backend_hnsw").setLevel(logging.WARNING) + class RecallEvaluator: """Stage 2: Evaluate Recall@3 (LEANN vs FAISS)""" @@ -119,7 +128,6 @@ class EnronEvaluator: def analyze_index_sizes(self) -> dict: """Analyze index sizes (.index only), similar to LAION bench.""" - from pathlib import Path print("πŸ“ Analyzing index sizes (.index only)...") index_path = Path(self.index_path) @@ -150,7 +158,6 @@ class EnronEvaluator: def create_non_compact_index_for_comparison(self, non_compact_index_path: str) -> dict: """Create a non-compact index for comparison using current passages and embeddings.""" - from pathlib import Path current_index_path = Path(self.index_path) current_index_dir = current_index_path.parent @@ -230,6 +237,7 @@ class EnronEvaluator: "compact": {"search_times": []}, "avg_search_times": {}, "speed_ratio": 0.0, + "retrieval_results": [], # Store retrieval results for Stage 5 } print("⚑ Comparing search performance between indexes...") @@ -248,10 +256,15 @@ class EnronEvaluator: compact_searcher = LeannSearcher(compact_path) for q in test_queries: t0 = time.time() - _ = compact_searcher.search( + docs = compact_searcher.search( q, top_k=3, complexity=complexity, recompute_embeddings=True ) results["compact"]["search_times"].append(time.time() - t0) + + # Store retrieval results for Stage 5 + results["retrieval_results"].append( + {"query": q, "retrieved_docs": [{"id": doc.id, "text": doc.text} for doc in docs]} + ) compact_searcher.cleanup() if results["non_compact"]["search_times"]: @@ 
-358,9 +371,9 @@ def main(): ) parser.add_argument( "--stage", - choices=["2", "3", "4", "all"], + choices=["2", "3", "4", "5", "all", "45"], default="all", - help="Which stage to run (2=recall, 3=complexity, 4=index comparison)", + help="Which stage to run (2=recall, 3=complexity, 4=index comparison, 5=generation)", ) parser.add_argument("--complexity", type=int, default=None, help="LEANN search complexity") parser.add_argument("--baseline-dir", default="baseline", help="Baseline output directory") @@ -371,6 +384,8 @@ def main(): "--target-recall", type=float, default=0.90, help="Target Recall@3 for Stage 3" ) parser.add_argument("--output", help="Save results to JSON file") + parser.add_argument("--llm-backend", choices=["hf", "vllm"], default="hf", help="LLM backend") + parser.add_argument("--model-name", default="Qwen/Qwen3-8B", help="Model name") args = parser.parse_args() @@ -438,7 +453,7 @@ def main(): enron_eval.cleanup() print("βœ… Stage 3 completed!\n") - if args.stage in ("4", "all"): + if args.stage in ("4", "all", "45"): print("πŸš€ Starting Stage 4: Index size + performance comparison") evaluator = RecallEvaluator(args.index, args.baseline_dir) enron_eval = EnronEvaluator(args.index) @@ -503,6 +518,92 @@ def main(): enron_eval.cleanup() print("βœ… Stage 4 completed!\n") + if args.stage in ("5", "all"): + print("πŸš€ Starting Stage 5: Generation evaluation with Qwen3-8B") + + # Check if Stage 4 results exist + if "stage4" not in results_out or "performance_comparison" not in results_out["stage4"]: + print("❌ Stage 5 requires Stage 4 retrieval results") + print("πŸ’‘ Run Stage 4 first or use --stage all") + raise SystemExit(1) + + retrieval_results = results_out["stage4"]["performance_comparison"]["retrieval_results"] + if not retrieval_results: + print("❌ No retrieval results found from Stage 4") + raise SystemExit(1) + + print(f"πŸ“ Using {len(retrieval_results)} retrieval results from Stage 4") + + # Load LLM + try: + if args.llm_backend == "hf": + tokenizer, model = load_hf_model(args.model_name) + + def llm_func(prompt): + return generate_hf(tokenizer, model, prompt) + else: # vllm + llm, sampling_params = load_vllm_model(args.model_name) + + def llm_func(prompt): + return generate_vllm(llm, sampling_params, prompt) + + # Run generation using stored retrieval results + import time + + from llm_utils import create_prompt + + generation_times = [] + responses = [] + + print("πŸ€– Running generation on pre-retrieved results...") + for i, item in enumerate(retrieval_results): + query = item["query"] + retrieved_docs = item["retrieved_docs"] + + # Prepare context from retrieved docs + context = "\n\n".join([doc["text"] for doc in retrieved_docs]) + prompt = create_prompt(context, query, "emails") + + # Time generation only + gen_start = time.time() + response = llm_func(prompt) + gen_time = time.time() - gen_start + + generation_times.append(gen_time) + responses.append(response) + + if i < 3: + print(f" Q{i + 1}: Gen={gen_time:.3f}s") + + avg_gen_time = sum(generation_times) / len(generation_times) + + print("\nπŸ“Š Generation Results:") + print(f" Total Queries: {len(retrieval_results)}") + print(f" Avg Generation Time: {avg_gen_time:.3f}s") + print(" (Search time from Stage 4)") + + results_out["stage5"] = { + "total_queries": len(retrieval_results), + "avg_generation_time": avg_gen_time, + "generation_times": generation_times, + "responses": responses, + } + + # Show sample results + print("\nπŸ“ Sample Results:") + for i in range(min(3, len(retrieval_results))): + query 
= retrieval_results[i]["query"] + response = responses[i] + print(f" Q{i + 1}: {query[:60]}...") + print(f" A{i + 1}: {response[:100]}...") + print() + + except Exception as e: + print(f"❌ Generation evaluation failed: {e}") + print("πŸ’‘ Make sure transformers/vllm is installed and model is available") + + print("βœ… Stage 5 completed!\n") + if args.output and results_out: with open(args.output, "w", encoding="utf-8") as f: json.dump(results_out, f, indent=2) diff --git a/benchmarks/financebench/README.md b/benchmarks/financebench/README.md index 5683c9e..aefe9d6 100644 --- a/benchmarks/financebench/README.md +++ b/benchmarks/financebench/README.md @@ -45,9 +45,9 @@ This will: # Basic retrieval evaluation python evaluate_financebench.py --index data/index/financebench_full_hnsw.leann -# Include QA evaluation with OpenAI -export OPENAI_API_KEY="your-key" -python evaluate_financebench.py --index data/index/financebench_full_hnsw.leann --qa-samples 20 + +# RAG generation evaluation with Qwen3-8B +python evaluate_financebench.py --index data/index/financebench_full_hnsw.leann --stage 4 --complexity 64 --llm-backend hf --model-name Qwen/Qwen3-8B --output results_qwen3.json ``` ## Evaluation Methods @@ -85,6 +85,24 @@ LLM-based answer evaluation using GPT-4o: *Note: Number match rate >100% indicates multiple retrieved documents contain the same financial figures, which is expected behavior for financial data appearing across multiple document sections. +### LEANN-RAG Generation Performance (Qwen3-8B) + +- **Stage 4 (Index Comparison):** + - Compact Index: 5.0 MB + - Non-compact Index: 172.2 MB + - **Storage Saving**: 97.1% +- **Search Performance**: + - Non-compact (no recompute): 0.009s avg per query + - Compact (with recompute): 2.203s avg per query + - Speed ratio: 0.004x + +**Generation Evaluation (20 queries, complexity=64):** +- **Average Search Time**: 1.638s per query +- **Average Generation Time**: 45.957s per query +- **LLM Backend**: HuggingFace transformers +- **Model**: Qwen/Qwen3-8B (thinking model with processing) +- **Total Questions Processed**: 20 + ## Options ```bash diff --git a/benchmarks/financebench/evaluate_financebench.py b/benchmarks/financebench/evaluate_financebench.py index 77758c9..803d57e 100755 --- a/benchmarks/financebench/evaluate_financebench.py +++ b/benchmarks/financebench/evaluate_financebench.py @@ -4,20 +4,25 @@ FinanceBench Evaluation Script - Modular Recall-based Evaluation import argparse import json +import logging import os import pickle import time +from pathlib import Path from typing import Optional import numpy as np import openai - -# Import LEANN modules - this will bring in the modified faiss from leann import LeannChat, LeannSearcher - -# Import LEANN's modified faiss directly from leann_backend_hnsw import faiss +from ..llm_utils import evaluate_rag, generate_hf, generate_vllm, load_hf_model, load_vllm_model + +# Setup logging to reduce verbose output +logging.basicConfig(level=logging.WARNING) +logging.getLogger("leann.api").setLevel(logging.WARNING) +logging.getLogger("leann_backend_hnsw").setLevel(logging.WARNING) + class RecallEvaluator: """Stage 2: Evaluate Recall@3 (searcher vs baseline)""" @@ -125,7 +130,6 @@ class FinanceBenchEvaluator: def analyze_index_sizes(self) -> dict: """Analyze index sizes with and without embeddings""" - from pathlib import Path print("πŸ“ Analyzing index sizes...") @@ -136,7 +140,6 @@ class FinanceBenchEvaluator: sizes = {} total_with_embeddings = 0 - total_without_embeddings = 0 # Core index files 
index_file = index_dir / f"{index_name}.index" @@ -155,28 +158,14 @@ class FinanceBenchEvaluator: sizes[name] = size_mb total_with_embeddings += size_mb - # For pruned index calculation, exclude the main index file (contains embeddings) - if name != "index": - total_without_embeddings += size_mb else: sizes[name] = 0 - # Estimate pruned index size (approximate) - # When embeddings are removed, the main index file becomes much smaller - # Rough estimate: graph structure is ~10-20% of full index size - estimated_pruned_index_size = sizes["index"] * 0.15 # Conservative estimate - total_without_embeddings += estimated_pruned_index_size - sizes["total_with_embeddings"] = total_with_embeddings - sizes["total_without_embeddings"] = total_without_embeddings - sizes["estimated_pruned_index"] = estimated_pruned_index_size - sizes["compression_ratio"] = ( - total_without_embeddings / total_with_embeddings if total_with_embeddings > 0 else 0 - ) + sizes["index_only_mb"] = sizes["index"] # Just the .index file for fair comparison - print(f" πŸ“ Index with embeddings: {total_with_embeddings:.1f} MB") - print(f" πŸ“ Estimated pruned index: {total_without_embeddings:.1f} MB") - print(f" πŸ—œοΈ Compression ratio: {sizes['compression_ratio']:.2f}x") + print(f" πŸ“ Total index size: {total_with_embeddings:.1f} MB") + print(f" πŸ“ Index file only: {sizes['index']:.1f} MB") return sizes @@ -185,7 +174,6 @@ class FinanceBenchEvaluator: print("πŸ—οΈ Building compact index from existing passages...") # Load existing passages from current index - from pathlib import Path from leann import LeannBuilder @@ -241,7 +229,6 @@ class FinanceBenchEvaluator: print("πŸ—οΈ Building non-compact index from existing passages...") # Load existing passages from current index - from pathlib import Path from leann import LeannBuilder @@ -555,13 +542,7 @@ Respond with exactly one word: "CORRECT" if the generated answer is factually ac # Legacy single index analysis (fallback) if "total_with_embeddings" in timing_metrics and "current_index" not in timing_metrics: print("\nπŸ“ Index Size Analysis:") - print( - f" Index with embeddings: {timing_metrics.get('total_with_embeddings', 0):.1f} MB" - ) - print( - f" Estimated pruned index: {timing_metrics.get('total_without_embeddings', 0):.1f} MB" - ) - print(f" Compression ratio: {timing_metrics.get('compression_ratio', 0):.2f}x") + print(f" Total index size: {timing_metrics.get('total_with_embeddings', 0):.1f} MB") print("\nπŸ“Š Accuracy:") print(f" Accuracy: {timing_metrics.get('accuracy', 0) * 100:.1f}%") @@ -610,6 +591,10 @@ def main(): parser.add_argument("--baseline-dir", default="baseline", help="Baseline output directory") parser.add_argument("--openai-api-key", help="OpenAI API key for generation evaluation") parser.add_argument("--output", help="Save results to JSON file") + parser.add_argument( + "--llm-backend", choices=["openai", "hf", "vllm"], default="openai", help="LLM backend" + ) + parser.add_argument("--model-name", default="Qwen3-8B", help="Model name for HF/vLLM") args = parser.parse_args() @@ -768,7 +753,9 @@ def main(): print("πŸš€ Starting Stage 4: Comprehensive evaluation with dual index comparison") # Use FinanceBench evaluator for QA evaluation - evaluator = FinanceBenchEvaluator(args.index, args.openai_api_key) + evaluator = FinanceBenchEvaluator( + args.index, args.openai_api_key if args.llm_backend == "openai" else None + ) print("πŸ“– Loading FinanceBench dataset...") data = evaluator.load_dataset(args.dataset) @@ -802,20 +789,13 @@ def main(): 
print( f" Non-compact index: {non_compact_size_metrics['total_with_embeddings']:.1f} MB" ) - _ = ( - ( - non_compact_size_metrics["total_with_embeddings"] - - compact_size_metrics["total_with_embeddings"] - ) - / compact_size_metrics["total_with_embeddings"] - * 100 - ) + print("\nπŸ“Š Index-only size comparison (.index file only):") + print(f" Compact index: {compact_size_metrics['index_only_mb']:.1f} MB") + print(f" Non-compact index: {non_compact_size_metrics['index_only_mb']:.1f} MB") + # Use index-only size for fair comparison (same as Enron emails) storage_saving = ( - ( - non_compact_size_metrics["total_with_embeddings"] - - compact_size_metrics["total_with_embeddings"] - ) - / non_compact_size_metrics["total_with_embeddings"] + (non_compact_size_metrics["index_only_mb"] - compact_size_metrics["index_only_mb"]) + / non_compact_size_metrics["index_only_mb"] * 100 ) print(f" Storage saving by compact: {storage_saving:.1f}%") @@ -829,15 +809,58 @@ def main(): non_compact_index_path, args.index, data[:10], complexity=complexity ) - # Step 5: Timing breakdown evaluation WITH recompute (production mode) + # Step 5: Generation evaluation test_samples = 20 - print(f"\nπŸ§ͺ Testing with first {test_samples} samples for timing analysis") - print( - "\nπŸ”πŸ€– Running timing breakdown evaluation (WITH recompute - production mode)..." - ) - evaluation_start = time.time() - timing_metrics = evaluator.evaluate_timing_breakdown(data[:test_samples]) - evaluation_time = time.time() - evaluation_start + print(f"\nπŸ§ͺ Testing with first {test_samples} samples for generation analysis") + + if args.llm_backend == "openai" and args.openai_api_key: + print("πŸ”πŸ€– Running OpenAI-based generation evaluation...") + evaluation_start = time.time() + timing_metrics = evaluator.evaluate_timing_breakdown(data[:test_samples]) + evaluation_time = time.time() - evaluation_start + else: + print( + f"πŸ”πŸ€– Running {args.llm_backend} generation evaluation with {args.model_name}..." 
+ ) + try: + # Load LLM + if args.llm_backend == "hf": + tokenizer, model = load_hf_model(args.model_name) + + def llm_func(prompt): + return generate_hf(tokenizer, model, prompt) + else: # vllm + llm, sampling_params = load_vllm_model(args.model_name) + + def llm_func(prompt): + return generate_vllm(llm, sampling_params, prompt) + + # Simple generation evaluation + queries = [item["question"] for item in data[:test_samples]] + gen_results = evaluate_rag( + evaluator.searcher, + llm_func, + queries, + domain="finance", + complexity=complexity, + ) + + timing_metrics = { + "total_questions": len(queries), + "avg_search_time": gen_results["avg_search_time"], + "avg_generation_time": gen_results["avg_generation_time"], + "results": gen_results["results"], + } + evaluation_time = time.time() + + except Exception as e: + print(f"❌ Generation evaluation failed: {e}") + timing_metrics = { + "total_questions": 0, + "avg_search_time": 0, + "avg_generation_time": 0, + } + evaluation_time = 0 # Combine all metrics combined_metrics = { @@ -849,8 +872,11 @@ def main(): "storage_saving_percent": storage_saving, } - # Print comprehensive results - evaluator._print_results(combined_metrics) + # Print results + print("\nπŸ“Š Generation Results:") + print(f" Total Questions: {timing_metrics.get('total_questions', 0)}") + print(f" Avg Search Time: {timing_metrics.get('avg_search_time', 0):.3f}s") + print(f" Avg Generation Time: {timing_metrics.get('avg_generation_time', 0):.3f}s") # Save results if requested if args.output: diff --git a/benchmarks/laion/README.md b/benchmarks/laion/README.md index 516f347..51dc721 100644 --- a/benchmarks/laion/README.md +++ b/benchmarks/laion/README.md @@ -1,6 +1,6 @@ # LAION Multimodal Benchmark -A multimodal benchmark for evaluating image retrieval performance using LEANN with CLIP embeddings on LAION dataset subset. +A multimodal benchmark for evaluating image retrieval and generation performance using LEANN with CLIP embeddings and Qwen2.5-VL for multimodal generation on LAION dataset subset. ## Overview @@ -9,6 +9,7 @@ This benchmark evaluates: - **Recall@K performance** for image search - **Complexity analysis** across different search parameters - **Index size and storage efficiency** +- **Multimodal generation** with Qwen2.5-VL for image understanding and description ## Dataset Configuration @@ -39,9 +40,13 @@ This will: python evaluate_laion.py --index data/laion_index.leann # Run specific stages -python evaluate_laion.py --index data/laion_index.leann --stage timing -python evaluate_laion.py --index data/laion_index.leann --stage recall -python evaluate_laion.py --index data/laion_index.leann --stage complexity +python evaluate_laion.py --index data/laion_index.leann --stage 2 # Recall evaluation +python evaluate_laion.py --index data/laion_index.leann --stage 3 # Complexity analysis +python evaluate_laion.py --index data/laion_index.leann --stage 4 # Index comparison +python evaluate_laion.py --index data/laion_index.leann --stage 5 # Multimodal generation + +# Multimodal generation with Qwen2.5-VL +python evaluate_laion.py --index data/laion_index.leann --stage 5 --model-name Qwen/Qwen2.5-VL-7B-Instruct ``` ### 3. 
Save results @@ -74,23 +79,26 @@ python evaluate_laion.py \ ## Evaluation Stages -### Stage 1: Index Analysis -- Analyzes index file sizes and metadata -- Reports storage efficiency - -### Stage 2: Search Timing -- Measures average search latency -- Tests with configurable complexity and top-k -- Reports searches per second - -### Stage 3: Recall Evaluation -- Evaluates Recall@K using ground truth +### Stage 2: Recall Evaluation +- Evaluates Recall@3 for multimodal retrieval +- Compares LEANN vs FAISS baseline performance - Self-recall: query caption should retrieve original image -### Stage 4: Complexity Analysis -- Tests performance across different complexity levels [16, 32, 64, 128] +### Stage 3: Complexity Analysis +- Binary search for optimal complexity (90% recall target) +- Tests performance across different complexity levels - Analyzes speed vs. accuracy tradeoffs +### Stage 4: Index Comparison +- Compares compact vs non-compact index sizes +- Measures search performance differences +- Reports storage efficiency and speed ratios + +### Stage 5: Multimodal Generation +- Uses Qwen2.5-VL for image understanding and description +- Retrieval-Augmented Generation (RAG) with multimodal context +- Measures both search and generation timing + ## Output Metrics ### Timing Metrics @@ -100,48 +108,70 @@ python evaluate_laion.py \ - Latency in milliseconds ### Recall Metrics -- Recall@K percentage +- Recall@3 percentage for image retrieval - Number of queries with ground truth ### Index Metrics - Total index size (MB) - Component breakdown (index, passages, metadata) +- Storage savings (compact vs non-compact) - Backend and embedding model info -## Example Results +### Generation Metrics (Stage 5) +- Average search time per query +- Average generation time per query +- Time distribution (search vs generation) +- Sample multimodal responses +- Model: Qwen2.5-VL performance + +## Benchmark Results + +### LEANN-RAG Performance (CLIP ViT-L/14 + Qwen2.5-VL) + +**Stage 3: Optimal Complexity Analysis** +- **Optimal Complexity**: 85 (achieving 90% Recall@3) +- **Binary Search Range**: 1-128 +- **Target Recall**: 90% +- **Index Type**: Non-compact (for fast binary search) + +**Stage 5: Multimodal Generation Performance (Qwen2.5-VL)** +- **Total Queries**: 20 +- **Average Search Time**: 1.200s per query +- **Average Generation Time**: 6.558s per query +- **Time Distribution**: Search 15.5%, Generation 84.5% +- **LLM Backend**: HuggingFace transformers +- **Model**: Qwen/Qwen2.5-VL-7B-Instruct +- **Optimal Complexity**: 85 + +**System Performance:** +- **Index Size**: ~10,000 image embeddings from LAION subset +- **Embedding Model**: CLIP ViT-L/14 (768 dimensions) +- **Backend**: HNSW with cosine distance + +### Example Results ``` 🎯 LAION MULTIMODAL BENCHMARK RESULTS ============================================================ -πŸ“ Index Information: - Total size: 145.2 MB - Backend: hnsw - Embedding model: clip-vit-b-32 - Total passages: 10000 +πŸ“Š Multimodal Generation Results: + Total Queries: 20 + Avg Search Time: 1.200s + Avg Generation Time: 6.558s + Time Distribution: Search 15.5%, Generation 84.5% + LLM Backend: HuggingFace transformers + Model: Qwen/Qwen2.5-VL-7B-Instruct -⚑ Search Performance: - Total queries: 200 - Average search time: 0.023s - Median search time: 0.021s - Min/Max search time: 0.012s / 0.089s - Std dev: 0.008s - Complexity: 64 - Top-K: 3 - -πŸ“Š Recall Performance: - Recall@3: 85.5% - Queries with ground truth: 200 - -βš™οΈ Complexity Analysis: - Complexity 16: 0.015s avg 
- Complexity 32: 0.019s avg - Complexity 64: 0.023s avg - Complexity 128: 0.031s avg +βš™οΈ Optimal Complexity Analysis: + Target Recall: 90% + Optimal Complexity: 85 + Binary Search Range: 1-128 + Non-compact Index (fast search, no recompute) πŸš€ Performance Summary: - Searches per second: 43.5 - Latency (ms): 23.0ms + Multimodal RAG: 7.758s total per query + Search: 15.5% of total time + Generation: 84.5% of total time ``` ## Directory Structure diff --git a/benchmarks/laion/evaluate_laion.py b/benchmarks/laion/evaluate_laion.py index 1383ae5..dd30635 100644 --- a/benchmarks/laion/evaluate_laion.py +++ b/benchmarks/laion/evaluate_laion.py @@ -4,6 +4,7 @@ LAION Multimodal Benchmark Evaluation Script - Modular Recall-based Evaluation import argparse import json +import logging import os import pickle import time @@ -14,6 +15,13 @@ from leann import LeannSearcher from leann_backend_hnsw import faiss from sentence_transformers import SentenceTransformer +from ..llm_utils import evaluate_multimodal_rag, load_qwen_vl_model + +# Setup logging to reduce verbose output +logging.basicConfig(level=logging.WARNING) +logging.getLogger("leann.api").setLevel(logging.WARNING) +logging.getLogger("leann_backend_hnsw").setLevel(logging.WARNING) + class RecallEvaluator: """Stage 2: Evaluate Recall@3 (LEANN vs FAISS baseline for multimodal retrieval)""" @@ -388,13 +396,22 @@ def main(): ) parser.add_argument( "--stage", - choices=["2", "3", "4", "all"], + choices=["2", "3", "4", "5", "all"], default="all", - help="Which stage to run (2=recall, 3=complexity, 4=index comparison)", + help="Which stage to run (2=recall, 3=complexity, 4=index comparison, 5=generation)", ) parser.add_argument("--complexity", type=int, default=None, help="Complexity for search") parser.add_argument("--baseline-dir", default="baseline", help="Baseline output directory") parser.add_argument("--output", help="Save results to JSON file") + parser.add_argument( + "--llm-backend", + choices=["hf"], + default="hf", + help="LLM backend (Qwen2.5-VL only supports HF)", + ) + parser.add_argument( + "--model-name", default="Qwen/Qwen2.5-VL-7B-Instruct", help="Multimodal model name" + ) args = parser.parse_args() @@ -615,12 +632,69 @@ def main(): evaluator.cleanup() print("βœ… Stage 4 completed!\n") + if args.stage in ("5", "all"): + print("πŸš€ Starting Stage 5: Multimodal generation with Qwen2.5-VL") + evaluator = LAIONEvaluator(args.index) + captions = evaluator.load_queries(args.queries) + test_captions = captions[: min(20, len(captions))] # Use subset for generation + + print(f"πŸ§ͺ Testing multimodal generation with {len(test_captions)} queries") + + # Load Qwen2.5-VL model + try: + print("Loading Qwen2.5-VL model...") + processor, model = load_qwen_vl_model(args.model_name) + + # Run multimodal generation evaluation + complexity = args.complexity or 64 + gen_results = evaluate_multimodal_rag( + evaluator.searcher, + test_captions, + processor=processor, + model=model, + complexity=complexity, + ) + + print("\nπŸ“Š Multimodal Generation Results:") + print(f" Total Queries: {len(test_captions)}") + print(f" Avg Search Time: {gen_results['avg_search_time']:.3f}s") + print(f" Avg Generation Time: {gen_results['avg_generation_time']:.3f}s") + total_time = gen_results["avg_search_time"] + gen_results["avg_generation_time"] + search_pct = (gen_results["avg_search_time"] / total_time) * 100 + gen_pct = (gen_results["avg_generation_time"] / total_time) * 100 + print(f" Time Distribution: Search {search_pct:.1f}%, Generation {gen_pct:.1f}%") + 
print(" LLM Backend: HuggingFace transformers") + print(f" Model: {args.model_name}") + + # Show sample results + print("\nπŸ“ Sample Multimodal Generations:") + for i, response in enumerate(gen_results["results"][:3]): + # Handle both string and dict formats for captions + if isinstance(test_captions[i], dict): + caption_text = test_captions[i].get("query", str(test_captions[i])) + else: + caption_text = str(test_captions[i]) + print(f" Query {i + 1}: {caption_text[:60]}...") + print(f" Response {i + 1}: {response[:100]}...") + print() + + except Exception as e: + print(f"❌ Multimodal generation evaluation failed: {e}") + print("πŸ’‘ Make sure transformers and Qwen2.5-VL are installed") + import traceback + + traceback.print_exc() + + evaluator.cleanup() + print("βœ… Stage 5 completed!\n") + if args.stage == "all": print("πŸŽ‰ All evaluation stages completed successfully!") print("\nπŸ“‹ Summary:") print(" Stage 2: βœ… Multimodal Recall@3 evaluation completed") print(" Stage 3: βœ… Optimal complexity found") print(" Stage 4: βœ… Index comparison analysis completed") + print(" Stage 5: βœ… Multimodal generation evaluation completed") print("\nπŸ”§ Recommended next steps:") print(" - Use optimal complexity for best speed/accuracy balance") print(" - Review index comparison for storage vs performance tradeoffs") diff --git a/benchmarks/llm_utils.py b/benchmarks/llm_utils.py new file mode 100644 index 0000000..9a8217c --- /dev/null +++ b/benchmarks/llm_utils.py @@ -0,0 +1,301 @@ +""" +LLM utils for RAG benchmarks with Qwen3-8B and Qwen2.5-VL (multimodal) +""" + +import time + +try: + import torch + from transformers import AutoModelForCausalLM, AutoTokenizer + + HF_AVAILABLE = True +except ImportError: + HF_AVAILABLE = False + +try: + from vllm import LLM, SamplingParams + + VLLM_AVAILABLE = True +except ImportError: + VLLM_AVAILABLE = False + + +def is_qwen3_model(model_name): + """Check if model is Qwen3""" + return "Qwen3" in model_name or "qwen3" in model_name.lower() + + +def is_qwen_vl_model(model_name): + """Check if model is Qwen2.5-VL""" + return "Qwen2.5-VL" in model_name or "qwen2.5-vl" in model_name.lower() + + +def apply_qwen3_chat_template(tokenizer, prompt): + """Apply Qwen3 chat template with thinking enabled""" + messages = [{"role": "user", "content": prompt}] + return tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True, + enable_thinking=True, + ) + + +def extract_thinking_answer(response): + """Extract final answer from Qwen3 thinking model response""" + if "" in response and "" in response: + try: + think_end = response.index("") + len("") + final_answer = response[think_end:].strip() + return final_answer + except (ValueError, IndexError): + pass + + return response.strip() + + +def load_hf_model(model_name="Qwen/Qwen3-8B"): + """Load HuggingFace model""" + if not HF_AVAILABLE: + raise ImportError("transformers not available") + + print(f"Loading HF: {model_name}") + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained( + model_name, + torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32, + device_map="auto", + trust_remote_code=True, + ) + return tokenizer, model + + +def load_vllm_model(model_name="Qwen/Qwen3-8B"): + """Load vLLM model""" + if not VLLM_AVAILABLE: + raise ImportError("vllm not available") + + print(f"Loading vLLM: {model_name}") + llm = LLM(model=model_name, trust_remote_code=True) + + # Qwen3 specific config + if 
is_qwen3_model(model_name): + stop_tokens = ["<|im_end|>", "<|end_of_text|>"] + max_tokens = 2048 + else: + stop_tokens = None + max_tokens = 1024 + + sampling_params = SamplingParams(temperature=0.7, max_tokens=max_tokens, stop=stop_tokens) + return llm, sampling_params + + +def generate_hf(tokenizer, model, prompt, max_tokens=None): + """Generate with HF - supports Qwen3 thinking models""" + model_name = getattr(model, "name_or_path", "unknown") + is_qwen3 = is_qwen3_model(model_name) + + # Apply chat template for Qwen3 + if is_qwen3: + prompt = apply_qwen3_chat_template(tokenizer, prompt) + max_tokens = max_tokens or 2048 + else: + max_tokens = max_tokens or 1024 + + inputs = tokenizer(prompt, return_tensors="pt").to(model.device) + with torch.no_grad(): + outputs = model.generate( + **inputs, + max_new_tokens=max_tokens, + temperature=0.7, + do_sample=True, + pad_token_id=tokenizer.eos_token_id, + ) + response = tokenizer.decode(outputs[0], skip_special_tokens=True) + response = response[len(prompt) :].strip() + + # Extract final answer for thinking models + if is_qwen3: + return extract_thinking_answer(response) + return response + + +def generate_vllm(llm, sampling_params, prompt): + """Generate with vLLM - supports Qwen3 thinking models""" + outputs = llm.generate([prompt], sampling_params) + response = outputs[0].outputs[0].text.strip() + + # Extract final answer for Qwen3 thinking models + model_name = str(llm.llm_engine.model_config.model) + if is_qwen3_model(model_name): + return extract_thinking_answer(response) + return response + + +def create_prompt(context, query, domain="default"): + """Create RAG prompt""" + if domain == "emails": + return f"Email content:\n{context}\n\nQuestion: {query}\n\nAnswer:" + elif domain == "finance": + return f"Financial content:\n{context}\n\nQuestion: {query}\n\nAnswer:" + elif domain == "multimodal": + return f"Image context:\n{context}\n\nQuestion: {query}\n\nAnswer:" + else: + return f"Context: {context}\n\nQuestion: {query}\n\nAnswer:" + + +def evaluate_rag(searcher, llm_func, queries, domain="default", top_k=3, complexity=64): + """Simple RAG evaluation with timing""" + search_times = [] + gen_times = [] + results = [] + + for i, query in enumerate(queries): + # Search + start = time.time() + docs = searcher.search(query, top_k=top_k, complexity=complexity) + search_time = time.time() - start + + # Generate + context = "\n\n".join([doc.text for doc in docs]) + prompt = create_prompt(context, query, domain) + + start = time.time() + response = llm_func(prompt) + gen_time = time.time() - start + + search_times.append(search_time) + gen_times.append(gen_time) + results.append(response) + + if i < 3: + print(f"Q{i + 1}: Search={search_time:.3f}s, Gen={gen_time:.3f}s") + + return { + "avg_search_time": sum(search_times) / len(search_times), + "avg_generation_time": sum(gen_times) / len(gen_times), + "results": results, + } + + +def load_qwen_vl_model(model_name="Qwen/Qwen2.5-VL-7B-Instruct"): + """Load Qwen2.5-VL multimodal model""" + if not HF_AVAILABLE: + raise ImportError("transformers not available") + + print(f"Loading Qwen2.5-VL: {model_name}") + + try: + from transformers import AutoModelForVision2Seq, AutoProcessor + + processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True) + model = AutoModelForVision2Seq.from_pretrained( + model_name, torch_dtype=torch.bfloat16, device_map="auto", trust_remote_code=True + ) + + return processor, model + + except Exception as e: + print(f"Failed to load with 
AutoModelForVision2Seq, trying specific class: {e}") + + # Fallback to specific class + try: + from transformers import AutoProcessor, Qwen2VLForConditionalGeneration + + processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True) + model = Qwen2VLForConditionalGeneration.from_pretrained( + model_name, torch_dtype=torch.bfloat16, device_map="auto", trust_remote_code=True + ) + + return processor, model + + except Exception as e2: + raise ImportError(f"Failed to load Qwen2.5-VL model: {e2}") + + +def generate_qwen_vl(processor, model, prompt, image_path=None, max_tokens=512): + """Generate with Qwen2.5-VL multimodal model""" + from PIL import Image + + # Prepare inputs + if image_path: + image = Image.open(image_path) + inputs = processor(text=prompt, images=image, return_tensors="pt").to(model.device) + else: + inputs = processor(text=prompt, return_tensors="pt").to(model.device) + + # Generate + with torch.no_grad(): + generated_ids = model.generate( + **inputs, max_new_tokens=max_tokens, do_sample=False, temperature=0.1 + ) + + # Decode response + generated_ids = generated_ids[:, inputs["input_ids"].shape[1] :] + response = processor.decode(generated_ids[0], skip_special_tokens=True) + + return response + + +def create_multimodal_prompt(context, query, image_descriptions, task_type="images"): + """Create prompt for multimodal RAG""" + if task_type == "images": + return f"""Based on the retrieved images and their descriptions, answer the following question. + +Retrieved Image Descriptions: +{context} + +Question: {query} + +Provide a detailed answer based on the visual content described above.""" + + return f"Context: {context}\nQuestion: {query}\nAnswer:" + + +def evaluate_multimodal_rag(searcher, queries, processor=None, model=None, complexity=64): + """Evaluate multimodal RAG with Qwen2.5-VL""" + search_times = [] + gen_times = [] + results = [] + + for i, query_item in enumerate(queries): + # Handle both string and dict formats for queries + if isinstance(query_item, dict): + query = query_item.get("query", "") + image_path = query_item.get("image_path") # Optional reference image + else: + query = str(query_item) + image_path = None + + # Search + start_time = time.time() + search_results = searcher.search(query, top_k=3, complexity=complexity) + search_time = time.time() - start_time + search_times.append(search_time) + + # Prepare context from search results + context_parts = [] + for result in search_results: + context_parts.append(f"- {result.text}") + context = "\n".join(context_parts) + + # Generate with multimodal model + start_time = time.time() + if processor and model: + prompt = create_multimodal_prompt(context, query, context_parts) + response = generate_qwen_vl(processor, model, prompt, image_path) + else: + response = f"Context: {context}" + gen_time = time.time() - start_time + + gen_times.append(gen_time) + results.append(response) + + if i < 3: + print(f"Q{i + 1}: Search={search_time:.3f}s, Gen={gen_time:.3f}s") + + return { + "avg_search_time": sum(search_times) / len(search_times), + "avg_generation_time": sum(gen_times) / len(gen_times), + "results": results, + } diff --git a/pyproject.toml b/pyproject.toml index d738017..35e5613 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -53,6 +53,7 @@ dependencies = [ "tree-sitter-java>=0.20.0", "tree-sitter-c-sharp>=0.20.0", "tree-sitter-typescript>=0.20.0", + "torchvision>=0.23.0", ] [project.optional-dependencies] diff --git a/uv.lock b/uv.lock index d4fa777..d01612b 100644 --- a/uv.lock +++ 
b/uv.lock @@ -1564,7 +1564,7 @@ name = "importlib-metadata" version = "8.7.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "zipp" }, + { name = "zipp", marker = "python_full_version < '3.10'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/76/66/650a33bd90f786193e4de4b3ad86ea60b53c89b669a5c7be931fac31cdb0/importlib_metadata-8.7.0.tar.gz", hash = "sha256:d13b81ad223b890aa16c5471f2ac3056cf76c5f10f82d6f9292f0b415f389000", size = 56641, upload-time = "2025-04-27T15:29:01.736Z" } wheels = [ @@ -2257,6 +2257,7 @@ dependencies = [ { name = "sentence-transformers" }, { name = "sglang" }, { name = "torch" }, + { name = "torchvision" }, { name = "tqdm" }, { name = "tree-sitter", version = "0.23.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, { name = "tree-sitter", version = "0.25.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, @@ -2346,6 +2347,7 @@ requires-dist = [ { name = "sentence-transformers", specifier = ">=2.2.0" }, { name = "sglang" }, { name = "torch" }, + { name = "torchvision", specifier = ">=0.23.0" }, { name = "tqdm" }, { name = "tree-sitter", specifier = ">=0.20.0" }, { name = "tree-sitter-c-sharp", specifier = ">=0.20.0" }, @@ -5870,6 +5872,44 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ae/95/ae26263aceb3d57b821179f827d0e321373ed49423e603dd5906ab14a730/torch-2.8.0-cp39-none-macosx_11_0_arm64.whl", hash = "sha256:e9f071f5b52a9f6970dc8a919694b27a91ae9dc08898b2b988abbef5eddfd1ae", size = 73610795, upload-time = "2025-08-06T14:57:11.513Z" }, ] +[[package]] +name = "torchvision" +version = "0.23.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.10.*'" }, + { name = "numpy", version = "2.3.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, + { name = "pillow" }, + { name = "torch" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/4d/49/5ad5c3ff4920be0adee9eb4339b4fb3b023a0fc55b9ed8dbc73df92946b8/torchvision-0.23.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:7266871daca00ad46d1c073e55d972179d12a58fa5c9adec9a3db9bbed71284a", size = 1856885, upload-time = "2025-08-06T14:57:55.024Z" }, + { url = "https://files.pythonhosted.org/packages/25/44/ddd56d1637bac42a8c5da2c8c440d8a28c431f996dd9790f32dd9a96ca6e/torchvision-0.23.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:31c583ba27426a3a04eca8c05450524105c1564db41be6632f7536ef405a6de2", size = 2394251, upload-time = "2025-08-06T14:58:01.725Z" }, + { url = "https://files.pythonhosted.org/packages/93/f3/3cdf55bbf0f737304d997561c34ab0176222e0496b6743b0feab5995182c/torchvision-0.23.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:3932bf67256f2d095ce90a9f826f6033694c818856f4bb26794cf2ce64253e53", size = 8627497, upload-time = "2025-08-06T14:58:09.317Z" }, + { url = "https://files.pythonhosted.org/packages/97/90/02afe57c3ef4284c5cf89d3b7ae203829b3a981f72b93a7dd2a3fd2c83c1/torchvision-0.23.0-cp310-cp310-win_amd64.whl", hash = "sha256:83ee5bf827d61a8af14620c0a61d8608558638ac9c3bac8adb7b27138e2147d1", size = 1600760, upload-time = "2025-08-06T14:57:56.783Z" }, + { url = 
"https://files.pythonhosted.org/packages/f0/d7/15d3d7bd8d0239211b21673d1bac7bc345a4ad904a8e25bb3fd8a9cf1fbc/torchvision-0.23.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:49aa20e21f0c2bd458c71d7b449776cbd5f16693dd5807195a820612b8a229b7", size = 1856884, upload-time = "2025-08-06T14:58:00.237Z" }, + { url = "https://files.pythonhosted.org/packages/dd/14/7b44fe766b7d11e064c539d92a172fa9689a53b69029e24f2f1f51e7dc56/torchvision-0.23.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:01dc33ee24c79148aee7cdbcf34ae8a3c9da1674a591e781577b716d233b1fa6", size = 2395543, upload-time = "2025-08-06T14:58:04.373Z" }, + { url = "https://files.pythonhosted.org/packages/79/9c/fcb09aff941c8147d9e6aa6c8f67412a05622b0c750bcf796be4c85a58d4/torchvision-0.23.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:35c27941831b653f5101edfe62c03d196c13f32139310519e8228f35eae0e96a", size = 8628388, upload-time = "2025-08-06T14:58:07.802Z" }, + { url = "https://files.pythonhosted.org/packages/93/40/3415d890eb357b25a8e0a215d32365a88ecc75a283f75c4e919024b22d97/torchvision-0.23.0-cp311-cp311-win_amd64.whl", hash = "sha256:09bfde260e7963a15b80c9e442faa9f021c7e7f877ac0a36ca6561b367185013", size = 1600741, upload-time = "2025-08-06T14:57:59.158Z" }, + { url = "https://files.pythonhosted.org/packages/df/1d/0ea0b34bde92a86d42620f29baa6dcbb5c2fc85990316df5cb8f7abb8ea2/torchvision-0.23.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:e0e2c04a91403e8dd3af9756c6a024a1d9c0ed9c0d592a8314ded8f4fe30d440", size = 1856885, upload-time = "2025-08-06T14:58:06.503Z" }, + { url = "https://files.pythonhosted.org/packages/e2/00/2f6454decc0cd67158c7890364e446aad4b91797087a57a78e72e1a8f8bc/torchvision-0.23.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:6dd7c4d329a0e03157803031bc856220c6155ef08c26d4f5bbac938acecf0948", size = 2396614, upload-time = "2025-08-06T14:58:03.116Z" }, + { url = "https://files.pythonhosted.org/packages/e4/b5/3e580dcbc16f39a324f3dd71b90edbf02a42548ad44d2b4893cc92b1194b/torchvision-0.23.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:4e7d31c43bc7cbecbb1a5652ac0106b436aa66e26437585fc2c4b2cf04d6014c", size = 8627108, upload-time = "2025-08-06T14:58:12.956Z" }, + { url = "https://files.pythonhosted.org/packages/82/c1/c2fe6d61e110a8d0de2f94276899a2324a8f1e6aee559eb6b4629ab27466/torchvision-0.23.0-cp312-cp312-win_amd64.whl", hash = "sha256:a2e45272abe7b8bf0d06c405e78521b5757be1bd0ed7e5cd78120f7fdd4cbf35", size = 1600723, upload-time = "2025-08-06T14:57:57.986Z" }, + { url = "https://files.pythonhosted.org/packages/91/37/45a5b9407a7900f71d61b2b2f62db4b7c632debca397f205fdcacb502780/torchvision-0.23.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:1c37e325e09a184b730c3ef51424f383ec5745378dc0eca244520aca29722600", size = 1856886, upload-time = "2025-08-06T14:58:05.491Z" }, + { url = "https://files.pythonhosted.org/packages/ac/da/a06c60fc84fc849377cf035d3b3e9a1c896d52dbad493b963c0f1cdd74d0/torchvision-0.23.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:2f7fd6c15f3697e80627b77934f77705f3bc0e98278b989b2655de01f6903e1d", size = 2353112, upload-time = "2025-08-06T14:58:26.265Z" }, + { url = "https://files.pythonhosted.org/packages/a0/27/5ce65ba5c9d3b7d2ccdd79892ab86a2f87ac2ca6638f04bb0280321f1a9c/torchvision-0.23.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:a76fafe113b2977be3a21bf78f115438c1f88631d7a87203acb3dd6ae55889e6", size = 8627658, upload-time = "2025-08-06T14:58:15.999Z" }, + { url = 
"https://files.pythonhosted.org/packages/1f/e4/028a27b60aa578a2fa99d9d7334ff1871bb17008693ea055a2fdee96da0d/torchvision-0.23.0-cp313-cp313-win_amd64.whl", hash = "sha256:07d069cb29691ff566e3b7f11f20d91044f079e1dbdc9d72e0655899a9b06938", size = 1600749, upload-time = "2025-08-06T14:58:10.719Z" }, + { url = "https://files.pythonhosted.org/packages/05/35/72f91ad9ac7c19a849dedf083d347dc1123f0adeb401f53974f84f1d04c8/torchvision-0.23.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:2df618e1143805a7673aaf82cb5720dd9112d4e771983156aaf2ffff692eebf9", size = 2047192, upload-time = "2025-08-06T14:58:11.813Z" }, + { url = "https://files.pythonhosted.org/packages/1d/9d/406cea60a9eb9882145bcd62a184ee61e823e8e1d550cdc3c3ea866a9445/torchvision-0.23.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:2a3299d2b1d5a7aed2d3b6ffb69c672ca8830671967eb1cee1497bacd82fe47b", size = 2359295, upload-time = "2025-08-06T14:58:17.469Z" }, + { url = "https://files.pythonhosted.org/packages/2b/f4/34662f71a70fa1e59de99772142f22257ca750de05ccb400b8d2e3809c1d/torchvision-0.23.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:76bc4c0b63d5114aa81281390f8472a12a6a35ce9906e67ea6044e5af4cab60c", size = 8800474, upload-time = "2025-08-06T14:58:22.53Z" }, + { url = "https://files.pythonhosted.org/packages/6e/f5/b5a2d841a8d228b5dbda6d524704408e19e7ca6b7bb0f24490e081da1fa1/torchvision-0.23.0-cp313-cp313t-win_amd64.whl", hash = "sha256:b9e2dabf0da9c8aa9ea241afb63a8f3e98489e706b22ac3f30416a1be377153b", size = 1527667, upload-time = "2025-08-06T14:58:14.446Z" }, + { url = "https://files.pythonhosted.org/packages/d5/3e/f1f3bb3dd452b98ec2eba4820d777440abceb3d3a428a6c8243006fe47e5/torchvision-0.23.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:b190db205f90206c230fc2f91cbdfd5733334babc0e0d19bddb90a40b8cf26c2", size = 1856927, upload-time = "2025-08-06T14:58:18.919Z" }, + { url = "https://files.pythonhosted.org/packages/f4/e2/aafc6af854e792d212ff58e459f8d5d807568dc3f2b49ec41b677275e5a9/torchvision-0.23.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:6c74cbc1cbee26dd4f35f989cd80dccc40411f258dee476b29871dee4b483af0", size = 2392870, upload-time = "2025-08-06T14:58:21.303Z" }, + { url = "https://files.pythonhosted.org/packages/5d/06/09b6a917b3759ef000428af0aa2597f983e20d9fbbcfeb826750f778fe6d/torchvision-0.23.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:a9e9d7552d34547b80843eaf64ab0737b19b2e8bec2514286b8cfd30861ca8b5", size = 8630400, upload-time = "2025-08-06T14:58:24.139Z" }, + { url = "https://files.pythonhosted.org/packages/08/07/ae46106efbf4bbc0090078aa4c406c38282cbe4e637bdb4b7f2e984140af/torchvision-0.23.0-cp39-cp39-win_amd64.whl", hash = "sha256:dc7ce5accbbb8c9df9a79f8cef6a6df042f28e2250a6ae0d2ca70b06473fa03b", size = 1600751, upload-time = "2025-08-06T14:58:20.027Z" }, +] + [[package]] name = "tornado" version = "6.5.2"