docs: data updated

Author: Andy Lee
Date: 2025-09-15 19:50:02 -07:00
Commit: a0d6857faa (parent: d7011bbea0)
9 changed files with 749 additions and 133 deletions


@@ -45,9 +45,9 @@ This will:
# Basic retrieval evaluation
python evaluate_financebench.py --index data/index/financebench_full_hnsw.leann
# Include QA evaluation with OpenAI
export OPENAI_API_KEY="your-key"
python evaluate_financebench.py --index data/index/financebench_full_hnsw.leann --qa-samples 20
# RAG generation evaluation with Qwen3-8B
python evaluate_financebench.py --index data/index/financebench_full_hnsw.leann --stage 4 --complexity 64 --llm-backend hf --model-name Qwen/Qwen3-8B --output results_qwen3.json
```
## Evaluation Methods
@@ -85,6 +85,24 @@ LLM-based answer evaluation using GPT-4o:
*Note: A number match rate >100% indicates that multiple retrieved documents contain the same financial figures, which is expected behavior for financial data that appears across multiple document sections.*
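The exact counting is done in `evaluate_financebench.py`; as a rough illustration only (hypothetical `gold_figures` / `retrieved_docs`, assuming matches are summed once per retrieved document), a rate above 100% falls out naturally:

```python
# Illustration: summing figure matches across retrieved documents can exceed 100%.
gold_figures = ["$1,577", "2.3%"]                    # hypothetical expected numbers
retrieved_docs = [
    "Revenue was $1,577 million, up 2.3% ...",       # contains both figures
    "... full-year revenue of $1,577 million ...",   # repeats one figure
    "Segment margin improved year over year ...",    # contains neither
]

matches = sum(fig in doc for doc in retrieved_docs for fig in gold_figures)
match_rate = matches / len(gold_figures) * 100
print(f"Number match rate: {match_rate:.0f}%")       # 150% in this toy case
```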
### LEANN-RAG Generation Performance (Qwen3-8B)
- **Stage 4 (Index Comparison):**
  - Compact Index: 5.0 MB
  - Non-compact Index: 172.2 MB
  - **Storage Saving**: 97.1%
- **Search Performance**:
  - Non-compact (no recompute): 0.009s avg per query
  - Compact (with recompute): 2.203s avg per query
  - Speed ratio: 0.004x (see the calculation sketch below)
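The headline ratios above are simple arithmetic over the reported measurements; a minimal sketch with the values hard-coded from this list:

```python
# Reproduce the Stage 4 headline ratios from the measurements listed above.
compact_mb, non_compact_mb = 5.0, 172.2      # .index sizes reported by Stage 4
non_compact_s, compact_s = 0.009, 2.203      # avg search time per query (s)

storage_saving = (non_compact_mb - compact_mb) / non_compact_mb * 100
speed_ratio = non_compact_s / compact_s      # no-recompute time vs. recompute time

print(f"Storage saving: {storage_saving:.1f}%")  # 97.1%
print(f"Speed ratio:    {speed_ratio:.3f}x")     # 0.004x
```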

**Generation Evaluation (20 queries, complexity=64):**

- **Average Search Time**: 1.638s per query
- **Average Generation Time**: 45.957s per query
- **LLM Backend**: HuggingFace transformers
- **Model**: Qwen/Qwen3-8B (thinking model with `<think></think>` processing)
- **Total Questions Processed**: 20
## Options
```bash

evaluate_financebench.py

@@ -4,20 +4,25 @@ FinanceBench Evaluation Script - Modular Recall-based Evaluation
import argparse
import json
import logging
import os
import pickle
import time
from pathlib import Path
from typing import Optional
import numpy as np
import openai
# Import LEANN modules - this will bring in the modified faiss
from leann import LeannChat, LeannSearcher
# Import LEANN's modified faiss directly
from leann_backend_hnsw import faiss
from ..llm_utils import evaluate_rag, generate_hf, generate_vllm, load_hf_model, load_vllm_model
# Setup logging to reduce verbose output
logging.basicConfig(level=logging.WARNING)
logging.getLogger("leann.api").setLevel(logging.WARNING)
logging.getLogger("leann_backend_hnsw").setLevel(logging.WARNING)
class RecallEvaluator:
"""Stage 2: Evaluate Recall@3 (searcher vs baseline)"""
@@ -125,7 +130,6 @@ class FinanceBenchEvaluator:
def analyze_index_sizes(self) -> dict:
"""Analyze index sizes with and without embeddings"""
from pathlib import Path
print("📏 Analyzing index sizes...")
@@ -136,7 +140,6 @@ class FinanceBenchEvaluator:
sizes = {}
total_with_embeddings = 0
total_without_embeddings = 0
# Core index files
index_file = index_dir / f"{index_name}.index"
@@ -155,28 +158,14 @@ class FinanceBenchEvaluator:
sizes[name] = size_mb
total_with_embeddings += size_mb
# For pruned index calculation, exclude the main index file (contains embeddings)
if name != "index":
total_without_embeddings += size_mb
else:
sizes[name] = 0
# Estimate pruned index size (approximate)
# When embeddings are removed, the main index file becomes much smaller
# Rough estimate: graph structure is ~10-20% of full index size
estimated_pruned_index_size = sizes["index"] * 0.15 # Conservative estimate
total_without_embeddings += estimated_pruned_index_size
sizes["total_with_embeddings"] = total_with_embeddings
sizes["total_without_embeddings"] = total_without_embeddings
sizes["estimated_pruned_index"] = estimated_pruned_index_size
sizes["compression_ratio"] = (
total_without_embeddings / total_with_embeddings if total_with_embeddings > 0 else 0
)
sizes["index_only_mb"] = sizes["index"] # Just the .index file for fair comparison
print(f" 📁 Index with embeddings: {total_with_embeddings:.1f} MB")
print(f" 📁 Estimated pruned index: {total_without_embeddings:.1f} MB")
print(f" 🗜️ Compression ratio: {sizes['compression_ratio']:.2f}x")
print(f" 📁 Total index size: {total_with_embeddings:.1f} MB")
print(f" 📁 Index file only: {sizes['index']:.1f} MB")
return sizes
@@ -185,7 +174,6 @@ class FinanceBenchEvaluator:
print("🏗️ Building compact index from existing passages...")
# Load existing passages from current index
from pathlib import Path
from leann import LeannBuilder
@@ -241,7 +229,6 @@ class FinanceBenchEvaluator:
print("🏗️ Building non-compact index from existing passages...")
# Load existing passages from current index
from pathlib import Path
from leann import LeannBuilder
@@ -555,13 +542,7 @@ Respond with exactly one word: "CORRECT" if the generated answer is factually ac
# Legacy single index analysis (fallback)
if "total_with_embeddings" in timing_metrics and "current_index" not in timing_metrics:
print("\n📏 Index Size Analysis:")
print(
f" Index with embeddings: {timing_metrics.get('total_with_embeddings', 0):.1f} MB"
)
print(
f" Estimated pruned index: {timing_metrics.get('total_without_embeddings', 0):.1f} MB"
)
print(f" Compression ratio: {timing_metrics.get('compression_ratio', 0):.2f}x")
print(f" Total index size: {timing_metrics.get('total_with_embeddings', 0):.1f} MB")
print("\n📊 Accuracy:")
print(f" Accuracy: {timing_metrics.get('accuracy', 0) * 100:.1f}%")
@@ -610,6 +591,10 @@ def main():
parser.add_argument("--baseline-dir", default="baseline", help="Baseline output directory")
parser.add_argument("--openai-api-key", help="OpenAI API key for generation evaluation")
parser.add_argument("--output", help="Save results to JSON file")
parser.add_argument(
"--llm-backend", choices=["openai", "hf", "vllm"], default="openai", help="LLM backend"
)
parser.add_argument("--model-name", default="Qwen3-8B", help="Model name for HF/vLLM")
args = parser.parse_args()
@@ -768,7 +753,9 @@ def main():
print("🚀 Starting Stage 4: Comprehensive evaluation with dual index comparison")
# Use FinanceBench evaluator for QA evaluation
evaluator = FinanceBenchEvaluator(args.index, args.openai_api_key)
evaluator = FinanceBenchEvaluator(
args.index, args.openai_api_key if args.llm_backend == "openai" else None
)
print("📖 Loading FinanceBench dataset...")
data = evaluator.load_dataset(args.dataset)
@@ -802,20 +789,13 @@ def main():
print(
f" Non-compact index: {non_compact_size_metrics['total_with_embeddings']:.1f} MB"
)
_ = (
(
non_compact_size_metrics["total_with_embeddings"]
- compact_size_metrics["total_with_embeddings"]
)
/ compact_size_metrics["total_with_embeddings"]
* 100
)
print("\n📊 Index-only size comparison (.index file only):")
print(f" Compact index: {compact_size_metrics['index_only_mb']:.1f} MB")
print(f" Non-compact index: {non_compact_size_metrics['index_only_mb']:.1f} MB")
# Use index-only size for fair comparison (same as Enron emails)
storage_saving = (
(
non_compact_size_metrics["total_with_embeddings"]
- compact_size_metrics["total_with_embeddings"]
)
/ non_compact_size_metrics["total_with_embeddings"]
(non_compact_size_metrics["index_only_mb"] - compact_size_metrics["index_only_mb"])
/ non_compact_size_metrics["index_only_mb"]
* 100
)
print(f" Storage saving by compact: {storage_saving:.1f}%")
@@ -829,15 +809,58 @@ def main():
non_compact_index_path, args.index, data[:10], complexity=complexity
)
# Step 5: Timing breakdown evaluation WITH recompute (production mode)
# Step 5: Generation evaluation
test_samples = 20
print(f"\n🧪 Testing with first {test_samples} samples for timing analysis")
print(
"\n🔍🤖 Running timing breakdown evaluation (WITH recompute - production mode)..."
)
evaluation_start = time.time()
timing_metrics = evaluator.evaluate_timing_breakdown(data[:test_samples])
evaluation_time = time.time() - evaluation_start
print(f"\n🧪 Testing with first {test_samples} samples for generation analysis")
if args.llm_backend == "openai" and args.openai_api_key:
print("🔍🤖 Running OpenAI-based generation evaluation...")
evaluation_start = time.time()
timing_metrics = evaluator.evaluate_timing_breakdown(data[:test_samples])
evaluation_time = time.time() - evaluation_start
else:
print(
f"🔍🤖 Running {args.llm_backend} generation evaluation with {args.model_name}..."
)
evaluation_start = time.time()
try:
# Load LLM
if args.llm_backend == "hf":
tokenizer, model = load_hf_model(args.model_name)
def llm_func(prompt):
return generate_hf(tokenizer, model, prompt)
else: # vllm
llm, sampling_params = load_vllm_model(args.model_name)
def llm_func(prompt):
return generate_vllm(llm, sampling_params, prompt)
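# Both branches expose the same prompt -> generated-text callable,
# so the evaluation step below stays backend-agnostic.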
# Simple generation evaluation
queries = [item["question"] for item in data[:test_samples]]
gen_results = evaluate_rag(
evaluator.searcher,
llm_func,
queries,
domain="finance",
complexity=complexity,
)
timing_metrics = {
"total_questions": len(queries),
"avg_search_time": gen_results["avg_search_time"],
"avg_generation_time": gen_results["avg_generation_time"],
"results": gen_results["results"],
}
evaluation_time = time.time() - evaluation_start
except Exception as e:
print(f"❌ Generation evaluation failed: {e}")
timing_metrics = {
"total_questions": 0,
"avg_search_time": 0,
"avg_generation_time": 0,
}
evaluation_time = 0
# Combine all metrics
combined_metrics = {
@@ -849,8 +872,11 @@ def main():
"storage_saving_percent": storage_saving,
}
# Print comprehensive results
evaluator._print_results(combined_metrics)
# Print results
print("\n📊 Generation Results:")
print(f" Total Questions: {timing_metrics.get('total_questions', 0)}")
print(f" Avg Search Time: {timing_metrics.get('avg_search_time', 0):.3f}s")
print(f" Avg Generation Time: {timing_metrics.get('avg_generation_time', 0):.3f}s")
# Save results if requested
if args.output: