Compare commits
7 Commits
fix/pdf-du
...
feat/multi
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
5637270f02 | ||
|
|
1c690e4a8a | ||
|
|
07afe546ea | ||
|
|
ae3b8af3df | ||
|
|
a9c014df9e | ||
|
|
3766ad1fd2 | ||
|
|
c3aceed1e0 |
3
.gitignore
vendored
3
.gitignore
vendored
@@ -91,7 +91,8 @@ packages/leann-backend-diskann/third_party/DiskANN/_deps/
|
||||
|
||||
*.meta.json
|
||||
*.passages.json
|
||||
|
||||
*.npy
|
||||
*.db
|
||||
batchtest.py
|
||||
tests/__pytest_cache__/
|
||||
tests/__pycache__/
|
||||
|
||||
@@ -12,6 +12,7 @@ from pathlib import Path
|
||||
try:
|
||||
from leann.chunking_utils import (
|
||||
CODE_EXTENSIONS,
|
||||
_traditional_chunks_as_dicts,
|
||||
create_ast_chunks,
|
||||
create_text_chunks,
|
||||
create_traditional_chunks,
|
||||
@@ -25,6 +26,7 @@ except Exception: # pragma: no cover - best-effort fallback for dev environment
|
||||
sys.path.insert(0, str(leann_src))
|
||||
from leann.chunking_utils import (
|
||||
CODE_EXTENSIONS,
|
||||
_traditional_chunks_as_dicts,
|
||||
create_ast_chunks,
|
||||
create_text_chunks,
|
||||
create_traditional_chunks,
|
||||
@@ -36,6 +38,7 @@ except Exception: # pragma: no cover - best-effort fallback for dev environment
|
||||
|
||||
__all__ = [
|
||||
"CODE_EXTENSIONS",
|
||||
"_traditional_chunks_as_dicts",
|
||||
"create_ast_chunks",
|
||||
"create_text_chunks",
|
||||
"create_traditional_chunks",
|
||||
|
||||
132
apps/multimodal/vision-based-pdf-multi-vector/colqwen_forward.py
Executable file
132
apps/multimodal/vision-based-pdf-multi-vector/colqwen_forward.py
Executable file
@@ -0,0 +1,132 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Simple test script to test colqwen2 forward pass with a single image."""
|
||||
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Add the current directory to path to import leann_multi_vector
|
||||
sys.path.insert(0, str(Path(__file__).parent))
|
||||
|
||||
import torch
|
||||
from leann_multi_vector import _embed_images, _ensure_repo_paths_importable, _load_colvision
|
||||
from PIL import Image
|
||||
|
||||
# Ensure repo paths are importable
|
||||
_ensure_repo_paths_importable(__file__)
|
||||
|
||||
# Set environment variable
|
||||
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
||||
|
||||
|
||||
def create_test_image():
|
||||
"""Create a simple test image."""
|
||||
# Create a simple RGB image (800x600)
|
||||
img = Image.new("RGB", (800, 600), color="white")
|
||||
return img
|
||||
|
||||
|
||||
def load_test_image_from_file():
|
||||
"""Try to load an image from the indexes directory if available."""
|
||||
# Try to find an existing image in the indexes directory
|
||||
indexes_dir = Path(__file__).parent / "indexes"
|
||||
|
||||
# Look for images in common locations
|
||||
possible_paths = [
|
||||
indexes_dir / "vidore_fastplaid" / "images",
|
||||
indexes_dir / "colvision_large.leann.images",
|
||||
indexes_dir / "colvision.leann.images",
|
||||
]
|
||||
|
||||
for img_dir in possible_paths:
|
||||
if img_dir.exists():
|
||||
# Find first image file
|
||||
for ext in [".png", ".jpg", ".jpeg"]:
|
||||
for img_file in img_dir.glob(f"*{ext}"):
|
||||
print(f"Loading test image from: {img_file}")
|
||||
return Image.open(img_file)
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def main():
|
||||
print("=" * 60)
|
||||
print("Testing ColQwen2 Forward Pass")
|
||||
print("=" * 60)
|
||||
|
||||
# Step 1: Load or create test image
|
||||
print("\n[Step 1] Loading test image...")
|
||||
test_image = load_test_image_from_file()
|
||||
if test_image is None:
|
||||
print("No existing image found, creating a simple test image...")
|
||||
test_image = create_test_image()
|
||||
else:
|
||||
print(f"✓ Loaded image: {test_image.size} ({test_image.mode})")
|
||||
|
||||
# Convert to RGB if needed
|
||||
if test_image.mode != "RGB":
|
||||
test_image = test_image.convert("RGB")
|
||||
print(f"✓ Converted to RGB: {test_image.size}")
|
||||
|
||||
# Step 2: Load model
|
||||
print("\n[Step 2] Loading ColQwen2 model...")
|
||||
try:
|
||||
model_name, model, processor, device_str, device, dtype = _load_colvision("colqwen2")
|
||||
print(f"✓ Model loaded: {model_name}")
|
||||
print(f"✓ Device: {device_str}, dtype: {dtype}")
|
||||
|
||||
# Print model info
|
||||
if hasattr(model, "device"):
|
||||
print(f"✓ Model device: {model.device}")
|
||||
if hasattr(model, "dtype"):
|
||||
print(f"✓ Model dtype: {model.dtype}")
|
||||
|
||||
except Exception as e:
|
||||
print(f"✗ Error loading model: {e}")
|
||||
import traceback
|
||||
|
||||
traceback.print_exc()
|
||||
return
|
||||
|
||||
# Step 3: Test forward pass
|
||||
print("\n[Step 3] Running forward pass...")
|
||||
try:
|
||||
# Use the _embed_images function which handles batching and forward pass
|
||||
images = [test_image]
|
||||
print(f"Processing {len(images)} image(s)...")
|
||||
|
||||
doc_vecs = _embed_images(model, processor, images)
|
||||
|
||||
print("✓ Forward pass completed!")
|
||||
print(f"✓ Number of embeddings: {len(doc_vecs)}")
|
||||
|
||||
if len(doc_vecs) > 0:
|
||||
emb = doc_vecs[0]
|
||||
print(f"✓ Embedding shape: {emb.shape}")
|
||||
print(f"✓ Embedding dtype: {emb.dtype}")
|
||||
print("✓ Embedding stats:")
|
||||
print(f" - Min: {emb.min().item():.4f}")
|
||||
print(f" - Max: {emb.max().item():.4f}")
|
||||
print(f" - Mean: {emb.mean().item():.4f}")
|
||||
print(f" - Std: {emb.std().item():.4f}")
|
||||
|
||||
# Check for NaN or Inf
|
||||
if torch.isnan(emb).any():
|
||||
print("⚠ Warning: Embedding contains NaN values!")
|
||||
if torch.isinf(emb).any():
|
||||
print("⚠ Warning: Embedding contains Inf values!")
|
||||
|
||||
except Exception as e:
|
||||
print(f"✗ Error during forward pass: {e}")
|
||||
import traceback
|
||||
|
||||
traceback.print_exc()
|
||||
return
|
||||
|
||||
print("\n" + "=" * 60)
|
||||
print("Test completed successfully!")
|
||||
print("=" * 60)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,42 +1,75 @@
|
||||
## Jupyter-style notebook script
|
||||
# %%
|
||||
# uv pip install matplotlib qwen_vl_utils
|
||||
import argparse
|
||||
import faulthandler
|
||||
import os
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Any, Optional, cast
|
||||
import time
|
||||
from typing import Any, Optional
|
||||
|
||||
import numpy as np
|
||||
from PIL import Image
|
||||
from tqdm import tqdm
|
||||
|
||||
# Enable faulthandler to get stack trace on segfault
|
||||
faulthandler.enable()
|
||||
|
||||
def _ensure_repo_paths_importable(current_file: str) -> None:
|
||||
"""Make local leann packages importable without installing (mirrors multi-vector-leann.py)."""
|
||||
_repo_root = Path(current_file).resolve().parents[3]
|
||||
_leann_core_src = _repo_root / "packages" / "leann-core" / "src"
|
||||
_leann_hnsw_pkg = _repo_root / "packages" / "leann-backend-hnsw"
|
||||
if str(_leann_core_src) not in sys.path:
|
||||
sys.path.append(str(_leann_core_src))
|
||||
if str(_leann_hnsw_pkg) not in sys.path:
|
||||
sys.path.append(str(_leann_hnsw_pkg))
|
||||
|
||||
from leann_multi_vector import ( # utility functions/classes
|
||||
_ensure_repo_paths_importable,
|
||||
_load_images_from_dir,
|
||||
_maybe_convert_pdf_to_images,
|
||||
_load_colvision,
|
||||
_embed_images,
|
||||
_embed_queries,
|
||||
_build_index,
|
||||
_load_retriever_if_index_exists,
|
||||
_generate_similarity_map,
|
||||
_build_fast_plaid_index,
|
||||
_load_fast_plaid_index_if_exists,
|
||||
_search_fast_plaid,
|
||||
_get_fast_plaid_image,
|
||||
_get_fast_plaid_metadata,
|
||||
QwenVL,
|
||||
)
|
||||
|
||||
_ensure_repo_paths_importable(__file__)
|
||||
|
||||
from leann_multi_vector import LeannMultiVector # noqa: E402
|
||||
|
||||
# %%
|
||||
# Config
|
||||
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
||||
QUERY = "How does DeepSeek-V2 compare against the LLaMA family of LLMs?"
|
||||
QUERY = "The paper talk about the latent video generative model and data curation in the related work part?"
|
||||
MODEL: str = "colqwen2" # "colpali" or "colqwen2"
|
||||
|
||||
# Data source: set to True to use the Hugging Face dataset example (recommended)
|
||||
USE_HF_DATASET: bool = True
|
||||
# Single dataset name (used when DATASET_NAMES is None)
|
||||
DATASET_NAME: str = "weaviate/arXiv-AI-papers-multi-vector"
|
||||
DATASET_SPLIT: str = "train"
|
||||
# Multiple datasets to combine (if provided, DATASET_NAME is ignored)
|
||||
# Can be:
|
||||
# - List of strings: ["dataset1", "dataset2"]
|
||||
# - List of tuples: [("dataset1", "config1"), ("dataset2", None)] # None = no config needed
|
||||
# - Mixed: ["dataset1", ("dataset2", "config2")]
|
||||
#
|
||||
# Some potential datasets with images (may need IMAGE_FIELD_NAME adjustment):
|
||||
# - "weaviate/arXiv-AI-papers-multi-vector" (current, has "page_image" field)
|
||||
# - ("lmms-lab/DocVQA", "DocVQA") (has "image" field, document images, needs config)
|
||||
# - ("lmms-lab/DocVQA", "InfographicVQA") (has "image" field, infographic images)
|
||||
# - "pixparse/arxiv-papers" (if available, arXiv papers)
|
||||
# - "allenai/ai2d" (AI2D diagram dataset, has "image" field)
|
||||
# - "huggingface/document-images" (if available)
|
||||
# Note: Check dataset structure first - some may need IMAGE_FIELD_NAME specified
|
||||
# DATASET_NAMES: Optional[list[str | tuple[str, Optional[str]]]] = None
|
||||
DATASET_NAMES = [
|
||||
"weaviate/arXiv-AI-papers-multi-vector",
|
||||
("lmms-lab/DocVQA", "DocVQA"), # Specify config name for datasets with multiple configs
|
||||
]
|
||||
# Load multiple splits to get more data (e.g., ["train", "test", "validation"])
|
||||
# Set to None to try loading all available splits automatically
|
||||
DATASET_SPLITS: Optional[list[str]] = ["train", "test"] # None = auto-detect all splits
|
||||
# Image field name in the dataset (auto-detect if None)
|
||||
# Common names: "page_image", "image", "images", "img"
|
||||
IMAGE_FIELD_NAME: Optional[str] = None # None = auto-detect
|
||||
MAX_DOCS: Optional[int] = None # limit number of pages to index; None = all
|
||||
|
||||
# Local pages (used when USE_HF_DATASET == False)
|
||||
@@ -44,10 +77,13 @@ PDF: Optional[str] = None # e.g., "./pdfs/2004.12832v2.pdf"
|
||||
PAGES_DIR: str = "./pages"
|
||||
|
||||
# Index + retrieval settings
|
||||
INDEX_PATH: str = "./indexes/colvision.leann"
|
||||
TOPK: int = 1
|
||||
# Use a different index path for larger dataset to avoid overwriting existing index
|
||||
INDEX_PATH: str = "./indexes/colvision_large.leann"
|
||||
# Fast-Plaid index settings (alternative to LEANN index)
|
||||
# These are now command-line arguments (see CLI overrides section)
|
||||
TOPK: int = 3
|
||||
FIRST_STAGE_K: int = 500
|
||||
REBUILD_INDEX: bool = False
|
||||
REBUILD_INDEX: bool = True
|
||||
|
||||
# Artifacts
|
||||
SAVE_TOP_IMAGE: Optional[str] = "./figures/retrieved_page.png"
|
||||
@@ -55,369 +91,517 @@ SIMILARITY_MAP: bool = True
|
||||
SIM_TOKEN_IDX: int = 13 # -1 means auto-select the most salient token
|
||||
SIM_OUTPUT: str = "./figures/similarity_map.png"
|
||||
ANSWER: bool = True
|
||||
MAX_NEW_TOKENS: int = 128
|
||||
MAX_NEW_TOKENS: int = 1024
|
||||
|
||||
|
||||
# %%
|
||||
# Helpers
|
||||
def _natural_sort_key(name: str) -> int:
|
||||
m = re.search(r"\d+", name)
|
||||
return int(m.group()) if m else 0
|
||||
|
||||
|
||||
def _load_images_from_dir(pages_dir: str) -> tuple[list[str], list[Image.Image]]:
|
||||
filenames = [n for n in os.listdir(pages_dir) if n.lower().endswith((".png", ".jpg", ".jpeg"))]
|
||||
filenames = sorted(filenames, key=_natural_sort_key)
|
||||
filepaths = [os.path.join(pages_dir, n) for n in filenames]
|
||||
images = [Image.open(p) for p in filepaths]
|
||||
return filepaths, images
|
||||
|
||||
|
||||
def _maybe_convert_pdf_to_images(pdf_path: Optional[str], pages_dir: str, dpi: int = 200) -> None:
|
||||
if not pdf_path:
|
||||
return
|
||||
os.makedirs(pages_dir, exist_ok=True)
|
||||
try:
|
||||
from pdf2image import convert_from_path
|
||||
except Exception as e:
|
||||
raise RuntimeError(
|
||||
"pdf2image is required to convert PDF to images. Install via pip install pdf2image"
|
||||
) from e
|
||||
images = convert_from_path(pdf_path, dpi=dpi)
|
||||
for i, image in enumerate(images):
|
||||
image.save(os.path.join(pages_dir, f"page_{i + 1}.png"), "PNG")
|
||||
|
||||
|
||||
def _select_device_and_dtype():
|
||||
import torch
|
||||
from colpali_engine.utils.torch_utils import get_torch_device
|
||||
|
||||
device_str = (
|
||||
"cuda"
|
||||
if torch.cuda.is_available()
|
||||
else (
|
||||
"mps"
|
||||
if getattr(torch.backends, "mps", None) and torch.backends.mps.is_available()
|
||||
else "cpu"
|
||||
)
|
||||
)
|
||||
device = get_torch_device(device_str)
|
||||
# Stable dtype selection to avoid NaNs:
|
||||
# - CUDA: prefer bfloat16 if supported, else float16
|
||||
# - MPS: use float32 (fp16 on MPS can produce NaNs in some ops)
|
||||
# - CPU: float32
|
||||
if device_str == "cuda":
|
||||
dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
|
||||
try:
|
||||
torch.backends.cuda.matmul.allow_tf32 = True # Better stability/perf on Ampere+
|
||||
except Exception:
|
||||
pass
|
||||
elif device_str == "mps":
|
||||
dtype = torch.float32
|
||||
else:
|
||||
dtype = torch.float32
|
||||
return device_str, device, dtype
|
||||
|
||||
|
||||
def _load_colvision(model_choice: str):
|
||||
import torch
|
||||
from colpali_engine.models import ColPali, ColQwen2, ColQwen2Processor
|
||||
from colpali_engine.models.paligemma.colpali.processing_colpali import ColPaliProcessor
|
||||
from transformers.utils.import_utils import is_flash_attn_2_available
|
||||
|
||||
device_str, device, dtype = _select_device_and_dtype()
|
||||
|
||||
if model_choice == "colqwen2":
|
||||
model_name = "vidore/colqwen2-v1.0"
|
||||
# On CPU/MPS we must avoid flash-attn and stay eager; on CUDA prefer flash-attn if available
|
||||
attn_implementation = (
|
||||
"flash_attention_2"
|
||||
if (device_str == "cuda" and is_flash_attn_2_available())
|
||||
else "eager"
|
||||
)
|
||||
model = ColQwen2.from_pretrained(
|
||||
model_name,
|
||||
torch_dtype=torch.bfloat16,
|
||||
device_map=device,
|
||||
attn_implementation=attn_implementation,
|
||||
).eval()
|
||||
processor = ColQwen2Processor.from_pretrained(model_name)
|
||||
else:
|
||||
model_name = "vidore/colpali-v1.2"
|
||||
model = ColPali.from_pretrained(
|
||||
model_name,
|
||||
torch_dtype=torch.bfloat16,
|
||||
device_map=device,
|
||||
).eval()
|
||||
processor = cast(ColPaliProcessor, ColPaliProcessor.from_pretrained(model_name))
|
||||
|
||||
return model_name, model, processor, device_str, device, dtype
|
||||
|
||||
|
||||
def _embed_images(model, processor, images: list[Image.Image]) -> list[Any]:
|
||||
import torch
|
||||
from colpali_engine.utils.torch_utils import ListDataset
|
||||
from torch.utils.data import DataLoader
|
||||
|
||||
# Ensure deterministic eval and autocast for stability
|
||||
model.eval()
|
||||
|
||||
dataloader = DataLoader(
|
||||
dataset=ListDataset[Image.Image](images),
|
||||
batch_size=1,
|
||||
shuffle=False,
|
||||
collate_fn=lambda x: processor.process_images(x),
|
||||
)
|
||||
|
||||
doc_vecs: list[Any] = []
|
||||
for batch_doc in tqdm(dataloader, desc="Embedding images"):
|
||||
with torch.no_grad():
|
||||
batch_doc = {k: v.to(model.device) for k, v in batch_doc.items()}
|
||||
# autocast on CUDA for bf16/fp16; on CPU/MPS stay in fp32
|
||||
if model.device.type == "cuda":
|
||||
with torch.autocast(
|
||||
device_type="cuda",
|
||||
dtype=model.dtype if model.dtype.is_floating_point else torch.bfloat16,
|
||||
):
|
||||
embeddings_doc = model(**batch_doc)
|
||||
else:
|
||||
embeddings_doc = model(**batch_doc)
|
||||
doc_vecs.extend(list(torch.unbind(embeddings_doc.to("cpu"))))
|
||||
return doc_vecs
|
||||
|
||||
|
||||
def _embed_queries(model, processor, queries: list[str]) -> list[Any]:
|
||||
import torch
|
||||
from colpali_engine.utils.torch_utils import ListDataset
|
||||
from torch.utils.data import DataLoader
|
||||
|
||||
model.eval()
|
||||
|
||||
dataloader = DataLoader(
|
||||
dataset=ListDataset[str](queries),
|
||||
batch_size=1,
|
||||
shuffle=False,
|
||||
collate_fn=lambda x: processor.process_queries(x),
|
||||
)
|
||||
|
||||
q_vecs: list[Any] = []
|
||||
for batch_query in tqdm(dataloader, desc="Embedding queries"):
|
||||
with torch.no_grad():
|
||||
batch_query = {k: v.to(model.device) for k, v in batch_query.items()}
|
||||
if model.device.type == "cuda":
|
||||
with torch.autocast(
|
||||
device_type="cuda",
|
||||
dtype=model.dtype if model.dtype.is_floating_point else torch.bfloat16,
|
||||
):
|
||||
embeddings_query = model(**batch_query)
|
||||
else:
|
||||
embeddings_query = model(**batch_query)
|
||||
q_vecs.extend(list(torch.unbind(embeddings_query.to("cpu"))))
|
||||
return q_vecs
|
||||
|
||||
|
||||
def _build_index(index_path: str, doc_vecs: list[Any], filepaths: list[str]) -> LeannMultiVector:
|
||||
dim = int(doc_vecs[0].shape[-1])
|
||||
retriever = LeannMultiVector(index_path=index_path, dim=dim)
|
||||
retriever.create_collection()
|
||||
for i, vec in enumerate(doc_vecs):
|
||||
data = {
|
||||
"colbert_vecs": vec.float().numpy(),
|
||||
"doc_id": i,
|
||||
"filepath": filepaths[i],
|
||||
}
|
||||
retriever.insert(data)
|
||||
retriever.create_index()
|
||||
return retriever
|
||||
|
||||
|
||||
def _load_retriever_if_index_exists(index_path: str) -> Optional[LeannMultiVector]:
|
||||
index_base = Path(index_path)
|
||||
# Rough heuristic: index dir exists AND meta+labels files exist
|
||||
meta = index_base.parent / f"{index_base.name}.meta.json"
|
||||
labels = index_base.parent / f"{index_base.name}.labels.json"
|
||||
if index_base.exists() and meta.exists() and labels.exists():
|
||||
try:
|
||||
with open(meta, "r", encoding="utf-8") as f:
|
||||
meta_json = json.load(f)
|
||||
dim = int(meta_json.get("dimensions", 128))
|
||||
except Exception:
|
||||
dim = 128
|
||||
return LeannMultiVector(index_path=index_path, dim=dim)
|
||||
return None
|
||||
|
||||
|
||||
def _generate_similarity_map(
|
||||
model,
|
||||
processor,
|
||||
image: Image.Image,
|
||||
query: str,
|
||||
token_idx: Optional[int] = None,
|
||||
output_path: Optional[str] = None,
|
||||
) -> tuple[int, float]:
|
||||
import torch
|
||||
from colpali_engine.interpretability import (
|
||||
get_similarity_maps_from_embeddings,
|
||||
plot_similarity_map,
|
||||
)
|
||||
|
||||
batch_images = processor.process_images([image]).to(model.device)
|
||||
batch_queries = processor.process_queries([query]).to(model.device)
|
||||
|
||||
with torch.no_grad():
|
||||
image_embeddings = model.forward(**batch_images)
|
||||
query_embeddings = model.forward(**batch_queries)
|
||||
|
||||
n_patches = processor.get_n_patches(
|
||||
image_size=image.size,
|
||||
spatial_merge_size=getattr(model, "spatial_merge_size", None),
|
||||
)
|
||||
image_mask = processor.get_image_mask(batch_images)
|
||||
|
||||
batched_similarity_maps = get_similarity_maps_from_embeddings(
|
||||
image_embeddings=image_embeddings,
|
||||
query_embeddings=query_embeddings,
|
||||
n_patches=n_patches,
|
||||
image_mask=image_mask,
|
||||
)
|
||||
|
||||
similarity_maps = batched_similarity_maps[0]
|
||||
|
||||
# Determine token index if not provided: choose the token with highest max score
|
||||
if token_idx is None:
|
||||
per_token_max = similarity_maps.view(similarity_maps.shape[0], -1).max(dim=1).values
|
||||
token_idx = int(per_token_max.argmax().item())
|
||||
|
||||
max_sim_score = similarity_maps[token_idx, :, :].max().item()
|
||||
|
||||
if output_path:
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
fig, ax = plot_similarity_map(
|
||||
image=image,
|
||||
similarity_map=similarity_maps[token_idx],
|
||||
figsize=(14, 14),
|
||||
show_colorbar=False,
|
||||
)
|
||||
ax.set_title(f"Token #{token_idx}. MaxSim score: {max_sim_score:.2f}", fontsize=12)
|
||||
os.makedirs(os.path.dirname(output_path), exist_ok=True)
|
||||
plt.savefig(output_path, bbox_inches="tight")
|
||||
plt.close(fig)
|
||||
|
||||
return token_idx, float(max_sim_score)
|
||||
|
||||
|
||||
class QwenVL:
|
||||
def __init__(self, device: str):
|
||||
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
|
||||
from transformers.utils.import_utils import is_flash_attn_2_available
|
||||
|
||||
attn_implementation = "flash_attention_2" if is_flash_attn_2_available() else "eager"
|
||||
self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
|
||||
"Qwen/Qwen2.5-VL-3B-Instruct",
|
||||
torch_dtype="auto",
|
||||
device_map=device,
|
||||
attn_implementation=attn_implementation,
|
||||
)
|
||||
|
||||
min_pixels = 256 * 28 * 28
|
||||
max_pixels = 1280 * 28 * 28
|
||||
self.processor = AutoProcessor.from_pretrained(
|
||||
"Qwen/Qwen2.5-VL-3B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels
|
||||
)
|
||||
|
||||
def answer(self, query: str, images: list[Image.Image], max_new_tokens: int = 128) -> str:
|
||||
import base64
|
||||
from io import BytesIO
|
||||
|
||||
from qwen_vl_utils import process_vision_info
|
||||
|
||||
content = []
|
||||
for img in images:
|
||||
buffer = BytesIO()
|
||||
img.save(buffer, format="jpeg")
|
||||
img_base64 = base64.b64encode(buffer.getvalue()).decode("utf-8")
|
||||
content.append({"type": "image", "image": f"data:image;base64,{img_base64}"})
|
||||
content.append({"type": "text", "text": query})
|
||||
messages = [{"role": "user", "content": content}]
|
||||
|
||||
text = self.processor.apply_chat_template(
|
||||
messages, tokenize=False, add_generation_prompt=True
|
||||
)
|
||||
image_inputs, video_inputs = process_vision_info(messages)
|
||||
inputs = self.processor(
|
||||
text=[text], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt"
|
||||
)
|
||||
inputs = inputs.to(self.model.device)
|
||||
|
||||
generated_ids = self.model.generate(**inputs, max_new_tokens=max_new_tokens)
|
||||
generated_ids_trimmed = [
|
||||
out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
|
||||
]
|
||||
return self.processor.batch_decode(
|
||||
generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
|
||||
)[0]
|
||||
|
||||
# CLI overrides
|
||||
parser = argparse.ArgumentParser(description="Multi-vector LEANN similarity map demo")
|
||||
parser.add_argument(
|
||||
"--search-method",
|
||||
type=str,
|
||||
choices=["ann", "exact", "exact-all"],
|
||||
default="ann",
|
||||
help="Which search method to use: 'ann' (fast ANN), 'exact' (ANN + exact rerank), or 'exact-all' (exact over all docs).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--query",
|
||||
type=str,
|
||||
default=QUERY,
|
||||
help=f"Query string to search for. Default: '{QUERY}'",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--use-fast-plaid",
|
||||
action="store_true",
|
||||
default=False,
|
||||
help="Set to True to use fast-plaid instead of LEANN. Default: False",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--fast-plaid-index-path",
|
||||
type=str,
|
||||
default="./indexes/colvision_fastplaid",
|
||||
help="Path to the Fast-Plaid index. Default: './indexes/colvision_fastplaid'",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--topk",
|
||||
type=int,
|
||||
default=TOPK,
|
||||
help=f"Number of top results to retrieve. Default: {TOPK}",
|
||||
)
|
||||
cli_args, _unknown = parser.parse_known_args()
|
||||
SEARCH_METHOD: str = cli_args.search_method
|
||||
QUERY = cli_args.query # Override QUERY with CLI argument if provided
|
||||
USE_FAST_PLAID: bool = cli_args.use_fast_plaid
|
||||
FAST_PLAID_INDEX_PATH: str = cli_args.fast_plaid_index_path
|
||||
TOPK: int = cli_args.topk # Override TOPK with CLI argument if provided
|
||||
|
||||
# %%
|
||||
|
||||
# Step 1: Prepare data
|
||||
if USE_HF_DATASET:
|
||||
from datasets import load_dataset
|
||||
# Step 1: Check if we can skip data loading (index already exists)
|
||||
retriever: Optional[Any] = None
|
||||
fast_plaid_index: Optional[Any] = None
|
||||
need_to_build_index = REBUILD_INDEX
|
||||
|
||||
dataset = load_dataset(DATASET_NAME, split=DATASET_SPLIT)
|
||||
N = len(dataset) if MAX_DOCS is None else min(MAX_DOCS, len(dataset))
|
||||
filepaths: list[str] = []
|
||||
images: list[Image.Image] = []
|
||||
for i in tqdm(range(N), desc="Loading dataset", total=N ):
|
||||
p = dataset[i]
|
||||
# Compose a descriptive identifier for printing later
|
||||
identifier = f"arXiv:{p['paper_arxiv_id']}|title:{p['paper_title']}|page:{int(p['page_number'])}|id:{p['page_id']}"
|
||||
print(identifier)
|
||||
filepaths.append(identifier)
|
||||
images.append(p["page_image"]) # PIL Image
|
||||
if USE_FAST_PLAID:
|
||||
# Fast-Plaid index handling
|
||||
if not REBUILD_INDEX:
|
||||
try:
|
||||
fast_plaid_index = _load_fast_plaid_index_if_exists(FAST_PLAID_INDEX_PATH)
|
||||
if fast_plaid_index is not None:
|
||||
print(f"✓ Fast-Plaid index found at {FAST_PLAID_INDEX_PATH}")
|
||||
need_to_build_index = False
|
||||
else:
|
||||
print(f"Fast-Plaid index not found, will build new index")
|
||||
need_to_build_index = True
|
||||
except Exception as e:
|
||||
# If loading fails (e.g., memory error, corrupted index), rebuild
|
||||
print(f"Warning: Failed to load Fast-Plaid index: {e}")
|
||||
print("Will rebuild the index...")
|
||||
need_to_build_index = True
|
||||
fast_plaid_index = None
|
||||
else:
|
||||
print(f"REBUILD_INDEX=True, will rebuild Fast-Plaid index")
|
||||
need_to_build_index = True
|
||||
else:
|
||||
_maybe_convert_pdf_to_images(PDF, PAGES_DIR)
|
||||
filepaths, images = _load_images_from_dir(PAGES_DIR)
|
||||
if not images:
|
||||
raise RuntimeError(
|
||||
f"No images found in {PAGES_DIR}. Provide PDF path in PDF variable or ensure images exist."
|
||||
)
|
||||
# Original LEANN index handling
|
||||
if not REBUILD_INDEX:
|
||||
retriever = _load_retriever_if_index_exists(INDEX_PATH)
|
||||
if retriever is not None:
|
||||
print(f"✓ Index loaded from {INDEX_PATH}")
|
||||
print(f"✓ Images available at: {retriever._images_dir_path()}")
|
||||
need_to_build_index = False
|
||||
else:
|
||||
print(f"Index not found, will build new index")
|
||||
need_to_build_index = True
|
||||
else:
|
||||
print(f"REBUILD_INDEX=True, will rebuild index")
|
||||
need_to_build_index = True
|
||||
|
||||
# Step 2: Load data only if we need to build the index
|
||||
if need_to_build_index:
|
||||
print("Loading dataset...")
|
||||
if USE_HF_DATASET:
|
||||
from datasets import load_dataset, concatenate_datasets, DatasetDict
|
||||
|
||||
# Determine which datasets to load
|
||||
if DATASET_NAMES is not None:
|
||||
dataset_names_to_load = DATASET_NAMES
|
||||
print(f"Loading {len(dataset_names_to_load)} datasets: {dataset_names_to_load}")
|
||||
else:
|
||||
dataset_names_to_load = [DATASET_NAME]
|
||||
print(f"Loading single dataset: {DATASET_NAME}")
|
||||
|
||||
# Load and combine datasets
|
||||
all_datasets_to_concat = []
|
||||
|
||||
for dataset_entry in dataset_names_to_load:
|
||||
# Handle both string and tuple formats
|
||||
if isinstance(dataset_entry, tuple):
|
||||
dataset_name, config_name = dataset_entry
|
||||
else:
|
||||
dataset_name = dataset_entry
|
||||
config_name = None
|
||||
|
||||
print(f"\nProcessing dataset: {dataset_name}" + (f" (config: {config_name})" if config_name else ""))
|
||||
|
||||
# Load dataset to check available splits
|
||||
# If config_name is provided, use it; otherwise try without config
|
||||
try:
|
||||
if config_name:
|
||||
dataset_dict = load_dataset(dataset_name, config_name)
|
||||
else:
|
||||
dataset_dict = load_dataset(dataset_name)
|
||||
except ValueError as e:
|
||||
if "Config name is missing" in str(e):
|
||||
# Try to get available configs and suggest
|
||||
from datasets import get_dataset_config_names
|
||||
try:
|
||||
available_configs = get_dataset_config_names(dataset_name)
|
||||
raise ValueError(
|
||||
f"Dataset '{dataset_name}' requires a config name. "
|
||||
f"Available configs: {available_configs}. "
|
||||
f"Please specify as: ('{dataset_name}', 'config_name')"
|
||||
) from e
|
||||
except Exception:
|
||||
raise ValueError(
|
||||
f"Dataset '{dataset_name}' requires a config name. "
|
||||
f"Please specify as: ('{dataset_name}', 'config_name')"
|
||||
) from e
|
||||
raise
|
||||
|
||||
# Determine which splits to load
|
||||
if DATASET_SPLITS is None:
|
||||
# Auto-detect: try to load all available splits
|
||||
available_splits = list(dataset_dict.keys())
|
||||
print(f" Auto-detected splits: {available_splits}")
|
||||
splits_to_load = available_splits
|
||||
else:
|
||||
splits_to_load = DATASET_SPLITS
|
||||
|
||||
# Load and concatenate multiple splits for this dataset
|
||||
datasets_to_concat = []
|
||||
for split in splits_to_load:
|
||||
if split not in dataset_dict:
|
||||
print(f" Warning: Split '{split}' not found in dataset. Available splits: {list(dataset_dict.keys())}")
|
||||
continue
|
||||
split_dataset = dataset_dict[split]
|
||||
print(f" Loaded split '{split}': {len(split_dataset)} pages")
|
||||
datasets_to_concat.append(split_dataset)
|
||||
|
||||
if not datasets_to_concat:
|
||||
print(f" Warning: No valid splits found for {dataset_name}. Skipping.")
|
||||
continue
|
||||
|
||||
# Concatenate splits for this dataset
|
||||
if len(datasets_to_concat) > 1:
|
||||
combined_dataset = concatenate_datasets(datasets_to_concat)
|
||||
print(f" Concatenated {len(datasets_to_concat)} splits into {len(combined_dataset)} pages")
|
||||
else:
|
||||
combined_dataset = datasets_to_concat[0]
|
||||
|
||||
all_datasets_to_concat.append(combined_dataset)
|
||||
|
||||
if not all_datasets_to_concat:
|
||||
raise RuntimeError("No valid datasets or splits found.")
|
||||
|
||||
# Concatenate all datasets
|
||||
if len(all_datasets_to_concat) > 1:
|
||||
dataset = concatenate_datasets(all_datasets_to_concat)
|
||||
print(f"\nConcatenated {len(all_datasets_to_concat)} datasets into {len(dataset)} total pages")
|
||||
else:
|
||||
dataset = all_datasets_to_concat[0]
|
||||
|
||||
# Apply MAX_DOCS limit if specified
|
||||
N = len(dataset) if MAX_DOCS is None else min(MAX_DOCS, len(dataset))
|
||||
if N < len(dataset):
|
||||
print(f"Limiting to {N} pages (from {len(dataset)} total)")
|
||||
dataset = dataset.select(range(N))
|
||||
|
||||
# Auto-detect image field name if not specified
|
||||
if IMAGE_FIELD_NAME is None:
|
||||
# Check multiple samples to find the most common image field
|
||||
# (useful when datasets are merged and may have different field names)
|
||||
possible_image_fields = ["page_image", "image", "images", "img", "page", "document_image"]
|
||||
field_counts = {}
|
||||
|
||||
# Check first few samples to find image fields
|
||||
num_samples_to_check = min(10, len(dataset))
|
||||
for sample_idx in range(num_samples_to_check):
|
||||
sample = dataset[sample_idx]
|
||||
for field in possible_image_fields:
|
||||
if field in sample and sample[field] is not None:
|
||||
value = sample[field]
|
||||
if isinstance(value, Image.Image) or (hasattr(value, 'size') and hasattr(value, 'mode')):
|
||||
field_counts[field] = field_counts.get(field, 0) + 1
|
||||
|
||||
# Choose the most common field, or first found if tied
|
||||
if field_counts:
|
||||
image_field = max(field_counts.items(), key=lambda x: x[1])[0]
|
||||
print(f"Auto-detected image field: '{image_field}' (found in {field_counts[image_field]}/{num_samples_to_check} samples)")
|
||||
else:
|
||||
# Fallback: check first sample only
|
||||
sample = dataset[0]
|
||||
image_field = None
|
||||
for field in possible_image_fields:
|
||||
if field in sample:
|
||||
value = sample[field]
|
||||
if isinstance(value, Image.Image) or (hasattr(value, 'size') and hasattr(value, 'mode')):
|
||||
image_field = field
|
||||
break
|
||||
if image_field is None:
|
||||
raise RuntimeError(
|
||||
f"Could not auto-detect image field. Available fields: {list(sample.keys())}. "
|
||||
f"Please specify IMAGE_FIELD_NAME manually."
|
||||
)
|
||||
print(f"Auto-detected image field: '{image_field}'")
|
||||
else:
|
||||
image_field = IMAGE_FIELD_NAME
|
||||
if image_field not in dataset[0]:
|
||||
raise RuntimeError(
|
||||
f"Image field '{image_field}' not found. Available fields: {list(dataset[0].keys())}"
|
||||
)
|
||||
|
||||
filepaths: list[str] = []
|
||||
images: list[Image.Image] = []
|
||||
for i in tqdm(range(len(dataset)), desc="Loading dataset", total=len(dataset)):
|
||||
p = dataset[i]
|
||||
# Try to compose a descriptive identifier
|
||||
# Handle different dataset structures
|
||||
identifier_parts = []
|
||||
|
||||
# Helper function to safely get field value
|
||||
def safe_get(field_name, default=None):
|
||||
if field_name in p and p[field_name] is not None:
|
||||
return p[field_name]
|
||||
return default
|
||||
|
||||
# Try to get various identifier fields
|
||||
if safe_get("paper_arxiv_id"):
|
||||
identifier_parts.append(f"arXiv:{p['paper_arxiv_id']}")
|
||||
if safe_get("paper_title"):
|
||||
identifier_parts.append(f"title:{p['paper_title']}")
|
||||
if safe_get("page_number") is not None:
|
||||
try:
|
||||
identifier_parts.append(f"page:{int(p['page_number'])}")
|
||||
except (ValueError, TypeError):
|
||||
# If conversion fails, use the raw value or skip
|
||||
if p['page_number']:
|
||||
identifier_parts.append(f"page:{p['page_number']}")
|
||||
if safe_get("page_id"):
|
||||
identifier_parts.append(f"id:{p['page_id']}")
|
||||
elif safe_get("questionId"):
|
||||
identifier_parts.append(f"qid:{p['questionId']}")
|
||||
elif safe_get("docId"):
|
||||
identifier_parts.append(f"docId:{p['docId']}")
|
||||
elif safe_get("id"):
|
||||
identifier_parts.append(f"id:{p['id']}")
|
||||
|
||||
# If no identifier parts found, create one from index
|
||||
if identifier_parts:
|
||||
identifier = "|".join(identifier_parts)
|
||||
else:
|
||||
# Create identifier from available fields or index
|
||||
fallback_parts = []
|
||||
# Try common fields that might exist
|
||||
for field in ["ucsf_document_id", "docId", "questionId", "id"]:
|
||||
if safe_get(field):
|
||||
fallback_parts.append(f"{field}:{p[field]}")
|
||||
break
|
||||
if fallback_parts:
|
||||
identifier = "|".join(fallback_parts) + f"|idx:{i}"
|
||||
else:
|
||||
identifier = f"doc_{i}"
|
||||
|
||||
filepaths.append(identifier)
|
||||
|
||||
# Get image - try detected field first, then fallback to other common fields
|
||||
img = None
|
||||
if image_field in p and p[image_field] is not None:
|
||||
img = p[image_field]
|
||||
else:
|
||||
# Fallback: try other common image field names
|
||||
for fallback_field in ["image", "page_image", "images", "img"]:
|
||||
if fallback_field in p and p[fallback_field] is not None:
|
||||
img = p[fallback_field]
|
||||
break
|
||||
|
||||
if img is None:
|
||||
raise RuntimeError(
|
||||
f"No image found for sample {i}. Available fields: {list(p.keys())}. "
|
||||
f"Expected field: {image_field}"
|
||||
)
|
||||
|
||||
# Ensure it's a PIL Image
|
||||
if not isinstance(img, Image.Image):
|
||||
if hasattr(img, 'convert'):
|
||||
img = img.convert('RGB')
|
||||
else:
|
||||
img = Image.fromarray(img) if hasattr(img, '__array__') else Image.open(img)
|
||||
images.append(img)
|
||||
else:
|
||||
_maybe_convert_pdf_to_images(PDF, PAGES_DIR)
|
||||
filepaths, images = _load_images_from_dir(PAGES_DIR)
|
||||
if not images:
|
||||
raise RuntimeError(
|
||||
f"No images found in {PAGES_DIR}. Provide PDF path in PDF variable or ensure images exist."
|
||||
)
|
||||
print(f"Loaded {len(images)} images")
|
||||
|
||||
# Memory check before loading model
|
||||
try:
|
||||
import psutil
|
||||
import torch
|
||||
process = psutil.Process(os.getpid())
|
||||
mem_info = process.memory_info()
|
||||
print(f"Memory usage after loading images: {mem_info.rss / 1024 / 1024 / 1024:.2f} GB")
|
||||
if torch.cuda.is_available():
|
||||
print(f"GPU memory allocated: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
|
||||
print(f"GPU memory reserved: {torch.cuda.memory_reserved() / 1024**3:.2f} GB")
|
||||
except ImportError:
|
||||
pass
|
||||
else:
|
||||
print("Skipping dataset loading (using existing index)")
|
||||
filepaths = [] # Not needed when using existing index
|
||||
images = [] # Not needed when using existing index
|
||||
|
||||
|
||||
# %%
|
||||
# Step 2: Load model and processor
|
||||
model_name, model, processor, device_str, device, dtype = _load_colvision(MODEL)
|
||||
print(f"Using model={model_name}, device={device_str}, dtype={dtype}")
|
||||
# Step 3: Load model and processor (only if we need to build index or perform search)
|
||||
print("Step 3: Loading model and processor...")
|
||||
print(f" Model: {MODEL}")
|
||||
try:
|
||||
import sys
|
||||
print(f" Python version: {sys.version}")
|
||||
print(f" Python executable: {sys.executable}")
|
||||
|
||||
model_name, model, processor, device_str, device, dtype = _load_colvision(MODEL)
|
||||
print(f"✓ Using model={model_name}, device={device_str}, dtype={dtype}")
|
||||
|
||||
# Memory check after loading model
|
||||
try:
|
||||
import psutil
|
||||
import torch
|
||||
process = psutil.Process(os.getpid())
|
||||
mem_info = process.memory_info()
|
||||
print(f" Memory usage after loading model: {mem_info.rss / 1024 / 1024 / 1024:.2f} GB")
|
||||
if torch.cuda.is_available():
|
||||
print(f" GPU memory allocated: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
|
||||
print(f" GPU memory reserved: {torch.cuda.memory_reserved() / 1024**3:.2f} GB")
|
||||
except ImportError:
|
||||
pass
|
||||
except Exception as e:
|
||||
print(f"✗ Error loading model: {type(e).__name__}: {e}")
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
raise
|
||||
|
||||
|
||||
# %%
|
||||
|
||||
# %%
|
||||
# Step 3: Build or load index
|
||||
retriever: Optional[LeannMultiVector] = None
|
||||
if not REBUILD_INDEX:
|
||||
retriever = _load_retriever_if_index_exists(INDEX_PATH)
|
||||
# Step 4: Build index if needed
|
||||
if need_to_build_index:
|
||||
print("Step 4: Building index...")
|
||||
print(f" Number of images: {len(images)}")
|
||||
print(f" Number of filepaths: {len(filepaths)}")
|
||||
|
||||
if retriever is None:
|
||||
doc_vecs = _embed_images(model, processor, images)
|
||||
retriever = _build_index(INDEX_PATH, doc_vecs, filepaths)
|
||||
try:
|
||||
print(" Embedding images...")
|
||||
doc_vecs = _embed_images(model, processor, images)
|
||||
print(f" Embedded {len(doc_vecs)} documents")
|
||||
print(f" First doc vec shape: {doc_vecs[0].shape if len(doc_vecs) > 0 else 'N/A'}")
|
||||
except Exception as e:
|
||||
print(f"Error embedding images: {type(e).__name__}: {e}")
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
raise
|
||||
|
||||
if USE_FAST_PLAID:
|
||||
# Build Fast-Plaid index
|
||||
print(" Building Fast-Plaid index...")
|
||||
try:
|
||||
fast_plaid_index, build_secs = _build_fast_plaid_index(
|
||||
FAST_PLAID_INDEX_PATH, doc_vecs, filepaths, images
|
||||
)
|
||||
from pathlib import Path
|
||||
print(f"✓ Fast-Plaid index built in {build_secs:.3f}s")
|
||||
print(f"✓ Index saved to: {FAST_PLAID_INDEX_PATH}")
|
||||
print(f"✓ Images saved to: {Path(FAST_PLAID_INDEX_PATH) / 'images'}")
|
||||
except Exception as e:
|
||||
print(f"Error building Fast-Plaid index: {type(e).__name__}: {e}")
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
raise
|
||||
finally:
|
||||
# Clear memory
|
||||
print(" Clearing memory...")
|
||||
del images, filepaths, doc_vecs
|
||||
else:
|
||||
# Build original LEANN index
|
||||
try:
|
||||
retriever = _build_index(INDEX_PATH, doc_vecs, filepaths, images)
|
||||
print(f"✓ Index built and images saved to: {retriever._images_dir_path()}")
|
||||
except Exception as e:
|
||||
print(f"Error building LEANN index: {type(e).__name__}: {e}")
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
raise
|
||||
finally:
|
||||
# Clear memory
|
||||
print(" Clearing memory...")
|
||||
del images, filepaths, doc_vecs
|
||||
|
||||
# Note: Images are now stored separately, retriever/fast_plaid_index will reference them
|
||||
|
||||
|
||||
# %%
|
||||
# Step 4: Embed query and search
|
||||
# Step 5: Embed query and search
|
||||
_t0 = time.perf_counter()
|
||||
q_vec = _embed_queries(model, processor, [QUERY])[0]
|
||||
results = retriever.search(q_vec.float().numpy(), topk=TOPK, first_stage_k=FIRST_STAGE_K)
|
||||
query_embed_secs = time.perf_counter() - _t0
|
||||
|
||||
print(f"[Search] Method: {SEARCH_METHOD}")
|
||||
print(f"[Timing] Query embedding: {query_embed_secs:.3f}s")
|
||||
|
||||
# Run the selected search method and time it
|
||||
if USE_FAST_PLAID:
|
||||
# Fast-Plaid search
|
||||
if fast_plaid_index is None:
|
||||
fast_plaid_index = _load_fast_plaid_index_if_exists(FAST_PLAID_INDEX_PATH)
|
||||
if fast_plaid_index is None:
|
||||
raise RuntimeError(f"Fast-Plaid index not found at {FAST_PLAID_INDEX_PATH}")
|
||||
|
||||
results, search_secs = _search_fast_plaid(fast_plaid_index, q_vec, TOPK)
|
||||
print(f"[Timing] Fast-Plaid Search: {search_secs:.3f}s")
|
||||
else:
|
||||
# Original LEANN search
|
||||
query_np = q_vec.float().numpy()
|
||||
|
||||
if SEARCH_METHOD == "ann":
|
||||
results = retriever.search(query_np, topk=TOPK, first_stage_k=FIRST_STAGE_K)
|
||||
search_secs = time.perf_counter() - _t0
|
||||
print(f"[Timing] Search (ANN): {search_secs:.3f}s (first_stage_k={FIRST_STAGE_K})")
|
||||
elif SEARCH_METHOD == "exact":
|
||||
results = retriever.search_exact(query_np, topk=TOPK, first_stage_k=FIRST_STAGE_K)
|
||||
search_secs = time.perf_counter() - _t0
|
||||
print(f"[Timing] Search (Exact rerank): {search_secs:.3f}s (first_stage_k={FIRST_STAGE_K})")
|
||||
elif SEARCH_METHOD == "exact-all":
|
||||
results = retriever.search_exact_all(query_np, topk=TOPK)
|
||||
search_secs = time.perf_counter() - _t0
|
||||
print(f"[Timing] Search (Exact all): {search_secs:.3f}s")
|
||||
else:
|
||||
results = []
|
||||
if not results:
|
||||
print("No results found.")
|
||||
else:
|
||||
print(f'Top {len(results)} results for query: "{QUERY}"')
|
||||
print("\n[DEBUG] Retrieval details:")
|
||||
top_images: list[Image.Image] = []
|
||||
image_hashes = {} # Track image hashes to detect duplicates
|
||||
|
||||
for rank, (score, doc_id) in enumerate(results, start=1):
|
||||
path = filepaths[doc_id]
|
||||
# For HF dataset, path is a descriptive identifier, not a real file path
|
||||
print(f"{rank}) MaxSim: {score:.4f}, Page: {path}")
|
||||
top_images.append(images[doc_id])
|
||||
# Retrieve image and metadata based on index type
|
||||
if USE_FAST_PLAID:
|
||||
# Fast-Plaid: load image and get metadata
|
||||
image = _get_fast_plaid_image(FAST_PLAID_INDEX_PATH, doc_id)
|
||||
if image is None:
|
||||
print(f"Warning: Could not find image for doc_id {doc_id}")
|
||||
continue
|
||||
|
||||
metadata = _get_fast_plaid_metadata(FAST_PLAID_INDEX_PATH, doc_id)
|
||||
path = metadata.get("filepath", f"doc_{doc_id}") if metadata else f"doc_{doc_id}"
|
||||
top_images.append(image)
|
||||
else:
|
||||
# Original LEANN: retrieve from retriever
|
||||
image = retriever.get_image(doc_id)
|
||||
if image is None:
|
||||
print(f"Warning: Could not retrieve image for doc_id {doc_id}")
|
||||
continue
|
||||
|
||||
metadata = retriever.get_metadata(doc_id)
|
||||
path = metadata.get("filepath", "unknown") if metadata else "unknown"
|
||||
top_images.append(image)
|
||||
|
||||
# Calculate image hash to detect duplicates
|
||||
import hashlib
|
||||
import io
|
||||
# Convert image to bytes for hashing
|
||||
img_bytes = io.BytesIO()
|
||||
image.save(img_bytes, format='PNG')
|
||||
image_bytes = img_bytes.getvalue()
|
||||
image_hash = hashlib.md5(image_bytes).hexdigest()[:8]
|
||||
|
||||
# Check if this image was already seen
|
||||
duplicate_info = ""
|
||||
if image_hash in image_hashes:
|
||||
duplicate_info = f" [DUPLICATE of rank {image_hashes[image_hash]}]"
|
||||
else:
|
||||
image_hashes[image_hash] = rank
|
||||
|
||||
# Print detailed information
|
||||
print(f"{rank}) doc_id={doc_id}, MaxSim={score:.4f}, Page={path}, ImageHash={image_hash}{duplicate_info}")
|
||||
if metadata:
|
||||
print(f" Metadata: {metadata}")
|
||||
|
||||
if SAVE_TOP_IMAGE:
|
||||
from pathlib import Path as _Path
|
||||
@@ -430,12 +614,17 @@ else:
|
||||
else:
|
||||
out_path = base / f"retrieved_page_rank{rank}.png"
|
||||
img.save(str(out_path))
|
||||
print(f"Saved retrieved page (rank {rank}) to: {out_path}")
|
||||
# Print the retrieval score (document-level MaxSim) alongside the saved path
|
||||
try:
|
||||
score, _doc_id = results[rank - 1]
|
||||
print(f"Saved retrieved page (rank {rank}) [MaxSim={score:.4f}] to: {out_path}")
|
||||
except Exception:
|
||||
print(f"Saved retrieved page (rank {rank}) to: {out_path}")
|
||||
|
||||
## TODO stange results of second page of DeepSeek-V2 rather than the first page
|
||||
|
||||
# %%
|
||||
# Step 5: Similarity maps for top-K results
|
||||
# Step 6: Similarity maps for top-K results
|
||||
if results and SIMILARITY_MAP:
|
||||
token_idx = None if SIM_TOKEN_IDX < 0 else int(SIM_TOKEN_IDX)
|
||||
from pathlib import Path as _Path
|
||||
@@ -472,9 +661,12 @@ if results and SIMILARITY_MAP:
|
||||
|
||||
|
||||
# %%
|
||||
# Step 6: Optional answer generation
|
||||
# Step 7: Optional answer generation
|
||||
if results and ANSWER:
|
||||
qwen = QwenVL(device=device_str)
|
||||
_t0 = time.perf_counter()
|
||||
response = qwen.answer(QUERY, top_images[:TOPK], max_new_tokens=MAX_NEW_TOKENS)
|
||||
gen_secs = time.perf_counter() - _t0
|
||||
print(f"[Timing] Generation: {gen_secs:.3f}s")
|
||||
print("\nAnswer:")
|
||||
print(response)
|
||||
|
||||
@@ -0,0 +1,399 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Modular script to reproduce NDCG results for ViDoRe v1 benchmark.
|
||||
|
||||
This script uses the interface from leann_multi_vector.py to:
|
||||
1. Download ViDoRe v1 datasets
|
||||
2. Build indexes (LEANN or Fast-Plaid)
|
||||
3. Perform retrieval
|
||||
4. Evaluate using NDCG metrics
|
||||
|
||||
Usage:
|
||||
# Evaluate all ViDoRe v1 tasks
|
||||
python vidore_v1_benchmark.py --model colqwen2 --tasks all
|
||||
|
||||
# Evaluate specific task
|
||||
python vidore_v1_benchmark.py --model colqwen2 --task VidoreArxivQARetrieval
|
||||
|
||||
# Use Fast-Plaid index
|
||||
python vidore_v1_benchmark.py --model colqwen2 --use-fast-plaid --fast-plaid-index-path ./indexes/vidore_fastplaid
|
||||
|
||||
# Rebuild index
|
||||
python vidore_v1_benchmark.py --model colqwen2 --rebuild-index
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
from typing import Optional
|
||||
|
||||
from datasets import load_dataset
|
||||
from leann_multi_vector import (
|
||||
ViDoReBenchmarkEvaluator,
|
||||
_ensure_repo_paths_importable,
|
||||
)
|
||||
|
||||
_ensure_repo_paths_importable(__file__)
|
||||
|
||||
# ViDoRe v1 task configurations
|
||||
# Prompts match MTEB task metadata prompts
|
||||
VIDORE_V1_TASKS = {
|
||||
"VidoreArxivQARetrieval": {
|
||||
"dataset_path": "vidore/arxivqa_test_subsampled_beir",
|
||||
"revision": "7d94d570960eac2408d3baa7a33f9de4822ae3e4",
|
||||
"prompt": {"query": "Find a screenshot that relevant to the user's question."},
|
||||
},
|
||||
"VidoreDocVQARetrieval": {
|
||||
"dataset_path": "vidore/docvqa_test_subsampled_beir",
|
||||
"revision": "162ba2fc1a8437eda8b6c37b240bc1c0f0deb092",
|
||||
"prompt": {"query": "Find a screenshot that relevant to the user's question."},
|
||||
},
|
||||
"VidoreInfoVQARetrieval": {
|
||||
"dataset_path": "vidore/infovqa_test_subsampled_beir",
|
||||
"revision": "b802cc5fd6c605df2d673a963667d74881d2c9a4",
|
||||
"prompt": {"query": "Find a screenshot that relevant to the user's question."},
|
||||
},
|
||||
"VidoreTabfquadRetrieval": {
|
||||
"dataset_path": "vidore/tabfquad_test_subsampled_beir",
|
||||
"revision": "61a2224bcd29b7b261a4892ff4c8bea353527a31",
|
||||
"prompt": {"query": "Find a screenshot that relevant to the user's question."},
|
||||
},
|
||||
"VidoreTatdqaRetrieval": {
|
||||
"dataset_path": "vidore/tatdqa_test_beir",
|
||||
"revision": "5feb5630fdff4d8d189ffedb2dba56862fdd45c0",
|
||||
"prompt": {"query": "Find a screenshot that relevant to the user's question."},
|
||||
},
|
||||
"VidoreShiftProjectRetrieval": {
|
||||
"dataset_path": "vidore/shiftproject_test_beir",
|
||||
"revision": "84a382e05c4473fed9cff2bbae95fe2379416117",
|
||||
"prompt": {"query": "Find a screenshot that relevant to the user's question."},
|
||||
},
|
||||
"VidoreSyntheticDocQAAIRetrieval": {
|
||||
"dataset_path": "vidore/syntheticDocQA_artificial_intelligence_test_beir",
|
||||
"revision": "2d9ebea5a1c6e9ef4a3b902a612f605dca11261c",
|
||||
"prompt": {"query": "Find a screenshot that relevant to the user's question."},
|
||||
},
|
||||
"VidoreSyntheticDocQAEnergyRetrieval": {
|
||||
"dataset_path": "vidore/syntheticDocQA_energy_test_beir",
|
||||
"revision": "9935aadbad5c8deec30910489db1b2c7133ae7a7",
|
||||
"prompt": {"query": "Find a screenshot that relevant to the user's question."},
|
||||
},
|
||||
"VidoreSyntheticDocQAGovernmentReportsRetrieval": {
|
||||
"dataset_path": "vidore/syntheticDocQA_government_reports_test_beir",
|
||||
"revision": "b4909afa930f81282fd20601e860668073ad02aa",
|
||||
"prompt": {"query": "Find a screenshot that relevant to the user's question."},
|
||||
},
|
||||
"VidoreSyntheticDocQAHealthcareIndustryRetrieval": {
|
||||
"dataset_path": "vidore/syntheticDocQA_healthcare_industry_test_beir",
|
||||
"revision": "f9e25d5b6e13e1ad9f5c3cce202565031b3ab164",
|
||||
"prompt": {"query": "Find a screenshot that relevant to the user's question."},
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
def load_vidore_v1_data(
|
||||
dataset_path: str,
|
||||
revision: Optional[str] = None,
|
||||
split: str = "test",
|
||||
):
|
||||
"""
|
||||
Load ViDoRe v1 dataset.
|
||||
|
||||
Returns:
|
||||
corpus: dict mapping corpus_id to PIL Image
|
||||
queries: dict mapping query_id to query text
|
||||
qrels: dict mapping query_id to dict of {corpus_id: relevance_score}
|
||||
"""
|
||||
print(f"Loading dataset: {dataset_path} (split={split})")
|
||||
|
||||
# Load queries
|
||||
query_ds = load_dataset(dataset_path, "queries", split=split, revision=revision)
|
||||
|
||||
queries = {}
|
||||
for row in query_ds:
|
||||
query_id = f"query-{split}-{row['query-id']}"
|
||||
queries[query_id] = row["query"]
|
||||
|
||||
# Load corpus (images)
|
||||
corpus_ds = load_dataset(dataset_path, "corpus", split=split, revision=revision)
|
||||
|
||||
corpus = {}
|
||||
for row in corpus_ds:
|
||||
corpus_id = f"corpus-{split}-{row['corpus-id']}"
|
||||
# Extract image from the dataset row
|
||||
if "image" in row:
|
||||
corpus[corpus_id] = row["image"]
|
||||
elif "page_image" in row:
|
||||
corpus[corpus_id] = row["page_image"]
|
||||
else:
|
||||
raise ValueError(
|
||||
f"No image field found in corpus. Available fields: {list(row.keys())}"
|
||||
)
|
||||
|
||||
# Load qrels (relevance judgments)
|
||||
qrels_ds = load_dataset(dataset_path, "qrels", split=split, revision=revision)
|
||||
|
||||
qrels = {}
|
||||
for row in qrels_ds:
|
||||
query_id = f"query-{split}-{row['query-id']}"
|
||||
corpus_id = f"corpus-{split}-{row['corpus-id']}"
|
||||
if query_id not in qrels:
|
||||
qrels[query_id] = {}
|
||||
qrels[query_id][corpus_id] = int(row["score"])
|
||||
|
||||
print(
|
||||
f"Loaded {len(queries)} queries, {len(corpus)} corpus items, {len(qrels)} query-relevance mappings"
|
||||
)
|
||||
|
||||
# Filter qrels to only include queries that exist
|
||||
qrels = {qid: rel_docs for qid, rel_docs in qrels.items() if qid in queries}
|
||||
|
||||
# Filter out queries without any relevant documents (matching MTEB behavior)
|
||||
# This is important for correct NDCG calculation
|
||||
qrels_filtered = {qid: rel_docs for qid, rel_docs in qrels.items() if len(rel_docs) > 0}
|
||||
queries_filtered = {
|
||||
qid: query_text for qid, query_text in queries.items() if qid in qrels_filtered
|
||||
}
|
||||
|
||||
print(
|
||||
f"After filtering queries without positives: {len(queries_filtered)} queries, {len(qrels_filtered)} query-relevance mappings"
|
||||
)
|
||||
|
||||
return corpus, queries_filtered, qrels_filtered
|
||||
|
||||
|
||||
def evaluate_task(
|
||||
task_name: str,
|
||||
model_name: str,
|
||||
index_path: str,
|
||||
use_fast_plaid: bool = False,
|
||||
fast_plaid_index_path: Optional[str] = None,
|
||||
rebuild_index: bool = False,
|
||||
top_k: int = 1000,
|
||||
first_stage_k: int = 500,
|
||||
k_values: Optional[list[int]] = None,
|
||||
output_dir: Optional[str] = None,
|
||||
):
|
||||
"""
|
||||
Evaluate a single ViDoRe v1 task.
|
||||
"""
|
||||
print(f"\n{'=' * 80}")
|
||||
print(f"Evaluating task: {task_name}")
|
||||
print(f"{'=' * 80}")
|
||||
|
||||
# Get task config
|
||||
if task_name not in VIDORE_V1_TASKS:
|
||||
raise ValueError(f"Unknown task: {task_name}. Available: {list(VIDORE_V1_TASKS.keys())}")
|
||||
|
||||
task_config = VIDORE_V1_TASKS[task_name]
|
||||
dataset_path = task_config["dataset_path"]
|
||||
revision = task_config["revision"]
|
||||
|
||||
# Load data
|
||||
corpus, queries, qrels = load_vidore_v1_data(
|
||||
dataset_path=dataset_path,
|
||||
revision=revision,
|
||||
split="test",
|
||||
)
|
||||
|
||||
# Initialize k_values if not provided
|
||||
if k_values is None:
|
||||
k_values = [1, 3, 5, 10, 20, 100, 1000]
|
||||
|
||||
# Check if we have any queries
|
||||
if len(queries) == 0:
|
||||
print(f"\nWarning: No queries found for task {task_name}. Skipping evaluation.")
|
||||
# Return zero scores
|
||||
scores = {}
|
||||
for k in k_values:
|
||||
scores[f"ndcg_at_{k}"] = 0.0
|
||||
scores[f"map_at_{k}"] = 0.0
|
||||
scores[f"recall_at_{k}"] = 0.0
|
||||
scores[f"precision_at_{k}"] = 0.0
|
||||
scores[f"mrr_at_{k}"] = 0.0
|
||||
return scores
|
||||
|
||||
# Initialize evaluator
|
||||
evaluator = ViDoReBenchmarkEvaluator(
|
||||
model_name=model_name,
|
||||
use_fast_plaid=use_fast_plaid,
|
||||
top_k=top_k,
|
||||
first_stage_k=first_stage_k,
|
||||
k_values=k_values,
|
||||
)
|
||||
|
||||
# Build or load index
|
||||
index_path_full = index_path if not use_fast_plaid else fast_plaid_index_path
|
||||
if index_path_full is None:
|
||||
index_path_full = f"./indexes/{task_name}_{model_name}"
|
||||
if use_fast_plaid:
|
||||
index_path_full = f"./indexes/{task_name}_{model_name}_fastplaid"
|
||||
|
||||
index_or_retriever, corpus_ids_ordered = evaluator.build_index_from_corpus(
|
||||
corpus=corpus,
|
||||
index_path=index_path_full,
|
||||
rebuild=rebuild_index,
|
||||
)
|
||||
|
||||
# Search queries
|
||||
task_prompt = task_config.get("prompt")
|
||||
results = evaluator.search_queries(
|
||||
queries=queries,
|
||||
corpus_ids=corpus_ids_ordered,
|
||||
index_or_retriever=index_or_retriever,
|
||||
fast_plaid_index_path=fast_plaid_index_path,
|
||||
task_prompt=task_prompt,
|
||||
)
|
||||
|
||||
# Evaluate
|
||||
scores = evaluator.evaluate_results(results, qrels, k_values=k_values)
|
||||
|
||||
# Print results
|
||||
print(f"\n{'=' * 80}")
|
||||
print(f"Results for {task_name}:")
|
||||
print(f"{'=' * 80}")
|
||||
for metric, value in scores.items():
|
||||
if isinstance(value, (int, float)):
|
||||
print(f" {metric}: {value:.5f}")
|
||||
|
||||
# Save results
|
||||
if output_dir:
|
||||
os.makedirs(output_dir, exist_ok=True)
|
||||
results_file = os.path.join(output_dir, f"{task_name}_results.json")
|
||||
scores_file = os.path.join(output_dir, f"{task_name}_scores.json")
|
||||
|
||||
with open(results_file, "w") as f:
|
||||
json.dump(results, f, indent=2)
|
||||
print(f"\nSaved results to: {results_file}")
|
||||
|
||||
with open(scores_file, "w") as f:
|
||||
json.dump(scores, f, indent=2)
|
||||
print(f"Saved scores to: {scores_file}")
|
||||
|
||||
return scores
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Evaluate ViDoRe v1 benchmark using LEANN/Fast-Plaid indexing"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--model",
|
||||
type=str,
|
||||
default="colqwen2",
|
||||
choices=["colqwen2", "colpali"],
|
||||
help="Model to use",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--task",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Specific task to evaluate (or 'all' for all tasks)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--tasks",
|
||||
type=str,
|
||||
default="all",
|
||||
help="Tasks to evaluate: 'all' or comma-separated list",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--index-path",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Path to LEANN index (auto-generated if not provided)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--use-fast-plaid",
|
||||
action="store_true",
|
||||
help="Use Fast-Plaid instead of LEANN",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--fast-plaid-index-path",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Path to Fast-Plaid index (auto-generated if not provided)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--rebuild-index",
|
||||
action="store_true",
|
||||
help="Rebuild index even if it exists",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--top-k",
|
||||
type=int,
|
||||
default=1000,
|
||||
help="Top-k results to retrieve (MTEB default is max(k_values)=1000)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--first-stage-k",
|
||||
type=int,
|
||||
default=500,
|
||||
help="First stage k for LEANN search",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--k-values",
|
||||
type=str,
|
||||
default="1,3,5,10,20,100,1000",
|
||||
help="Comma-separated k values for evaluation (e.g., '1,3,5,10,100')",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output-dir",
|
||||
type=str,
|
||||
default="./vidore_v1_results",
|
||||
help="Output directory for results",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# Parse k_values
|
||||
k_values = [int(k.strip()) for k in args.k_values.split(",")]
|
||||
|
||||
# Determine tasks to evaluate
|
||||
if args.task:
|
||||
tasks_to_eval = [args.task]
|
||||
elif args.tasks.lower() == "all":
|
||||
tasks_to_eval = list(VIDORE_V1_TASKS.keys())
|
||||
else:
|
||||
tasks_to_eval = [t.strip() for t in args.tasks.split(",")]
|
||||
|
||||
print(f"Tasks to evaluate: {tasks_to_eval}")
|
||||
|
||||
# Evaluate each task
|
||||
all_scores = {}
|
||||
for task_name in tasks_to_eval:
|
||||
try:
|
||||
scores = evaluate_task(
|
||||
task_name=task_name,
|
||||
model_name=args.model,
|
||||
index_path=args.index_path,
|
||||
use_fast_plaid=args.use_fast_plaid,
|
||||
fast_plaid_index_path=args.fast_plaid_index_path,
|
||||
rebuild_index=args.rebuild_index,
|
||||
top_k=args.top_k,
|
||||
first_stage_k=args.first_stage_k,
|
||||
k_values=k_values,
|
||||
output_dir=args.output_dir,
|
||||
)
|
||||
all_scores[task_name] = scores
|
||||
except Exception as e:
|
||||
print(f"\nError evaluating {task_name}: {e}")
|
||||
import traceback
|
||||
|
||||
traceback.print_exc()
|
||||
continue
|
||||
|
||||
# Print summary
|
||||
if all_scores:
|
||||
print(f"\n{'=' * 80}")
|
||||
print("SUMMARY")
|
||||
print(f"{'=' * 80}")
|
||||
for task_name, scores in all_scores.items():
|
||||
print(f"\n{task_name}:")
|
||||
# Print main metrics
|
||||
for metric in ["ndcg_at_5", "ndcg_at_10", "ndcg_at_100", "map_at_10", "recall_at_10"]:
|
||||
if metric in scores:
|
||||
print(f" {metric}: {scores[metric]:.5f}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,439 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Modular script to reproduce NDCG results for ViDoRe v2 benchmark.
|
||||
|
||||
This script uses the interface from leann_multi_vector.py to:
|
||||
1. Download ViDoRe v2 datasets
|
||||
2. Build indexes (LEANN or Fast-Plaid)
|
||||
3. Perform retrieval
|
||||
4. Evaluate using NDCG metrics
|
||||
|
||||
Usage:
|
||||
# Evaluate all ViDoRe v2 tasks
|
||||
python vidore_v2_benchmark.py --model colqwen2 --tasks all
|
||||
|
||||
# Evaluate specific task
|
||||
python vidore_v2_benchmark.py --model colqwen2 --task Vidore2ESGReportsRetrieval
|
||||
|
||||
# Use Fast-Plaid index
|
||||
python vidore_v2_benchmark.py --model colqwen2 --use-fast-plaid --fast-plaid-index-path ./indexes/vidore_fastplaid
|
||||
|
||||
# Rebuild index
|
||||
python vidore_v2_benchmark.py --model colqwen2 --rebuild-index
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
from typing import Optional
|
||||
|
||||
from datasets import load_dataset
|
||||
from leann_multi_vector import (
|
||||
ViDoReBenchmarkEvaluator,
|
||||
_ensure_repo_paths_importable,
|
||||
)
|
||||
|
||||
_ensure_repo_paths_importable(__file__)
|
||||
|
||||
# Language name to dataset language field value mapping
|
||||
# Dataset uses ISO 639-3 + ISO 15924 format (e.g., "eng-Latn")
|
||||
LANGUAGE_MAPPING = {
|
||||
"english": "eng-Latn",
|
||||
"french": "fra-Latn",
|
||||
"spanish": "spa-Latn",
|
||||
"german": "deu-Latn",
|
||||
}
|
||||
|
||||
# ViDoRe v2 task configurations
|
||||
# Prompts match MTEB task metadata prompts
|
||||
VIDORE_V2_TASKS = {
|
||||
"Vidore2ESGReportsRetrieval": {
|
||||
"dataset_path": "vidore/esg_reports_v2",
|
||||
"revision": "0542c0d03da0ec1c8cbc517c8d78e7e95c75d3d3",
|
||||
"languages": ["french", "spanish", "english", "german"],
|
||||
"prompt": {"query": "Find a screenshot that relevant to the user's question."},
|
||||
},
|
||||
"Vidore2EconomicsReportsRetrieval": {
|
||||
"dataset_path": "vidore/economics_reports_v2",
|
||||
"revision": "b3e3a04b07fbbaffe79be49dabf92f691fbca252",
|
||||
"languages": ["french", "spanish", "english", "german"],
|
||||
"prompt": {"query": "Find a screenshot that relevant to the user's question."},
|
||||
},
|
||||
"Vidore2BioMedicalLecturesRetrieval": {
|
||||
"dataset_path": "vidore/biomedical_lectures_v2",
|
||||
"revision": "a29202f0da409034d651614d87cd8938d254e2ea",
|
||||
"languages": ["french", "spanish", "english", "german"],
|
||||
"prompt": {"query": "Find a screenshot that relevant to the user's question."},
|
||||
},
|
||||
"Vidore2ESGReportsHLRetrieval": {
|
||||
"dataset_path": "vidore/esg_reports_human_labeled_v2",
|
||||
"revision": "6d467dedb09a75144ede1421747e47cf036857dd",
|
||||
# Note: This dataset doesn't have language filtering - all queries are English
|
||||
"languages": None, # No language filtering needed
|
||||
"prompt": {"query": "Find a screenshot that relevant to the user's question."},
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
def load_vidore_v2_data(
|
||||
dataset_path: str,
|
||||
revision: Optional[str] = None,
|
||||
split: str = "test",
|
||||
language: Optional[str] = None,
|
||||
):
|
||||
"""
|
||||
Load ViDoRe v2 dataset.
|
||||
|
||||
Returns:
|
||||
corpus: dict mapping corpus_id to PIL Image
|
||||
queries: dict mapping query_id to query text
|
||||
qrels: dict mapping query_id to dict of {corpus_id: relevance_score}
|
||||
"""
|
||||
print(f"Loading dataset: {dataset_path} (split={split}, language={language})")
|
||||
|
||||
# Load queries
|
||||
query_ds = load_dataset(dataset_path, "queries", split=split, revision=revision)
|
||||
|
||||
# Check if dataset has language field before filtering
|
||||
has_language_field = len(query_ds) > 0 and "language" in query_ds.column_names
|
||||
|
||||
if language and has_language_field:
|
||||
# Map language name to dataset language field value (e.g., "english" -> "eng-Latn")
|
||||
dataset_language = LANGUAGE_MAPPING.get(language, language)
|
||||
query_ds_filtered = query_ds.filter(lambda x: x.get("language") == dataset_language)
|
||||
# Check if filtering resulted in empty dataset
|
||||
if len(query_ds_filtered) == 0:
|
||||
print(
|
||||
f"Warning: No queries found after filtering by language '{language}' (mapped to '{dataset_language}')."
|
||||
)
|
||||
# Try with original language value (dataset might use simple names like 'english')
|
||||
print(f"Trying with original language value '{language}'...")
|
||||
query_ds_filtered = query_ds.filter(lambda x: x.get("language") == language)
|
||||
if len(query_ds_filtered) == 0:
|
||||
# Try to get a sample to see actual language values
|
||||
try:
|
||||
sample_ds = load_dataset(
|
||||
dataset_path, "queries", split=split, revision=revision
|
||||
)
|
||||
if len(sample_ds) > 0 and "language" in sample_ds.column_names:
|
||||
sample_langs = set(sample_ds["language"])
|
||||
print(f"Available language values in dataset: {sample_langs}")
|
||||
except Exception:
|
||||
pass
|
||||
else:
|
||||
print(
|
||||
f"Found {len(query_ds_filtered)} queries using original language value '{language}'"
|
||||
)
|
||||
query_ds = query_ds_filtered
|
||||
|
||||
queries = {}
|
||||
for row in query_ds:
|
||||
query_id = f"query-{split}-{row['query-id']}"
|
||||
queries[query_id] = row["query"]
|
||||
|
||||
# Load corpus (images)
|
||||
corpus_ds = load_dataset(dataset_path, "corpus", split=split, revision=revision)
|
||||
|
||||
corpus = {}
|
||||
for row in corpus_ds:
|
||||
corpus_id = f"corpus-{split}-{row['corpus-id']}"
|
||||
# Extract image from the dataset row
|
||||
if "image" in row:
|
||||
corpus[corpus_id] = row["image"]
|
||||
elif "page_image" in row:
|
||||
corpus[corpus_id] = row["page_image"]
|
||||
else:
|
||||
raise ValueError(
|
||||
f"No image field found in corpus. Available fields: {list(row.keys())}"
|
||||
)
|
||||
|
||||
# Load qrels (relevance judgments)
|
||||
qrels_ds = load_dataset(dataset_path, "qrels", split=split, revision=revision)
|
||||
|
||||
qrels = {}
|
||||
for row in qrels_ds:
|
||||
query_id = f"query-{split}-{row['query-id']}"
|
||||
corpus_id = f"corpus-{split}-{row['corpus-id']}"
|
||||
if query_id not in qrels:
|
||||
qrels[query_id] = {}
|
||||
qrels[query_id][corpus_id] = int(row["score"])
|
||||
|
||||
print(
|
||||
f"Loaded {len(queries)} queries, {len(corpus)} corpus items, {len(qrels)} query-relevance mappings"
|
||||
)
|
||||
|
||||
# Filter qrels to only include queries that exist
|
||||
qrels = {qid: rel_docs for qid, rel_docs in qrels.items() if qid in queries}
|
||||
|
||||
# Filter out queries without any relevant documents (matching MTEB behavior)
|
||||
# This is important for correct NDCG calculation
|
||||
qrels_filtered = {qid: rel_docs for qid, rel_docs in qrels.items() if len(rel_docs) > 0}
|
||||
queries_filtered = {
|
||||
qid: query_text for qid, query_text in queries.items() if qid in qrels_filtered
|
||||
}
|
||||
|
||||
print(
|
||||
f"After filtering queries without positives: {len(queries_filtered)} queries, {len(qrels_filtered)} query-relevance mappings"
|
||||
)
|
||||
|
||||
return corpus, queries_filtered, qrels_filtered
|
||||
|
||||
|
||||
def evaluate_task(
|
||||
task_name: str,
|
||||
model_name: str,
|
||||
index_path: str,
|
||||
use_fast_plaid: bool = False,
|
||||
fast_plaid_index_path: Optional[str] = None,
|
||||
language: Optional[str] = None,
|
||||
rebuild_index: bool = False,
|
||||
top_k: int = 100,
|
||||
first_stage_k: int = 500,
|
||||
k_values: Optional[list[int]] = None,
|
||||
output_dir: Optional[str] = None,
|
||||
):
|
||||
"""
|
||||
Evaluate a single ViDoRe v2 task.
|
||||
"""
|
||||
print(f"\n{'=' * 80}")
|
||||
print(f"Evaluating task: {task_name}")
|
||||
print(f"{'=' * 80}")
|
||||
|
||||
# Get task config
|
||||
if task_name not in VIDORE_V2_TASKS:
|
||||
raise ValueError(f"Unknown task: {task_name}. Available: {list(VIDORE_V2_TASKS.keys())}")
|
||||
|
||||
task_config = VIDORE_V2_TASKS[task_name]
|
||||
dataset_path = task_config["dataset_path"]
|
||||
revision = task_config["revision"]
|
||||
|
||||
# Determine language
|
||||
if language is None:
|
||||
# Use first language if multiple available
|
||||
languages = task_config.get("languages")
|
||||
if languages is None:
|
||||
# Task doesn't support language filtering (e.g., Vidore2ESGReportsHLRetrieval)
|
||||
language = None
|
||||
elif len(languages) == 1:
|
||||
language = languages[0]
|
||||
else:
|
||||
language = None
|
||||
|
||||
# Initialize k_values if not provided
|
||||
if k_values is None:
|
||||
k_values = [1, 3, 5, 10, 100]
|
||||
|
||||
# Load data
|
||||
corpus, queries, qrels = load_vidore_v2_data(
|
||||
dataset_path=dataset_path,
|
||||
revision=revision,
|
||||
split="test",
|
||||
language=language,
|
||||
)
|
||||
|
||||
# Check if we have any queries
|
||||
if len(queries) == 0:
|
||||
print(
|
||||
f"\nWarning: No queries found for task {task_name} with language {language}. Skipping evaluation."
|
||||
)
|
||||
# Return zero scores
|
||||
scores = {}
|
||||
for k in k_values:
|
||||
scores[f"ndcg_at_{k}"] = 0.0
|
||||
scores[f"map_at_{k}"] = 0.0
|
||||
scores[f"recall_at_{k}"] = 0.0
|
||||
scores[f"precision_at_{k}"] = 0.0
|
||||
scores[f"mrr_at_{k}"] = 0.0
|
||||
return scores
|
||||
|
||||
# Initialize evaluator
|
||||
evaluator = ViDoReBenchmarkEvaluator(
|
||||
model_name=model_name,
|
||||
use_fast_plaid=use_fast_plaid,
|
||||
top_k=top_k,
|
||||
first_stage_k=first_stage_k,
|
||||
k_values=k_values,
|
||||
)
|
||||
|
||||
# Build or load index
|
||||
index_path_full = index_path if not use_fast_plaid else fast_plaid_index_path
|
||||
if index_path_full is None:
|
||||
index_path_full = f"./indexes/{task_name}_{model_name}"
|
||||
if use_fast_plaid:
|
||||
index_path_full = f"./indexes/{task_name}_{model_name}_fastplaid"
|
||||
|
||||
index_or_retriever, corpus_ids_ordered = evaluator.build_index_from_corpus(
|
||||
corpus=corpus,
|
||||
index_path=index_path_full,
|
||||
rebuild=rebuild_index,
|
||||
)
|
||||
|
||||
# Search queries
|
||||
task_prompt = task_config.get("prompt")
|
||||
results = evaluator.search_queries(
|
||||
queries=queries,
|
||||
corpus_ids=corpus_ids_ordered,
|
||||
index_or_retriever=index_or_retriever,
|
||||
fast_plaid_index_path=fast_plaid_index_path,
|
||||
task_prompt=task_prompt,
|
||||
)
|
||||
|
||||
# Evaluate
|
||||
scores = evaluator.evaluate_results(results, qrels, k_values=k_values)
|
||||
|
||||
# Print results
|
||||
print(f"\n{'=' * 80}")
|
||||
print(f"Results for {task_name}:")
|
||||
print(f"{'=' * 80}")
|
||||
for metric, value in scores.items():
|
||||
if isinstance(value, (int, float)):
|
||||
print(f" {metric}: {value:.5f}")
|
||||
|
||||
# Save results
|
||||
if output_dir:
|
||||
os.makedirs(output_dir, exist_ok=True)
|
||||
results_file = os.path.join(output_dir, f"{task_name}_results.json")
|
||||
scores_file = os.path.join(output_dir, f"{task_name}_scores.json")
|
||||
|
||||
with open(results_file, "w") as f:
|
||||
json.dump(results, f, indent=2)
|
||||
print(f"\nSaved results to: {results_file}")
|
||||
|
||||
with open(scores_file, "w") as f:
|
||||
json.dump(scores, f, indent=2)
|
||||
print(f"Saved scores to: {scores_file}")
|
||||
|
||||
return scores
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Evaluate ViDoRe v2 benchmark using LEANN/Fast-Plaid indexing"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--model",
|
||||
type=str,
|
||||
default="colqwen2",
|
||||
choices=["colqwen2", "colpali"],
|
||||
help="Model to use",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--task",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Specific task to evaluate (or 'all' for all tasks)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--tasks",
|
||||
type=str,
|
||||
default="all",
|
||||
help="Tasks to evaluate: 'all' or comma-separated list",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--index-path",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Path to LEANN index (auto-generated if not provided)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--use-fast-plaid",
|
||||
action="store_true",
|
||||
help="Use Fast-Plaid instead of LEANN",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--fast-plaid-index-path",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Path to Fast-Plaid index (auto-generated if not provided)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--rebuild-index",
|
||||
action="store_true",
|
||||
help="Rebuild index even if it exists",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--language",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Language to evaluate (default: first available)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--top-k",
|
||||
type=int,
|
||||
default=100,
|
||||
help="Top-k results to retrieve",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--first-stage-k",
|
||||
type=int,
|
||||
default=500,
|
||||
help="First stage k for LEANN search",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--k-values",
|
||||
type=str,
|
||||
default="1,3,5,10,100",
|
||||
help="Comma-separated k values for evaluation (e.g., '1,3,5,10,100')",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output-dir",
|
||||
type=str,
|
||||
default="./vidore_v2_results",
|
||||
help="Output directory for results",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# Parse k_values
|
||||
k_values = [int(k.strip()) for k in args.k_values.split(",")]
|
||||
|
||||
# Determine tasks to evaluate
|
||||
if args.task:
|
||||
tasks_to_eval = [args.task]
|
||||
elif args.tasks.lower() == "all":
|
||||
tasks_to_eval = list(VIDORE_V2_TASKS.keys())
|
||||
else:
|
||||
tasks_to_eval = [t.strip() for t in args.tasks.split(",")]
|
||||
|
||||
print(f"Tasks to evaluate: {tasks_to_eval}")
|
||||
|
||||
# Evaluate each task
|
||||
all_scores = {}
|
||||
for task_name in tasks_to_eval:
|
||||
try:
|
||||
scores = evaluate_task(
|
||||
task_name=task_name,
|
||||
model_name=args.model,
|
||||
index_path=args.index_path,
|
||||
use_fast_plaid=args.use_fast_plaid,
|
||||
fast_plaid_index_path=args.fast_plaid_index_path,
|
||||
language=args.language,
|
||||
rebuild_index=args.rebuild_index,
|
||||
top_k=args.top_k,
|
||||
first_stage_k=args.first_stage_k,
|
||||
k_values=k_values,
|
||||
output_dir=args.output_dir,
|
||||
)
|
||||
all_scores[task_name] = scores
|
||||
except Exception as e:
|
||||
print(f"\nError evaluating {task_name}: {e}")
|
||||
import traceback
|
||||
|
||||
traceback.print_exc()
|
||||
continue
|
||||
|
||||
# Print summary
|
||||
if all_scores:
|
||||
print(f"\n{'=' * 80}")
|
||||
print("SUMMARY")
|
||||
print(f"{'=' * 80}")
|
||||
for task_name, scores in all_scores.items():
|
||||
print(f"\n{task_name}:")
|
||||
# Print main metrics
|
||||
for metric in ["ndcg_at_5", "ndcg_at_10", "ndcg_at_100", "map_at_10", "recall_at_10"]:
|
||||
if metric in scores:
|
||||
print(f" {metric}: {scores[metric]:.5f}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -5,12 +5,15 @@ Packaged within leann-core so installed wheels can import it reliably.
|
||||
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
from typing import Any, Optional
|
||||
|
||||
from llama_index.core.node_parser import SentenceSplitter
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Flag to ensure AST token warning only shown once per session
|
||||
_ast_token_warning_shown = False
|
||||
|
||||
|
||||
def estimate_token_count(text: str) -> int:
|
||||
"""
|
||||
@@ -174,37 +177,44 @@ def create_ast_chunks(
|
||||
max_chunk_size: int = 512,
|
||||
chunk_overlap: int = 64,
|
||||
metadata_template: str = "default",
|
||||
) -> list[str]:
|
||||
) -> list[dict[str, Any]]:
|
||||
"""Create AST-aware chunks from code documents using astchunk.
|
||||
|
||||
Falls back to traditional chunking if astchunk is unavailable.
|
||||
|
||||
Returns:
|
||||
List of dicts with {"text": str, "metadata": dict}
|
||||
"""
|
||||
try:
|
||||
from astchunk import ASTChunkBuilder # optional dependency
|
||||
except ImportError as e:
|
||||
logger.error(f"astchunk not available: {e}")
|
||||
logger.info("Falling back to traditional chunking for code files")
|
||||
return create_traditional_chunks(documents, max_chunk_size, chunk_overlap)
|
||||
return _traditional_chunks_as_dicts(documents, max_chunk_size, chunk_overlap)
|
||||
|
||||
all_chunks = []
|
||||
for doc in documents:
|
||||
language = doc.metadata.get("language")
|
||||
if not language:
|
||||
logger.warning("No language detected; falling back to traditional chunking")
|
||||
all_chunks.extend(create_traditional_chunks([doc], max_chunk_size, chunk_overlap))
|
||||
all_chunks.extend(_traditional_chunks_as_dicts([doc], max_chunk_size, chunk_overlap))
|
||||
continue
|
||||
|
||||
try:
|
||||
# Warn if AST chunk size + overlap might exceed common token limits
|
||||
# Warn once if AST chunk size + overlap might exceed common token limits
|
||||
# Note: Actual truncation happens at embedding time with dynamic model limits
|
||||
global _ast_token_warning_shown
|
||||
estimated_max_tokens = int(
|
||||
(max_chunk_size + chunk_overlap) * 1.2
|
||||
) # Conservative estimate
|
||||
if estimated_max_tokens > 512:
|
||||
if estimated_max_tokens > 512 and not _ast_token_warning_shown:
|
||||
logger.warning(
|
||||
f"AST chunk size ({max_chunk_size}) + overlap ({chunk_overlap}) = {max_chunk_size + chunk_overlap} chars "
|
||||
f"may exceed 512 token limit (~{estimated_max_tokens} tokens estimated). "
|
||||
f"Consider reducing --ast-chunk-size to {int(400 / 1.2)} or --ast-chunk-overlap to {int(50 / 1.2)}"
|
||||
f"Consider reducing --ast-chunk-size to {int(400 / 1.2)} or --ast-chunk-overlap to {int(50 / 1.2)}. "
|
||||
f"Note: Chunks will be auto-truncated at embedding time based on your model's actual token limit."
|
||||
)
|
||||
_ast_token_warning_shown = True
|
||||
|
||||
configs = {
|
||||
"max_chunk_size": max_chunk_size,
|
||||
@@ -229,17 +239,40 @@ def create_ast_chunks(
|
||||
|
||||
chunks = chunk_builder.chunkify(code_content)
|
||||
for chunk in chunks:
|
||||
chunk_text = None
|
||||
astchunk_metadata = {}
|
||||
|
||||
if hasattr(chunk, "text"):
|
||||
chunk_text = chunk.text
|
||||
elif isinstance(chunk, dict) and "text" in chunk:
|
||||
chunk_text = chunk["text"]
|
||||
elif isinstance(chunk, str):
|
||||
chunk_text = chunk
|
||||
elif isinstance(chunk, dict):
|
||||
# Handle astchunk format: {"content": "...", "metadata": {...}}
|
||||
if "content" in chunk:
|
||||
chunk_text = chunk["content"]
|
||||
astchunk_metadata = chunk.get("metadata", {})
|
||||
elif "text" in chunk:
|
||||
chunk_text = chunk["text"]
|
||||
else:
|
||||
chunk_text = str(chunk) # Last resort
|
||||
else:
|
||||
chunk_text = str(chunk)
|
||||
|
||||
if chunk_text and chunk_text.strip():
|
||||
all_chunks.append(chunk_text.strip())
|
||||
# Extract document-level metadata
|
||||
doc_metadata = {
|
||||
"file_path": doc.metadata.get("file_path", ""),
|
||||
"file_name": doc.metadata.get("file_name", ""),
|
||||
}
|
||||
if "creation_date" in doc.metadata:
|
||||
doc_metadata["creation_date"] = doc.metadata["creation_date"]
|
||||
if "last_modified_date" in doc.metadata:
|
||||
doc_metadata["last_modified_date"] = doc.metadata["last_modified_date"]
|
||||
|
||||
# Merge document metadata + astchunk metadata
|
||||
combined_metadata = {**doc_metadata, **astchunk_metadata}
|
||||
|
||||
all_chunks.append({"text": chunk_text.strip(), "metadata": combined_metadata})
|
||||
|
||||
logger.info(
|
||||
f"Created {len(chunks)} AST chunks from {language} file: {doc.metadata.get('file_name', 'unknown')}"
|
||||
@@ -247,15 +280,19 @@ def create_ast_chunks(
|
||||
except Exception as e:
|
||||
logger.warning(f"AST chunking failed for {language} file: {e}")
|
||||
logger.info("Falling back to traditional chunking")
|
||||
all_chunks.extend(create_traditional_chunks([doc], max_chunk_size, chunk_overlap))
|
||||
all_chunks.extend(_traditional_chunks_as_dicts([doc], max_chunk_size, chunk_overlap))
|
||||
|
||||
return all_chunks
|
||||
|
||||
|
||||
def create_traditional_chunks(
|
||||
documents, chunk_size: int = 256, chunk_overlap: int = 128
|
||||
) -> list[str]:
|
||||
"""Create traditional text chunks using LlamaIndex SentenceSplitter."""
|
||||
) -> list[dict[str, Any]]:
|
||||
"""Create traditional text chunks using LlamaIndex SentenceSplitter.
|
||||
|
||||
Returns:
|
||||
List of dicts with {"text": str, "metadata": dict}
|
||||
"""
|
||||
if chunk_size <= 0:
|
||||
logger.warning(f"Invalid chunk_size={chunk_size}, using default value of 256")
|
||||
chunk_size = 256
|
||||
@@ -271,19 +308,40 @@ def create_traditional_chunks(
|
||||
paragraph_separator="\n\n",
|
||||
)
|
||||
|
||||
all_texts = []
|
||||
result = []
|
||||
for doc in documents:
|
||||
# Extract document-level metadata
|
||||
doc_metadata = {
|
||||
"file_path": doc.metadata.get("file_path", ""),
|
||||
"file_name": doc.metadata.get("file_name", ""),
|
||||
}
|
||||
if "creation_date" in doc.metadata:
|
||||
doc_metadata["creation_date"] = doc.metadata["creation_date"]
|
||||
if "last_modified_date" in doc.metadata:
|
||||
doc_metadata["last_modified_date"] = doc.metadata["last_modified_date"]
|
||||
|
||||
try:
|
||||
nodes = node_parser.get_nodes_from_documents([doc])
|
||||
if nodes:
|
||||
all_texts.extend(node.get_content() for node in nodes)
|
||||
for node in nodes:
|
||||
result.append({"text": node.get_content(), "metadata": doc_metadata})
|
||||
except Exception as e:
|
||||
logger.error(f"Traditional chunking failed for document: {e}")
|
||||
content = doc.get_content()
|
||||
if content and content.strip():
|
||||
all_texts.append(content.strip())
|
||||
result.append({"text": content.strip(), "metadata": doc_metadata})
|
||||
|
||||
return all_texts
|
||||
return result
|
||||
|
||||
|
||||
def _traditional_chunks_as_dicts(
|
||||
documents, chunk_size: int = 256, chunk_overlap: int = 128
|
||||
) -> list[dict[str, Any]]:
|
||||
"""Helper: Traditional chunking that returns dict format for consistency.
|
||||
|
||||
This is now just an alias for create_traditional_chunks for backwards compatibility.
|
||||
"""
|
||||
return create_traditional_chunks(documents, chunk_size, chunk_overlap)
|
||||
|
||||
|
||||
def create_text_chunks(
|
||||
@@ -295,8 +353,12 @@ def create_text_chunks(
|
||||
ast_chunk_overlap: int = 64,
|
||||
code_file_extensions: Optional[list[str]] = None,
|
||||
ast_fallback_traditional: bool = True,
|
||||
) -> list[str]:
|
||||
"""Create text chunks from documents with optional AST support for code files."""
|
||||
) -> list[dict[str, Any]]:
|
||||
"""Create text chunks from documents with optional AST support for code files.
|
||||
|
||||
Returns:
|
||||
List of dicts with {"text": str, "metadata": dict}
|
||||
"""
|
||||
if not documents:
|
||||
logger.warning("No documents provided for chunking")
|
||||
return []
|
||||
@@ -331,24 +393,17 @@ def create_text_chunks(
|
||||
logger.error(f"AST chunking failed: {e}")
|
||||
if ast_fallback_traditional:
|
||||
all_chunks.extend(
|
||||
create_traditional_chunks(code_docs, chunk_size, chunk_overlap)
|
||||
_traditional_chunks_as_dicts(code_docs, chunk_size, chunk_overlap)
|
||||
)
|
||||
else:
|
||||
raise
|
||||
if text_docs:
|
||||
all_chunks.extend(create_traditional_chunks(text_docs, chunk_size, chunk_overlap))
|
||||
all_chunks.extend(_traditional_chunks_as_dicts(text_docs, chunk_size, chunk_overlap))
|
||||
else:
|
||||
all_chunks = create_traditional_chunks(documents, chunk_size, chunk_overlap)
|
||||
all_chunks = _traditional_chunks_as_dicts(documents, chunk_size, chunk_overlap)
|
||||
|
||||
logger.info(f"Total chunks created: {len(all_chunks)}")
|
||||
|
||||
# Validate chunk token limits (default to 512 for safety)
|
||||
# This provides a safety net for embedding models with token limits
|
||||
validated_chunks, num_truncated = validate_chunk_token_limits(all_chunks, max_tokens=512)
|
||||
|
||||
if num_truncated > 0:
|
||||
logger.info(
|
||||
f"Post-chunking validation: {num_truncated} chunks were truncated to fit 512 token limit"
|
||||
)
|
||||
|
||||
return validated_chunks
|
||||
# Note: Token truncation is now handled at embedding time with dynamic model limits
|
||||
# See get_model_token_limit() and truncate_to_token_limit() in embedding_compute.py
|
||||
return all_chunks
|
||||
|
||||
@@ -1279,13 +1279,8 @@ Examples:
|
||||
ast_fallback_traditional=getattr(args, "ast_fallback_traditional", True),
|
||||
)
|
||||
|
||||
# Note: AST chunking currently returns plain text chunks without metadata
|
||||
# We preserve basic file info by associating chunks with their source documents
|
||||
# For better metadata preservation, documents list order should be maintained
|
||||
for chunk_text in chunk_texts:
|
||||
# TODO: Enhance create_text_chunks to return metadata alongside text
|
||||
# For now, we store chunks with empty metadata
|
||||
all_texts.append({"text": chunk_text, "metadata": {}})
|
||||
# create_text_chunks now returns list[dict] with metadata preserved
|
||||
all_texts.extend(chunk_texts)
|
||||
|
||||
except ImportError as e:
|
||||
print(
|
||||
|
||||
@@ -10,72 +10,63 @@ import time
|
||||
from typing import Any, Optional
|
||||
|
||||
import numpy as np
|
||||
import tiktoken
|
||||
import torch
|
||||
|
||||
from .settings import resolve_ollama_host, resolve_openai_api_key, resolve_openai_base_url
|
||||
|
||||
# Set up logger with proper level
|
||||
logger = logging.getLogger(__name__)
|
||||
LOG_LEVEL = os.getenv("LEANN_LOG_LEVEL", "WARNING").upper()
|
||||
log_level = getattr(logging, LOG_LEVEL, logging.WARNING)
|
||||
logger.setLevel(log_level)
|
||||
|
||||
def truncate_to_token_limit(texts: list[str], max_tokens: int = 512) -> list[str]:
|
||||
"""
|
||||
Truncate texts to token limit using tiktoken or conservative character truncation.
|
||||
|
||||
Args:
|
||||
texts: List of texts to truncate
|
||||
max_tokens: Maximum tokens allowed per text
|
||||
|
||||
Returns:
|
||||
List of truncated texts that should fit within token limit
|
||||
"""
|
||||
try:
|
||||
import tiktoken
|
||||
|
||||
encoder = tiktoken.get_encoding("cl100k_base")
|
||||
truncated = []
|
||||
|
||||
for text in texts:
|
||||
tokens = encoder.encode(text)
|
||||
if len(tokens) > max_tokens:
|
||||
# Truncate to max_tokens and decode back to text
|
||||
truncated_tokens = tokens[:max_tokens]
|
||||
truncated_text = encoder.decode(truncated_tokens)
|
||||
truncated.append(truncated_text)
|
||||
logger.warning(
|
||||
f"Truncated text from {len(tokens)} to {max_tokens} tokens "
|
||||
f"(from {len(text)} to {len(truncated_text)} characters)"
|
||||
)
|
||||
else:
|
||||
truncated.append(text)
|
||||
return truncated
|
||||
|
||||
except ImportError:
|
||||
# Fallback: Conservative character truncation
|
||||
# Assume worst case: 1.5 tokens per character for code content
|
||||
char_limit = int(max_tokens / 1.5)
|
||||
truncated = []
|
||||
|
||||
for text in texts:
|
||||
if len(text) > char_limit:
|
||||
truncated_text = text[:char_limit]
|
||||
truncated.append(truncated_text)
|
||||
logger.warning(
|
||||
f"Truncated text from {len(text)} to {char_limit} characters "
|
||||
f"(conservative estimate for {max_tokens} tokens)"
|
||||
)
|
||||
else:
|
||||
truncated.append(text)
|
||||
return truncated
|
||||
# Token limit registry for embedding models
|
||||
# Used as fallback when dynamic discovery fails (e.g., LM Studio, OpenAI)
|
||||
# Ollama models use dynamic discovery via /api/show
|
||||
EMBEDDING_MODEL_LIMITS = {
|
||||
# Nomic models (common across servers)
|
||||
"nomic-embed-text": 2048, # Corrected from 512 - verified via /api/show
|
||||
"nomic-embed-text-v1.5": 2048,
|
||||
"nomic-embed-text-v2": 512,
|
||||
# Other embedding models
|
||||
"mxbai-embed-large": 512,
|
||||
"all-minilm": 512,
|
||||
"bge-m3": 8192,
|
||||
"snowflake-arctic-embed": 512,
|
||||
# OpenAI models
|
||||
"text-embedding-3-small": 8192,
|
||||
"text-embedding-3-large": 8192,
|
||||
"text-embedding-ada-002": 8192,
|
||||
}
|
||||
|
||||
|
||||
def get_model_token_limit(model_name: str) -> int:
|
||||
def get_model_token_limit(
|
||||
model_name: str,
|
||||
base_url: Optional[str] = None,
|
||||
default: int = 2048,
|
||||
) -> int:
|
||||
"""
|
||||
Get token limit for a given embedding model.
|
||||
Uses hybrid approach: dynamic discovery for Ollama, registry fallback for others.
|
||||
|
||||
Args:
|
||||
model_name: Name of the embedding model
|
||||
base_url: Base URL of the embedding server (for dynamic discovery)
|
||||
default: Default token limit if model not found
|
||||
|
||||
Returns:
|
||||
Token limit for the model, defaults to 512 if unknown
|
||||
Token limit for the model in tokens
|
||||
"""
|
||||
# Try Ollama dynamic discovery if base_url provided
|
||||
if base_url:
|
||||
# Detect Ollama servers by port or "ollama" in URL
|
||||
if "11434" in base_url or "ollama" in base_url.lower():
|
||||
limit = _query_ollama_context_limit(model_name, base_url)
|
||||
if limit:
|
||||
return limit
|
||||
|
||||
# Fallback to known model registry with version handling (from PR #154)
|
||||
# Handle versioned model names (e.g., "nomic-embed-text:latest" -> "nomic-embed-text")
|
||||
base_model_name = model_name.split(":")[0]
|
||||
|
||||
@@ -92,31 +83,111 @@ def get_model_token_limit(model_name: str) -> int:
|
||||
if known_model in base_model_name or base_model_name in known_model:
|
||||
return limit
|
||||
|
||||
# Default to conservative 512 token limit
|
||||
logger.warning(f"Unknown model '{model_name}', using default 512 token limit")
|
||||
return 512
|
||||
# Default fallback
|
||||
logger.warning(f"Unknown model '{model_name}', using default {default} token limit")
|
||||
return default
|
||||
|
||||
|
||||
# Set up logger with proper level
|
||||
logger = logging.getLogger(__name__)
|
||||
LOG_LEVEL = os.getenv("LEANN_LOG_LEVEL", "WARNING").upper()
|
||||
log_level = getattr(logging, LOG_LEVEL, logging.WARNING)
|
||||
logger.setLevel(log_level)
|
||||
def truncate_to_token_limit(texts: list[str], token_limit: int) -> list[str]:
|
||||
"""
|
||||
Truncate texts to fit within token limit using tiktoken.
|
||||
|
||||
Args:
|
||||
texts: List of text strings to truncate
|
||||
token_limit: Maximum number of tokens allowed
|
||||
|
||||
Returns:
|
||||
List of truncated texts (same length as input)
|
||||
"""
|
||||
if not texts:
|
||||
return []
|
||||
|
||||
# Use tiktoken with cl100k_base encoding
|
||||
enc = tiktoken.get_encoding("cl100k_base")
|
||||
|
||||
truncated_texts = []
|
||||
truncation_count = 0
|
||||
total_tokens_removed = 0
|
||||
max_original_length = 0
|
||||
|
||||
for i, text in enumerate(texts):
|
||||
tokens = enc.encode(text)
|
||||
original_length = len(tokens)
|
||||
|
||||
if original_length <= token_limit:
|
||||
# Text is within limit, keep as is
|
||||
truncated_texts.append(text)
|
||||
else:
|
||||
# Truncate to token_limit
|
||||
truncated_tokens = tokens[:token_limit]
|
||||
truncated_text = enc.decode(truncated_tokens)
|
||||
truncated_texts.append(truncated_text)
|
||||
|
||||
# Track truncation statistics
|
||||
truncation_count += 1
|
||||
tokens_removed = original_length - token_limit
|
||||
total_tokens_removed += tokens_removed
|
||||
max_original_length = max(max_original_length, original_length)
|
||||
|
||||
# Log individual truncation at WARNING level (first few only)
|
||||
if truncation_count <= 3:
|
||||
logger.warning(
|
||||
f"Text {i + 1} truncated: {original_length} → {token_limit} tokens "
|
||||
f"({tokens_removed} tokens removed)"
|
||||
)
|
||||
elif truncation_count == 4:
|
||||
logger.warning("Further truncation warnings suppressed...")
|
||||
|
||||
# Log summary at INFO level
|
||||
if truncation_count > 0:
|
||||
logger.warning(
|
||||
f"Truncation summary: {truncation_count}/{len(texts)} texts truncated "
|
||||
f"(removed {total_tokens_removed} tokens total, longest was {max_original_length} tokens)"
|
||||
)
|
||||
else:
|
||||
logger.debug(
|
||||
f"No truncation needed - all {len(texts)} texts within {token_limit} token limit"
|
||||
)
|
||||
|
||||
return truncated_texts
|
||||
|
||||
|
||||
def _query_ollama_context_limit(model_name: str, base_url: str) -> Optional[int]:
|
||||
"""
|
||||
Query Ollama /api/show for model context limit.
|
||||
|
||||
Args:
|
||||
model_name: Name of the Ollama model
|
||||
base_url: Base URL of the Ollama server
|
||||
|
||||
Returns:
|
||||
Context limit in tokens if found, None otherwise
|
||||
"""
|
||||
try:
|
||||
import requests
|
||||
|
||||
response = requests.post(
|
||||
f"{base_url}/api/show",
|
||||
json={"name": model_name},
|
||||
timeout=5,
|
||||
)
|
||||
if response.status_code == 200:
|
||||
data = response.json()
|
||||
if "model_info" in data:
|
||||
# Look for *.context_length in model_info
|
||||
for key, value in data["model_info"].items():
|
||||
if "context_length" in key and isinstance(value, int):
|
||||
logger.info(f"Detected {model_name} context limit: {value} tokens")
|
||||
return value
|
||||
except Exception as e:
|
||||
logger.debug(f"Failed to query Ollama context limit: {e}")
|
||||
|
||||
return None
|
||||
|
||||
|
||||
# Global model cache to avoid repeated loading
|
||||
_model_cache: dict[str, Any] = {}
|
||||
|
||||
# Known embedding model token limits
|
||||
EMBEDDING_MODEL_LIMITS = {
|
||||
"nomic-embed-text": 512,
|
||||
"nomic-embed-text-v2": 512,
|
||||
"mxbai-embed-large": 512,
|
||||
"all-minilm": 512,
|
||||
"bge-m3": 8192,
|
||||
"snowflake-arctic-embed": 512,
|
||||
# Add more models as needed
|
||||
}
|
||||
|
||||
|
||||
def compute_embeddings(
|
||||
texts: list[str],
|
||||
@@ -814,15 +885,13 @@ def compute_embeddings_ollama(
|
||||
|
||||
logger.info(f"Using batch size: {batch_size} for true batch processing")
|
||||
|
||||
# Get model token limit and apply truncation
|
||||
token_limit = get_model_token_limit(model_name)
|
||||
# Get model token limit and apply truncation before batching
|
||||
token_limit = get_model_token_limit(model_name, base_url=resolved_host)
|
||||
logger.info(f"Model '{model_name}' token limit: {token_limit}")
|
||||
|
||||
# Apply token-aware truncation to all texts
|
||||
truncated_texts = truncate_to_token_limit(texts, token_limit)
|
||||
if len(truncated_texts) != len(texts):
|
||||
logger.error("Truncation failed - text count mismatch")
|
||||
truncated_texts = texts # Fallback to original texts
|
||||
# Apply truncation to all texts before batch processing
|
||||
# Function logs truncation details internally
|
||||
texts = truncate_to_token_limit(texts, token_limit)
|
||||
|
||||
def get_batch_embeddings(batch_texts):
|
||||
"""Get embeddings for a batch of texts using /api/embed endpoint."""
|
||||
@@ -880,12 +949,12 @@ def compute_embeddings_ollama(
|
||||
|
||||
return None, list(range(len(batch_texts)))
|
||||
|
||||
# Process truncated texts in batches
|
||||
# Process texts in batches
|
||||
all_embeddings = []
|
||||
all_failed_indices = []
|
||||
|
||||
# Setup progress bar if needed
|
||||
show_progress = is_build or len(truncated_texts) > 10
|
||||
show_progress = is_build or len(texts) > 10
|
||||
try:
|
||||
if show_progress:
|
||||
from tqdm import tqdm
|
||||
@@ -893,7 +962,7 @@ def compute_embeddings_ollama(
|
||||
show_progress = False
|
||||
|
||||
# Process batches
|
||||
num_batches = (len(truncated_texts) + batch_size - 1) // batch_size
|
||||
num_batches = (len(texts) + batch_size - 1) // batch_size
|
||||
|
||||
if show_progress:
|
||||
batch_iterator = tqdm(range(num_batches), desc="Computing Ollama embeddings (batched)")
|
||||
@@ -902,8 +971,8 @@ def compute_embeddings_ollama(
|
||||
|
||||
for batch_idx in batch_iterator:
|
||||
start_idx = batch_idx * batch_size
|
||||
end_idx = min(start_idx + batch_size, len(truncated_texts))
|
||||
batch_texts = truncated_texts[start_idx:end_idx]
|
||||
end_idx = min(start_idx + batch_size, len(texts))
|
||||
batch_texts = texts[start_idx:end_idx]
|
||||
|
||||
batch_embeddings, batch_failed = get_batch_embeddings(batch_texts)
|
||||
|
||||
@@ -918,11 +987,11 @@ def compute_embeddings_ollama(
|
||||
|
||||
# Handle failed embeddings
|
||||
if all_failed_indices:
|
||||
if len(all_failed_indices) == len(truncated_texts):
|
||||
if len(all_failed_indices) == len(texts):
|
||||
raise RuntimeError("Failed to compute any embeddings")
|
||||
|
||||
logger.warning(
|
||||
f"Failed to compute embeddings for {len(all_failed_indices)}/{len(truncated_texts)} texts"
|
||||
f"Failed to compute embeddings for {len(all_failed_indices)}/{len(texts)} texts"
|
||||
)
|
||||
|
||||
# Use zero embeddings as fallback for failed ones
|
||||
|
||||
@@ -57,6 +57,8 @@ dependencies = [
|
||||
"tree-sitter-c-sharp>=0.20.0",
|
||||
"tree-sitter-typescript>=0.20.0",
|
||||
"torchvision>=0.23.0",
|
||||
"einops",
|
||||
"seaborn",
|
||||
]
|
||||
|
||||
[project.optional-dependencies]
|
||||
|
||||
@@ -8,7 +8,7 @@ import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch
|
||||
from unittest.mock import Mock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
@@ -116,8 +116,10 @@ class TestChunkingFunctions:
|
||||
chunks = create_traditional_chunks(docs, chunk_size=50, chunk_overlap=10)
|
||||
|
||||
assert len(chunks) > 0
|
||||
assert all(isinstance(chunk, str) for chunk in chunks)
|
||||
assert all(len(chunk.strip()) > 0 for chunk in chunks)
|
||||
# Traditional chunks now return dict format for consistency
|
||||
assert all(isinstance(chunk, dict) for chunk in chunks)
|
||||
assert all("text" in chunk and "metadata" in chunk for chunk in chunks)
|
||||
assert all(len(chunk["text"].strip()) > 0 for chunk in chunks)
|
||||
|
||||
def test_create_traditional_chunks_empty_docs(self):
|
||||
"""Test traditional chunking with empty documents."""
|
||||
@@ -158,11 +160,22 @@ class Calculator:
|
||||
|
||||
# Should have multiple chunks due to different functions/classes
|
||||
assert len(chunks) > 0
|
||||
assert all(isinstance(chunk, str) for chunk in chunks)
|
||||
assert all(len(chunk.strip()) > 0 for chunk in chunks)
|
||||
# R3: Expect dict format with "text" and "metadata" keys
|
||||
assert all(isinstance(chunk, dict) for chunk in chunks), "All chunks should be dicts"
|
||||
assert all("text" in chunk and "metadata" in chunk for chunk in chunks), (
|
||||
"Each chunk should have 'text' and 'metadata' keys"
|
||||
)
|
||||
assert all(len(chunk["text"].strip()) > 0 for chunk in chunks), (
|
||||
"Each chunk text should be non-empty"
|
||||
)
|
||||
|
||||
# Check metadata is present
|
||||
assert all("file_path" in chunk["metadata"] for chunk in chunks), (
|
||||
"Each chunk should have file_path metadata"
|
||||
)
|
||||
|
||||
# Check that code structure is somewhat preserved
|
||||
combined_content = " ".join(chunks)
|
||||
combined_content = " ".join([c["text"] for c in chunks])
|
||||
assert "def hello_world" in combined_content
|
||||
assert "class Calculator" in combined_content
|
||||
|
||||
@@ -194,7 +207,11 @@ class Calculator:
|
||||
chunks = create_text_chunks(docs, use_ast_chunking=False, chunk_size=50, chunk_overlap=10)
|
||||
|
||||
assert len(chunks) > 0
|
||||
assert all(isinstance(chunk, str) for chunk in chunks)
|
||||
# R3: Traditional chunking should also return dict format for consistency
|
||||
assert all(isinstance(chunk, dict) for chunk in chunks), "All chunks should be dicts"
|
||||
assert all("text" in chunk and "metadata" in chunk for chunk in chunks), (
|
||||
"Each chunk should have 'text' and 'metadata' keys"
|
||||
)
|
||||
|
||||
def test_create_text_chunks_ast_mode(self):
|
||||
"""Test text chunking in AST mode."""
|
||||
@@ -213,7 +230,11 @@ class Calculator:
|
||||
)
|
||||
|
||||
assert len(chunks) > 0
|
||||
assert all(isinstance(chunk, str) for chunk in chunks)
|
||||
# R3: AST mode should also return dict format
|
||||
assert all(isinstance(chunk, dict) for chunk in chunks), "All chunks should be dicts"
|
||||
assert all("text" in chunk and "metadata" in chunk for chunk in chunks), (
|
||||
"Each chunk should have 'text' and 'metadata' keys"
|
||||
)
|
||||
|
||||
def test_create_text_chunks_custom_extensions(self):
|
||||
"""Test text chunking with custom code file extensions."""
|
||||
@@ -353,6 +374,552 @@ class MathUtils:
|
||||
pytest.skip("Test timed out - likely due to model download in CI")
|
||||
|
||||
|
||||
class TestASTContentExtraction:
|
||||
"""Test AST content extraction bug fix.
|
||||
|
||||
These tests verify that astchunk's dict format with 'content' key is handled correctly,
|
||||
and that the extraction logic doesn't fall through to stringifying entire dicts.
|
||||
"""
|
||||
|
||||
def test_extract_content_from_astchunk_dict(self):
|
||||
"""Test that astchunk dict format with 'content' key is handled correctly.
|
||||
|
||||
Bug: Current code checks for chunk["text"] but astchunk returns chunk["content"].
|
||||
This causes fallthrough to str(chunk), stringifying the entire dict.
|
||||
|
||||
This test will FAIL until the bug is fixed because:
|
||||
- Current code will stringify the dict: "{'content': '...', 'metadata': {...}}"
|
||||
- Fixed code should extract just the content value
|
||||
"""
|
||||
# Mock the ASTChunkBuilder class
|
||||
mock_builder = Mock()
|
||||
|
||||
# Astchunk returns this format
|
||||
astchunk_format_chunk = {
|
||||
"content": "def hello():\n print('world')",
|
||||
"metadata": {
|
||||
"filepath": "test.py",
|
||||
"line_count": 2,
|
||||
"start_line_no": 0,
|
||||
"end_line_no": 1,
|
||||
"node_count": 1,
|
||||
},
|
||||
}
|
||||
mock_builder.chunkify.return_value = [astchunk_format_chunk]
|
||||
|
||||
# Create mock document
|
||||
doc = MockDocument(
|
||||
"def hello():\n print('world')", "/test/test.py", {"language": "python"}
|
||||
)
|
||||
|
||||
# Mock the astchunk module and its ASTChunkBuilder class
|
||||
mock_astchunk = Mock()
|
||||
mock_astchunk.ASTChunkBuilder = Mock(return_value=mock_builder)
|
||||
|
||||
# Patch sys.modules to inject our mock before the import
|
||||
with patch.dict("sys.modules", {"astchunk": mock_astchunk}):
|
||||
# Call create_ast_chunks
|
||||
chunks = create_ast_chunks([doc])
|
||||
|
||||
# R3: Should return dict format with proper metadata
|
||||
assert len(chunks) > 0, "Should return at least one chunk"
|
||||
|
||||
# R3: Each chunk should be a dict
|
||||
chunk = chunks[0]
|
||||
assert isinstance(chunk, dict), "Chunk should be a dict"
|
||||
assert "text" in chunk, "Chunk should have 'text' key"
|
||||
assert "metadata" in chunk, "Chunk should have 'metadata' key"
|
||||
|
||||
chunk_text = chunk["text"]
|
||||
|
||||
# CRITICAL: Should NOT contain stringified dict markers in the text field
|
||||
# These assertions will FAIL with current buggy code
|
||||
assert "'content':" not in chunk_text, (
|
||||
f"Chunk text contains stringified dict - extraction failed! Got: {chunk_text[:100]}..."
|
||||
)
|
||||
assert "'metadata':" not in chunk_text, (
|
||||
"Chunk text contains stringified metadata - extraction failed! "
|
||||
f"Got: {chunk_text[:100]}..."
|
||||
)
|
||||
assert "{" not in chunk_text or "def hello" in chunk_text.split("{")[0], (
|
||||
"Chunk text appears to be a stringified dict"
|
||||
)
|
||||
|
||||
# Should contain actual content
|
||||
assert "def hello()" in chunk_text, "Should extract actual code content"
|
||||
assert "print('world')" in chunk_text, "Should extract complete code content"
|
||||
|
||||
# R3: Should preserve astchunk metadata
|
||||
assert "filepath" in chunk["metadata"] or "file_path" in chunk["metadata"], (
|
||||
"Should preserve file path metadata"
|
||||
)
|
||||
|
||||
def test_extract_text_key_fallback(self):
|
||||
"""Test that 'text' key still works for backward compatibility.
|
||||
|
||||
Some chunks might use 'text' instead of 'content' - ensure backward compatibility.
|
||||
This test should PASS even with current code.
|
||||
"""
|
||||
mock_builder = Mock()
|
||||
|
||||
# Some chunks might use "text" key
|
||||
text_key_chunk = {"text": "def legacy_function():\n return True"}
|
||||
mock_builder.chunkify.return_value = [text_key_chunk]
|
||||
|
||||
# Create mock document
|
||||
doc = MockDocument(
|
||||
"def legacy_function():\n return True", "/test/legacy.py", {"language": "python"}
|
||||
)
|
||||
|
||||
# Mock the astchunk module
|
||||
mock_astchunk = Mock()
|
||||
mock_astchunk.ASTChunkBuilder = Mock(return_value=mock_builder)
|
||||
|
||||
with patch.dict("sys.modules", {"astchunk": mock_astchunk}):
|
||||
# Call create_ast_chunks
|
||||
chunks = create_ast_chunks([doc])
|
||||
|
||||
# R3: Should extract text correctly as dict format
|
||||
assert len(chunks) > 0
|
||||
chunk = chunks[0]
|
||||
assert isinstance(chunk, dict), "Chunk should be a dict"
|
||||
assert "text" in chunk, "Chunk should have 'text' key"
|
||||
|
||||
chunk_text = chunk["text"]
|
||||
|
||||
# Should NOT be stringified
|
||||
assert "'text':" not in chunk_text, "Should not stringify dict with 'text' key"
|
||||
|
||||
# Should contain actual content
|
||||
assert "def legacy_function()" in chunk_text
|
||||
assert "return True" in chunk_text
|
||||
|
||||
def test_handles_string_chunks(self):
|
||||
"""Test that plain string chunks still work.
|
||||
|
||||
Some chunkers might return plain strings - verify these are preserved.
|
||||
This test should PASS with current code.
|
||||
"""
|
||||
mock_builder = Mock()
|
||||
|
||||
# Plain string chunk
|
||||
plain_string_chunk = "def simple_function():\n pass"
|
||||
mock_builder.chunkify.return_value = [plain_string_chunk]
|
||||
|
||||
# Create mock document
|
||||
doc = MockDocument(
|
||||
"def simple_function():\n pass", "/test/simple.py", {"language": "python"}
|
||||
)
|
||||
|
||||
# Mock the astchunk module
|
||||
mock_astchunk = Mock()
|
||||
mock_astchunk.ASTChunkBuilder = Mock(return_value=mock_builder)
|
||||
|
||||
with patch.dict("sys.modules", {"astchunk": mock_astchunk}):
|
||||
# Call create_ast_chunks
|
||||
chunks = create_ast_chunks([doc])
|
||||
|
||||
# R3: Should wrap string in dict format
|
||||
assert len(chunks) > 0
|
||||
chunk = chunks[0]
|
||||
assert isinstance(chunk, dict), "Even string chunks should be wrapped in dict"
|
||||
assert "text" in chunk, "Chunk should have 'text' key"
|
||||
|
||||
chunk_text = chunk["text"]
|
||||
|
||||
assert chunk_text == plain_string_chunk.strip(), (
|
||||
"Should preserve plain string chunk content"
|
||||
)
|
||||
assert "def simple_function()" in chunk_text
|
||||
assert "pass" in chunk_text
|
||||
|
||||
def test_multiple_chunks_with_mixed_formats(self):
|
||||
"""Test handling of multiple chunks with different formats.
|
||||
|
||||
Real-world scenario: astchunk might return a mix of formats.
|
||||
This test will FAIL if any chunk with 'content' key gets stringified.
|
||||
"""
|
||||
mock_builder = Mock()
|
||||
|
||||
# Mix of formats
|
||||
mixed_chunks = [
|
||||
{"content": "def first():\n return 1", "metadata": {"line_count": 2}},
|
||||
"def second():\n return 2", # Plain string
|
||||
{"text": "def third():\n return 3"}, # Old format
|
||||
{"content": "class MyClass:\n pass", "metadata": {"node_count": 1}},
|
||||
]
|
||||
mock_builder.chunkify.return_value = mixed_chunks
|
||||
|
||||
# Create mock document
|
||||
code = "def first():\n return 1\n\ndef second():\n return 2\n\ndef third():\n return 3\n\nclass MyClass:\n pass"
|
||||
doc = MockDocument(code, "/test/mixed.py", {"language": "python"})
|
||||
|
||||
# Mock the astchunk module
|
||||
mock_astchunk = Mock()
|
||||
mock_astchunk.ASTChunkBuilder = Mock(return_value=mock_builder)
|
||||
|
||||
with patch.dict("sys.modules", {"astchunk": mock_astchunk}):
|
||||
# Call create_ast_chunks
|
||||
chunks = create_ast_chunks([doc])
|
||||
|
||||
# R3: Should extract all chunks correctly as dicts
|
||||
assert len(chunks) == 4, "Should extract all 4 chunks"
|
||||
|
||||
# Check each chunk
|
||||
for i, chunk in enumerate(chunks):
|
||||
assert isinstance(chunk, dict), f"Chunk {i} should be a dict"
|
||||
assert "text" in chunk, f"Chunk {i} should have 'text' key"
|
||||
assert "metadata" in chunk, f"Chunk {i} should have 'metadata' key"
|
||||
|
||||
chunk_text = chunk["text"]
|
||||
# None should be stringified dicts
|
||||
assert "'content':" not in chunk_text, f"Chunk {i} text is stringified (has 'content':)"
|
||||
assert "'metadata':" not in chunk_text, (
|
||||
f"Chunk {i} text is stringified (has 'metadata':)"
|
||||
)
|
||||
assert "'text':" not in chunk_text, f"Chunk {i} text is stringified (has 'text':)"
|
||||
|
||||
# Verify actual content is present
|
||||
combined = "\n".join([c["text"] for c in chunks])
|
||||
assert "def first()" in combined
|
||||
assert "def second()" in combined
|
||||
assert "def third()" in combined
|
||||
assert "class MyClass:" in combined
|
||||
|
||||
def test_empty_content_value_handling(self):
|
||||
"""Test handling of chunks with empty content values.
|
||||
|
||||
Edge case: chunk has 'content' key but value is empty.
|
||||
Should skip these chunks, not stringify them.
|
||||
"""
|
||||
mock_builder = Mock()
|
||||
|
||||
chunks_with_empty = [
|
||||
{"content": "", "metadata": {"line_count": 0}}, # Empty content
|
||||
{"content": " ", "metadata": {"line_count": 1}}, # Whitespace only
|
||||
{"content": "def valid():\n return True", "metadata": {"line_count": 2}}, # Valid
|
||||
]
|
||||
mock_builder.chunkify.return_value = chunks_with_empty
|
||||
|
||||
doc = MockDocument(
|
||||
"def valid():\n return True", "/test/empty.py", {"language": "python"}
|
||||
)
|
||||
|
||||
# Mock the astchunk module
|
||||
mock_astchunk = Mock()
|
||||
mock_astchunk.ASTChunkBuilder = Mock(return_value=mock_builder)
|
||||
|
||||
with patch.dict("sys.modules", {"astchunk": mock_astchunk}):
|
||||
chunks = create_ast_chunks([doc])
|
||||
|
||||
# R3: Should only have the valid chunk (empty ones filtered out)
|
||||
assert len(chunks) == 1, "Should filter out empty content chunks"
|
||||
|
||||
chunk = chunks[0]
|
||||
assert isinstance(chunk, dict), "Chunk should be a dict"
|
||||
assert "text" in chunk, "Chunk should have 'text' key"
|
||||
assert "def valid()" in chunk["text"]
|
||||
|
||||
# Should not have stringified the empty dict
|
||||
assert "'content': ''" not in chunk["text"]
|
||||
|
||||
|
||||
class TestASTMetadataPreservation:
|
||||
"""Test metadata preservation in AST chunk dictionaries.
|
||||
|
||||
R3: These tests define the contract for metadata preservation when returning
|
||||
chunk dictionaries instead of plain strings. Each chunk dict should have:
|
||||
- "text": str - the actual chunk content
|
||||
- "metadata": dict - all metadata from document AND astchunk
|
||||
|
||||
These tests will FAIL until G3 implementation changes return type to list[dict].
|
||||
"""
|
||||
|
||||
def test_ast_chunks_preserve_file_metadata(self):
|
||||
"""Test that document metadata is preserved in chunk metadata.
|
||||
|
||||
This test verifies that all document-level metadata (file_path, file_name,
|
||||
creation_date, last_modified_date) is included in each chunk's metadata dict.
|
||||
|
||||
This will FAIL because current code returns list[str], not list[dict].
|
||||
"""
|
||||
# Create mock document with rich metadata
|
||||
python_code = '''
|
||||
def calculate_sum(numbers):
|
||||
"""Calculate sum of numbers."""
|
||||
return sum(numbers)
|
||||
|
||||
class DataProcessor:
|
||||
"""Process data records."""
|
||||
|
||||
def process(self, data):
|
||||
return [x * 2 for x in data]
|
||||
'''
|
||||
doc = MockDocument(
|
||||
python_code,
|
||||
file_path="/project/src/utils.py",
|
||||
metadata={
|
||||
"language": "python",
|
||||
"file_path": "/project/src/utils.py",
|
||||
"file_name": "utils.py",
|
||||
"creation_date": "2024-01-15T10:30:00",
|
||||
"last_modified_date": "2024-10-31T15:45:00",
|
||||
},
|
||||
)
|
||||
|
||||
# Mock astchunk to return chunks with metadata
|
||||
mock_builder = Mock()
|
||||
astchunk_chunks = [
|
||||
{
|
||||
"content": "def calculate_sum(numbers):\n return sum(numbers)",
|
||||
"metadata": {
|
||||
"filepath": "/project/src/utils.py",
|
||||
"line_count": 2,
|
||||
"start_line_no": 1,
|
||||
"end_line_no": 2,
|
||||
"node_count": 1,
|
||||
},
|
||||
},
|
||||
{
|
||||
"content": "class DataProcessor:\n def process(self, data):\n return [x * 2 for x in data]",
|
||||
"metadata": {
|
||||
"filepath": "/project/src/utils.py",
|
||||
"line_count": 3,
|
||||
"start_line_no": 5,
|
||||
"end_line_no": 7,
|
||||
"node_count": 2,
|
||||
},
|
||||
},
|
||||
]
|
||||
mock_builder.chunkify.return_value = astchunk_chunks
|
||||
|
||||
mock_astchunk = Mock()
|
||||
mock_astchunk.ASTChunkBuilder = Mock(return_value=mock_builder)
|
||||
|
||||
with patch.dict("sys.modules", {"astchunk": mock_astchunk}):
|
||||
chunks = create_ast_chunks([doc])
|
||||
|
||||
# CRITICAL: These assertions will FAIL with current list[str] return type
|
||||
assert len(chunks) == 2, "Should return 2 chunks"
|
||||
|
||||
for i, chunk in enumerate(chunks):
|
||||
# Structure assertions - WILL FAIL: current code returns strings
|
||||
assert isinstance(chunk, dict), f"Chunk {i} should be dict, got {type(chunk)}"
|
||||
assert "text" in chunk, f"Chunk {i} must have 'text' key"
|
||||
assert "metadata" in chunk, f"Chunk {i} must have 'metadata' key"
|
||||
assert isinstance(chunk["metadata"], dict), f"Chunk {i} metadata should be dict"
|
||||
|
||||
# Document metadata preservation - WILL FAIL
|
||||
metadata = chunk["metadata"]
|
||||
assert "file_path" in metadata, f"Chunk {i} should preserve file_path"
|
||||
assert metadata["file_path"] == "/project/src/utils.py", (
|
||||
f"Chunk {i} file_path incorrect"
|
||||
)
|
||||
|
||||
assert "file_name" in metadata, f"Chunk {i} should preserve file_name"
|
||||
assert metadata["file_name"] == "utils.py", f"Chunk {i} file_name incorrect"
|
||||
|
||||
assert "creation_date" in metadata, f"Chunk {i} should preserve creation_date"
|
||||
assert metadata["creation_date"] == "2024-01-15T10:30:00", (
|
||||
f"Chunk {i} creation_date incorrect"
|
||||
)
|
||||
|
||||
assert "last_modified_date" in metadata, f"Chunk {i} should preserve last_modified_date"
|
||||
assert metadata["last_modified_date"] == "2024-10-31T15:45:00", (
|
||||
f"Chunk {i} last_modified_date incorrect"
|
||||
)
|
||||
|
||||
# Verify metadata is consistent across chunks from same document
|
||||
assert chunks[0]["metadata"]["file_path"] == chunks[1]["metadata"]["file_path"], (
|
||||
"All chunks from same document should have same file_path"
|
||||
)
|
||||
|
||||
# Verify text content is present and not stringified
|
||||
assert "def calculate_sum" in chunks[0]["text"]
|
||||
assert "class DataProcessor" in chunks[1]["text"]
|
||||
|
||||
def test_ast_chunks_include_astchunk_metadata(self):
|
||||
"""Test that astchunk-specific metadata is merged into chunk metadata.
|
||||
|
||||
This test verifies that astchunk's metadata (line_count, start_line_no,
|
||||
end_line_no, node_count) is merged with document metadata.
|
||||
|
||||
This will FAIL because current code returns list[str], not list[dict].
|
||||
"""
|
||||
python_code = '''
|
||||
def function_one():
|
||||
"""First function."""
|
||||
x = 1
|
||||
y = 2
|
||||
return x + y
|
||||
|
||||
def function_two():
|
||||
"""Second function."""
|
||||
return 42
|
||||
'''
|
||||
doc = MockDocument(
|
||||
python_code,
|
||||
file_path="/test/code.py",
|
||||
metadata={
|
||||
"language": "python",
|
||||
"file_path": "/test/code.py",
|
||||
"file_name": "code.py",
|
||||
},
|
||||
)
|
||||
|
||||
# Mock astchunk with detailed metadata
|
||||
mock_builder = Mock()
|
||||
astchunk_chunks = [
|
||||
{
|
||||
"content": "def function_one():\n x = 1\n y = 2\n return x + y",
|
||||
"metadata": {
|
||||
"filepath": "/test/code.py",
|
||||
"line_count": 4,
|
||||
"start_line_no": 1,
|
||||
"end_line_no": 4,
|
||||
"node_count": 5, # function, assignments, return
|
||||
},
|
||||
},
|
||||
{
|
||||
"content": "def function_two():\n return 42",
|
||||
"metadata": {
|
||||
"filepath": "/test/code.py",
|
||||
"line_count": 2,
|
||||
"start_line_no": 7,
|
||||
"end_line_no": 8,
|
||||
"node_count": 2, # function, return
|
||||
},
|
||||
},
|
||||
]
|
||||
mock_builder.chunkify.return_value = astchunk_chunks
|
||||
|
||||
mock_astchunk = Mock()
|
||||
mock_astchunk.ASTChunkBuilder = Mock(return_value=mock_builder)
|
||||
|
||||
with patch.dict("sys.modules", {"astchunk": mock_astchunk}):
|
||||
chunks = create_ast_chunks([doc])
|
||||
|
||||
# CRITICAL: These will FAIL with current list[str] return
|
||||
assert len(chunks) == 2
|
||||
|
||||
# First chunk - function_one
|
||||
chunk1 = chunks[0]
|
||||
assert isinstance(chunk1, dict), "Chunk should be dict"
|
||||
assert "metadata" in chunk1
|
||||
|
||||
metadata1 = chunk1["metadata"]
|
||||
|
||||
# Check astchunk metadata is present
|
||||
assert "line_count" in metadata1, "Should include astchunk line_count"
|
||||
assert metadata1["line_count"] == 4, "line_count should be 4"
|
||||
|
||||
assert "start_line_no" in metadata1, "Should include astchunk start_line_no"
|
||||
assert metadata1["start_line_no"] == 1, "start_line_no should be 1"
|
||||
|
||||
assert "end_line_no" in metadata1, "Should include astchunk end_line_no"
|
||||
assert metadata1["end_line_no"] == 4, "end_line_no should be 4"
|
||||
|
||||
assert "node_count" in metadata1, "Should include astchunk node_count"
|
||||
assert metadata1["node_count"] == 5, "node_count should be 5"
|
||||
|
||||
# Second chunk - function_two
|
||||
chunk2 = chunks[1]
|
||||
metadata2 = chunk2["metadata"]
|
||||
|
||||
assert metadata2["line_count"] == 2, "line_count should be 2"
|
||||
assert metadata2["start_line_no"] == 7, "start_line_no should be 7"
|
||||
assert metadata2["end_line_no"] == 8, "end_line_no should be 8"
|
||||
assert metadata2["node_count"] == 2, "node_count should be 2"
|
||||
|
||||
# Verify document metadata is ALSO present (merged, not replaced)
|
||||
assert metadata1["file_path"] == "/test/code.py"
|
||||
assert metadata1["file_name"] == "code.py"
|
||||
assert metadata2["file_path"] == "/test/code.py"
|
||||
assert metadata2["file_name"] == "code.py"
|
||||
|
||||
# Verify text content is correct
|
||||
assert "def function_one" in chunk1["text"]
|
||||
assert "def function_two" in chunk2["text"]
|
||||
|
||||
def test_traditional_chunks_as_dicts_helper(self):
|
||||
"""Test the helper function that wraps traditional chunks as dicts.
|
||||
|
||||
This test verifies that when create_traditional_chunks is called,
|
||||
its plain string chunks are wrapped into dict format with metadata.
|
||||
|
||||
This will FAIL because the helper function _traditional_chunks_as_dicts()
|
||||
doesn't exist yet, and create_traditional_chunks returns list[str].
|
||||
"""
|
||||
# Create documents with various metadata
|
||||
docs = [
|
||||
MockDocument(
|
||||
"This is the first paragraph of text. It contains multiple sentences. "
|
||||
"This should be split into chunks based on size.",
|
||||
file_path="/docs/readme.txt",
|
||||
metadata={
|
||||
"file_path": "/docs/readme.txt",
|
||||
"file_name": "readme.txt",
|
||||
"creation_date": "2024-01-01",
|
||||
},
|
||||
),
|
||||
MockDocument(
|
||||
"Second document with different metadata. It also has content that needs chunking.",
|
||||
file_path="/docs/guide.md",
|
||||
metadata={
|
||||
"file_path": "/docs/guide.md",
|
||||
"file_name": "guide.md",
|
||||
"last_modified_date": "2024-10-31",
|
||||
},
|
||||
),
|
||||
]
|
||||
|
||||
# Call create_traditional_chunks (which should now return list[dict])
|
||||
chunks = create_traditional_chunks(docs, chunk_size=50, chunk_overlap=10)
|
||||
|
||||
# CRITICAL: Will FAIL - current code returns list[str]
|
||||
assert len(chunks) > 0, "Should return chunks"
|
||||
|
||||
for i, chunk in enumerate(chunks):
|
||||
# Structure assertions - WILL FAIL
|
||||
assert isinstance(chunk, dict), f"Chunk {i} should be dict, got {type(chunk)}"
|
||||
assert "text" in chunk, f"Chunk {i} must have 'text' key"
|
||||
assert "metadata" in chunk, f"Chunk {i} must have 'metadata' key"
|
||||
|
||||
# Text should be non-empty
|
||||
assert len(chunk["text"].strip()) > 0, f"Chunk {i} text should be non-empty"
|
||||
|
||||
# Metadata should include document info
|
||||
metadata = chunk["metadata"]
|
||||
assert "file_path" in metadata, f"Chunk {i} should have file_path in metadata"
|
||||
assert "file_name" in metadata, f"Chunk {i} should have file_name in metadata"
|
||||
|
||||
# Verify metadata tracking works correctly
|
||||
# At least one chunk should be from readme.txt
|
||||
readme_chunks = [c for c in chunks if "readme.txt" in c["metadata"]["file_name"]]
|
||||
assert len(readme_chunks) > 0, "Should have chunks from readme.txt"
|
||||
|
||||
# At least one chunk should be from guide.md
|
||||
guide_chunks = [c for c in chunks if "guide.md" in c["metadata"]["file_name"]]
|
||||
assert len(guide_chunks) > 0, "Should have chunks from guide.md"
|
||||
|
||||
# Verify creation_date is preserved for readme chunks
|
||||
for chunk in readme_chunks:
|
||||
assert chunk["metadata"].get("creation_date") == "2024-01-01", (
|
||||
"readme.txt chunks should preserve creation_date"
|
||||
)
|
||||
|
||||
# Verify last_modified_date is preserved for guide chunks
|
||||
for chunk in guide_chunks:
|
||||
assert chunk["metadata"].get("last_modified_date") == "2024-10-31", (
|
||||
"guide.md chunks should preserve last_modified_date"
|
||||
)
|
||||
|
||||
# Verify text content is present
|
||||
all_text = " ".join([c["text"] for c in chunks])
|
||||
assert "first paragraph" in all_text
|
||||
assert "Second document" in all_text
|
||||
|
||||
|
||||
class TestErrorHandling:
|
||||
"""Test error handling and edge cases."""
|
||||
|
||||
|
||||
268
tests/test_token_truncation.py
Normal file
268
tests/test_token_truncation.py
Normal file
@@ -0,0 +1,268 @@
|
||||
"""Unit tests for token-aware truncation functionality.
|
||||
|
||||
This test suite defines the contract for token truncation functions that prevent
|
||||
500 errors from Ollama when text exceeds model token limits. These tests verify:
|
||||
|
||||
1. Model token limit retrieval (known and unknown models)
|
||||
2. Text truncation behavior for single and multiple texts
|
||||
3. Token counting and truncation accuracy using tiktoken
|
||||
|
||||
All tests are written in Red Phase - they should FAIL initially because the
|
||||
implementation does not exist yet.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
import tiktoken
|
||||
from leann.embedding_compute import (
|
||||
EMBEDDING_MODEL_LIMITS,
|
||||
get_model_token_limit,
|
||||
truncate_to_token_limit,
|
||||
)
|
||||
|
||||
|
||||
class TestModelTokenLimits:
|
||||
"""Tests for retrieving model-specific token limits."""
|
||||
|
||||
def test_get_model_token_limit_known_model(self):
|
||||
"""Verify correct token limit is returned for known models.
|
||||
|
||||
Known models should return their specific token limits from
|
||||
EMBEDDING_MODEL_LIMITS dictionary.
|
||||
"""
|
||||
# Test nomic-embed-text (2048 tokens)
|
||||
limit = get_model_token_limit("nomic-embed-text")
|
||||
assert limit == 2048, "nomic-embed-text should have 2048 token limit"
|
||||
|
||||
# Test nomic-embed-text-v1.5 (2048 tokens)
|
||||
limit = get_model_token_limit("nomic-embed-text-v1.5")
|
||||
assert limit == 2048, "nomic-embed-text-v1.5 should have 2048 token limit"
|
||||
|
||||
# Test nomic-embed-text-v2 (512 tokens)
|
||||
limit = get_model_token_limit("nomic-embed-text-v2")
|
||||
assert limit == 512, "nomic-embed-text-v2 should have 512 token limit"
|
||||
|
||||
# Test OpenAI models (8192 tokens)
|
||||
limit = get_model_token_limit("text-embedding-3-small")
|
||||
assert limit == 8192, "text-embedding-3-small should have 8192 token limit"
|
||||
|
||||
def test_get_model_token_limit_unknown_model(self):
|
||||
"""Verify default token limit is returned for unknown models.
|
||||
|
||||
Unknown models should return the default limit (2048) to allow
|
||||
operation with reasonable safety margin.
|
||||
"""
|
||||
# Test with completely unknown model
|
||||
limit = get_model_token_limit("unknown-model-xyz")
|
||||
assert limit == 2048, "Unknown models should return default 2048"
|
||||
|
||||
# Test with empty string
|
||||
limit = get_model_token_limit("")
|
||||
assert limit == 2048, "Empty model name should return default 2048"
|
||||
|
||||
def test_get_model_token_limit_custom_default(self):
|
||||
"""Verify custom default can be specified for unknown models.
|
||||
|
||||
Allow callers to specify their own default token limit when
|
||||
model is not in the known models dictionary.
|
||||
"""
|
||||
limit = get_model_token_limit("unknown-model", default=4096)
|
||||
assert limit == 4096, "Should return custom default for unknown models"
|
||||
|
||||
# Known model should ignore custom default
|
||||
limit = get_model_token_limit("nomic-embed-text", default=4096)
|
||||
assert limit == 2048, "Known model should ignore custom default"
|
||||
|
||||
def test_embedding_model_limits_dictionary_exists(self):
|
||||
"""Verify EMBEDDING_MODEL_LIMITS dictionary contains expected models.
|
||||
|
||||
The dictionary should be importable and contain at least the
|
||||
known nomic models with correct token limits.
|
||||
"""
|
||||
assert isinstance(EMBEDDING_MODEL_LIMITS, dict), "Should be a dictionary"
|
||||
assert "nomic-embed-text" in EMBEDDING_MODEL_LIMITS, "Should contain nomic-embed-text"
|
||||
assert "nomic-embed-text-v1.5" in EMBEDDING_MODEL_LIMITS, (
|
||||
"Should contain nomic-embed-text-v1.5"
|
||||
)
|
||||
assert EMBEDDING_MODEL_LIMITS["nomic-embed-text"] == 2048
|
||||
assert EMBEDDING_MODEL_LIMITS["nomic-embed-text-v1.5"] == 2048
|
||||
assert EMBEDDING_MODEL_LIMITS["nomic-embed-text-v2"] == 512
|
||||
# OpenAI models
|
||||
assert EMBEDDING_MODEL_LIMITS["text-embedding-3-small"] == 8192
|
||||
|
||||
|
||||
class TestTokenTruncation:
|
||||
"""Tests for truncating texts to token limits."""
|
||||
|
||||
@pytest.fixture
|
||||
def tokenizer(self):
|
||||
"""Provide tiktoken tokenizer for token counting verification."""
|
||||
return tiktoken.get_encoding("cl100k_base")
|
||||
|
||||
def test_truncate_single_text_under_limit(self, tokenizer):
|
||||
"""Verify text under token limit remains unchanged.
|
||||
|
||||
When text is already within the token limit, it should be
|
||||
returned unchanged with no truncation.
|
||||
"""
|
||||
text = "This is a short text that is well under the token limit."
|
||||
token_count = len(tokenizer.encode(text))
|
||||
assert token_count < 100, f"Test setup: text should be short (has {token_count} tokens)"
|
||||
|
||||
# Truncate with generous limit
|
||||
result = truncate_to_token_limit([text], token_limit=512)
|
||||
|
||||
assert len(result) == 1, "Should return same number of texts"
|
||||
assert result[0] == text, "Text under limit should be unchanged"
|
||||
|
||||
def test_truncate_single_text_over_limit(self, tokenizer):
|
||||
"""Verify text over token limit is truncated correctly.
|
||||
|
||||
When text exceeds the token limit, it should be truncated to
|
||||
fit within the limit while maintaining valid token boundaries.
|
||||
"""
|
||||
# Create a text that definitely exceeds limit
|
||||
text = "word " * 200 # ~200 tokens (each "word " is typically 1-2 tokens)
|
||||
original_token_count = len(tokenizer.encode(text))
|
||||
assert original_token_count > 50, (
|
||||
f"Test setup: text should be long (has {original_token_count} tokens)"
|
||||
)
|
||||
|
||||
# Truncate to 50 tokens
|
||||
result = truncate_to_token_limit([text], token_limit=50)
|
||||
|
||||
assert len(result) == 1, "Should return same number of texts"
|
||||
assert result[0] != text, "Text over limit should be truncated"
|
||||
assert len(result[0]) < len(text), "Truncated text should be shorter"
|
||||
|
||||
# Verify truncated text is within token limit
|
||||
truncated_token_count = len(tokenizer.encode(result[0]))
|
||||
assert truncated_token_count <= 50, (
|
||||
f"Truncated text should be ≤50 tokens, got {truncated_token_count}"
|
||||
)
|
||||
|
||||
def test_truncate_multiple_texts_mixed_lengths(self, tokenizer):
|
||||
"""Verify multiple texts with mixed lengths are handled correctly.
|
||||
|
||||
When processing multiple texts:
|
||||
- Texts under limit should remain unchanged
|
||||
- Texts over limit should be truncated independently
|
||||
- Output list should maintain same order and length
|
||||
"""
|
||||
texts = [
|
||||
"Short text.", # Under limit
|
||||
"word " * 200, # Over limit
|
||||
"Another short one.", # Under limit
|
||||
"token " * 150, # Over limit
|
||||
]
|
||||
|
||||
# Verify test setup
|
||||
for i, text in enumerate(texts):
|
||||
token_count = len(tokenizer.encode(text))
|
||||
if i in [1, 3]:
|
||||
assert token_count > 50, f"Text {i} should be over limit (has {token_count} tokens)"
|
||||
else:
|
||||
assert token_count < 50, (
|
||||
f"Text {i} should be under limit (has {token_count} tokens)"
|
||||
)
|
||||
|
||||
# Truncate with 50 token limit
|
||||
result = truncate_to_token_limit(texts, token_limit=50)
|
||||
|
||||
assert len(result) == len(texts), "Should return same number of texts"
|
||||
|
||||
# Verify each text individually
|
||||
for i, (original, truncated) in enumerate(zip(texts, result)):
|
||||
token_count = len(tokenizer.encode(truncated))
|
||||
assert token_count <= 50, f"Text {i} should be ≤50 tokens, got {token_count}"
|
||||
|
||||
# Short texts should be unchanged
|
||||
if i in [0, 2]:
|
||||
assert truncated == original, f"Short text {i} should be unchanged"
|
||||
# Long texts should be truncated
|
||||
else:
|
||||
assert len(truncated) < len(original), f"Long text {i} should be truncated"
|
||||
|
||||
def test_truncate_empty_list(self):
|
||||
"""Verify empty input list returns empty output list.
|
||||
|
||||
Edge case: empty list should return empty list without errors.
|
||||
"""
|
||||
result = truncate_to_token_limit([], token_limit=512)
|
||||
assert result == [], "Empty input should return empty output"
|
||||
|
||||
def test_truncate_preserves_order(self, tokenizer):
|
||||
"""Verify truncation preserves original text order.
|
||||
|
||||
Output list should maintain the same order as input list,
|
||||
regardless of which texts were truncated.
|
||||
"""
|
||||
texts = [
|
||||
"First text " * 50, # Will be truncated
|
||||
"Second text.", # Won't be truncated
|
||||
"Third text " * 50, # Will be truncated
|
||||
]
|
||||
|
||||
result = truncate_to_token_limit(texts, token_limit=20)
|
||||
|
||||
assert len(result) == 3, "Should preserve list length"
|
||||
# Check that order is maintained by looking for distinctive words
|
||||
assert "First" in result[0], "First text should remain in first position"
|
||||
assert "Second" in result[1], "Second text should remain in second position"
|
||||
assert "Third" in result[2], "Third text should remain in third position"
|
||||
|
||||
def test_truncate_extremely_long_text(self, tokenizer):
|
||||
"""Verify extremely long texts are truncated efficiently.
|
||||
|
||||
Test with text that far exceeds token limit to ensure
|
||||
truncation handles extreme cases without performance issues.
|
||||
"""
|
||||
# Create very long text (simulate real-world scenario)
|
||||
text = "token " * 5000 # ~5000+ tokens
|
||||
original_token_count = len(tokenizer.encode(text))
|
||||
assert original_token_count > 1000, "Test setup: text should be very long"
|
||||
|
||||
# Truncate to small limit
|
||||
result = truncate_to_token_limit([text], token_limit=100)
|
||||
|
||||
assert len(result) == 1
|
||||
truncated_token_count = len(tokenizer.encode(result[0]))
|
||||
assert truncated_token_count <= 100, (
|
||||
f"Should truncate to ≤100 tokens, got {truncated_token_count}"
|
||||
)
|
||||
assert len(result[0]) < len(text) // 10, "Should significantly reduce text length"
|
||||
|
||||
def test_truncate_exact_token_limit(self, tokenizer):
|
||||
"""Verify text at exactly token limit is handled correctly.
|
||||
|
||||
Edge case: text with exactly the token limit should either
|
||||
remain unchanged or be safely truncated by 1 token.
|
||||
"""
|
||||
# Create text with approximately 50 tokens
|
||||
# We'll adjust to get exactly 50
|
||||
target_tokens = 50
|
||||
text = "word " * 50
|
||||
tokens = tokenizer.encode(text)
|
||||
|
||||
# Adjust to get exactly target_tokens
|
||||
if len(tokens) > target_tokens:
|
||||
tokens = tokens[:target_tokens]
|
||||
text = tokenizer.decode(tokens)
|
||||
elif len(tokens) < target_tokens:
|
||||
# Add more words
|
||||
while len(tokenizer.encode(text)) < target_tokens:
|
||||
text += "word "
|
||||
tokens = tokenizer.encode(text)[:target_tokens]
|
||||
text = tokenizer.decode(tokens)
|
||||
|
||||
# Verify we have exactly target_tokens
|
||||
assert len(tokenizer.encode(text)) == target_tokens, (
|
||||
"Test setup: should have exactly 50 tokens"
|
||||
)
|
||||
|
||||
result = truncate_to_token_limit([text], token_limit=target_tokens)
|
||||
|
||||
assert len(result) == 1
|
||||
result_tokens = len(tokenizer.encode(result[0]))
|
||||
assert result_tokens <= target_tokens, (
|
||||
f"Should be ≤{target_tokens} tokens, got {result_tokens}"
|
||||
)
|
||||
Reference in New Issue
Block a user