fix: format colqwen_forward.py to pass pre-commit checks

reproduce docvqa results and add debug file
reproduce docvqa results
2025-12-03 09:06:29 +00:00 · 2025-12-03 08:54:55 +00:00 · 2025-11-14 10:22:42 +00:00 · 2025-11-14 07:31:24 +00:00 · 2025-11-10 21:13:17 +00:00 · 2025-11-09 02:34:53 +00:00
14 changed files with 7414 additions and 4257 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -91,7 +91,8 @@ packages/leann-backend-diskann/third_party/DiskANN/_deps/

 *.meta.json
 *.passages.json
-
+*.npy
+*.db
 batchtest.py
 tests/__pytest_cache__/
 tests/__pycache__/
--- a/apps/chunking/init.py
+++ b/apps/chunking/init.py
@@ -12,6 +12,7 @@ from pathlib import Path
 try:
    from leann.chunking_utils import (
        CODE_EXTENSIONS,
+        _traditional_chunks_as_dicts,
        create_ast_chunks,
        create_text_chunks,
        create_traditional_chunks,
@@ -25,6 +26,7 @@ except Exception:  # pragma: no cover - best-effort fallback for dev environment
        sys.path.insert(0, str(leann_src))
        from leann.chunking_utils import (
            CODE_EXTENSIONS,
+            _traditional_chunks_as_dicts,
            create_ast_chunks,
            create_text_chunks,
            create_traditional_chunks,
@@ -36,6 +38,7 @@ except Exception:  # pragma: no cover - best-effort fallback for dev environment

 __all__ = [
    "CODE_EXTENSIONS",
+    "_traditional_chunks_as_dicts",
    "create_ast_chunks",
    "create_text_chunks",
    "create_traditional_chunks",
--- a/apps/multimodal/vision-based-pdf-multi-vector/colqwen_forward.py
+++ b/apps/multimodal/vision-based-pdf-multi-vector/colqwen_forward.py
@@ -0,0 +1,132 @@
+#!/usr/bin/env python3
+"""Simple test script to test colqwen2 forward pass with a single image."""
+
+import os
+import sys
+from pathlib import Path
+
+# Add the current directory to path to import leann_multi_vector
+sys.path.insert(0, str(Path(__file__).parent))
+
+import torch
+from leann_multi_vector import _embed_images, _ensure_repo_paths_importable, _load_colvision
+from PIL import Image
+
+# Ensure repo paths are importable
+_ensure_repo_paths_importable(__file__)
+
+# Set environment variable
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+
+def create_test_image():
+    """Return a blank white 800x600 RGB image used as a fallback input."""
+    width, height = 800, 600
+    blank = Image.new("RGB", (width, height), color="white")
+    return blank
+
+
+def load_test_image_from_file():
+    """Return the first image found under known index folders, else None."""
+    root = Path(__file__).parent / "indexes"
+    # Candidate image folders, probed in priority order.
+    candidate_dirs = (
+        root / "vidore_fastplaid" / "images",
+        root / "colvision_large.leann.images",
+        root / "colvision.leann.images",
+    )
+    suffixes = (".png", ".jpg", ".jpeg")
+
+    for folder in candidate_dirs:
+        if not folder.exists():
+            continue
+        # Extensions are tried in order; the first glob hit wins.
+        for suffix in suffixes:
+            for found in folder.glob(f"*{suffix}"):
+                print(f"Loading test image from: {found}")
+                return Image.open(found)
+
+    return None
+
+
+def main():
+    """Smoke-test ColQwen2: embed one image and print embedding statistics.
+
+    Exits with status 1 when model loading or the forward pass fails.
+    """
+    import traceback
+
+    print("=" * 60)
+    print("Testing ColQwen2 Forward Pass")
+    print("=" * 60)
+
+    # Step 1: Load or create test image
+    print("\n[Step 1] Loading test image...")
+    test_image = load_test_image_from_file()
+    if test_image is None:
+        print("No existing image found, creating a simple test image...")
+        test_image = create_test_image()
+    else:
+        print(f"✓ Loaded image: {test_image.size} ({test_image.mode})")
+
+    # Convert to RGB if needed
+    if test_image.mode != "RGB":
+        test_image = test_image.convert("RGB")
+        print(f"✓ Converted to RGB: {test_image.size}")
+
+    # Step 2: Load model
+    print("\n[Step 2] Loading ColQwen2 model...")
+    try:
+        model_name, model, processor, device_str, device, dtype = _load_colvision("colqwen2")
+        print(f"✓ Model loaded: {model_name}")
+        print(f"✓ Device: {device_str}, dtype: {dtype}")
+
+        # Print model info
+        if hasattr(model, "device"):
+            print(f"✓ Model device: {model.device}")
+        if hasattr(model, "dtype"):
+            print(f"✓ Model dtype: {model.dtype}")
+    except Exception as e:
+        print(f"✗ Error loading model: {e}")
+        traceback.print_exc()
+        sys.exit(1)  # non-zero status so shells/CI can detect the failure
+
+    # Step 3: Test forward pass
+    print("\n[Step 3] Running forward pass...")
+    try:
+        # Use the _embed_images function which handles batching and forward pass
+        images = [test_image]
+        print(f"Processing {len(images)} image(s)...")
+
+        doc_vecs = _embed_images(model, processor, images)
+
+        print("✓ Forward pass completed!")
+        print(f"✓ Number of embeddings: {len(doc_vecs)}")
+
+        if len(doc_vecs) > 0:
+            emb = doc_vecs[0]
+            print(f"✓ Embedding shape: {emb.shape}")
+            print(f"✓ Embedding dtype: {emb.dtype}")
+            print("✓ Embedding stats:")
+            print(f"    - Min: {emb.min().item():.4f}")
+            print(f"    - Max: {emb.max().item():.4f}")
+            print(f"    - Mean: {emb.mean().item():.4f}")
+            print(f"    - Std: {emb.std().item():.4f}")
+
+            # Check for NaN or Inf
+            if torch.isnan(emb).any():
+                print("⚠ Warning: Embedding contains NaN values!")
+            if torch.isinf(emb).any():
+                print("⚠ Warning: Embedding contains Inf values!")
+    except Exception as e:
+        print(f"✗ Error during forward pass: {e}")
+        traceback.print_exc()
+        sys.exit(1)  # non-zero status so shells/CI can detect the failure
+
+    print("\n" + "=" * 60)
+    print("Test completed successfully!")
+    print("=" * 60)
+
+
+if __name__ == "__main__":
+    main()
--- a/apps/multimodal/vision-based-pdf-multi-vector/leann_multi_vector.py
+++ b/apps/multimodal/vision-based-pdf-multi-vector/leann_multi_vector.py
--- a/apps/multimodal/vision-based-pdf-multi-vector/multi-vector-leann-similarity-map.py
+++ b/apps/multimodal/vision-based-pdf-multi-vector/multi-vector-leann-similarity-map.py
@@ -1,42 +1,75 @@
 ## Jupyter-style notebook script
 # %%
 # uv pip install matplotlib qwen_vl_utils
+import argparse
+import faulthandler
 import os
-import json
-import re
-import sys
-from pathlib import Path
-from typing import Any, Optional, cast
+import time
+from typing import Any, Optional

+import numpy as np
 from PIL import Image
 from tqdm import tqdm

+# Enable faulthandler to get stack trace on segfault
+faulthandler.enable()

-def _ensure_repo_paths_importable(current_file: str) -> None:
-    """Make local leann packages importable without installing (mirrors multi-vector-leann.py)."""
-    _repo_root = Path(current_file).resolve().parents[3]
-    _leann_core_src = _repo_root / "packages" / "leann-core" / "src"
-    _leann_hnsw_pkg = _repo_root / "packages" / "leann-backend-hnsw"
-    if str(_leann_core_src) not in sys.path:
-        sys.path.append(str(_leann_core_src))
-    if str(_leann_hnsw_pkg) not in sys.path:
-        sys.path.append(str(_leann_hnsw_pkg))

+from leann_multi_vector import (  # utility functions/classes
+    _ensure_repo_paths_importable,
+    _load_images_from_dir,
+    _maybe_convert_pdf_to_images,
+    _load_colvision,
+    _embed_images,
+    _embed_queries,
+    _build_index,
+    _load_retriever_if_index_exists,
+    _generate_similarity_map,
+    _build_fast_plaid_index,
+    _load_fast_plaid_index_if_exists,
+    _search_fast_plaid,
+    _get_fast_plaid_image,
+    _get_fast_plaid_metadata,
+    QwenVL,
+)

 _ensure_repo_paths_importable(__file__)

-from leann_multi_vector import LeannMultiVector  # noqa: E402
-
 # %%
 # Config
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
-QUERY = "How does DeepSeek-V2 compare against the LLaMA family of LLMs?"
+QUERY = "The paper talk about the latent video generative model and data curation in the related work part?"
 MODEL: str = "colqwen2"  # "colpali" or "colqwen2"

 # Data source: set to True to use the Hugging Face dataset example (recommended)
 USE_HF_DATASET: bool = True
+# Single dataset name (used when DATASET_NAMES is None)
 DATASET_NAME: str = "weaviate/arXiv-AI-papers-multi-vector"
-DATASET_SPLIT: str = "train"
+# Multiple datasets to combine (if provided, DATASET_NAME is ignored)
+# Can be:
+# - List of strings: ["dataset1", "dataset2"]
+# - List of tuples: [("dataset1", "config1"), ("dataset2", None)]  # None = no config needed
+# - Mixed: ["dataset1", ("dataset2", "config2")]
+#
+# Some potential datasets with images (may need IMAGE_FIELD_NAME adjustment):
+# - "weaviate/arXiv-AI-papers-multi-vector" (current, has "page_image" field)
+# - ("lmms-lab/DocVQA", "DocVQA") (has "image" field, document images, needs config)
+# - ("lmms-lab/DocVQA", "InfographicVQA") (has "image" field, infographic images)
+# - "pixparse/arxiv-papers" (if available, arXiv papers)
+# - "allenai/ai2d" (AI2D diagram dataset, has "image" field)
+# - "huggingface/document-images" (if available)
+# Note: Check dataset structure first - some may need IMAGE_FIELD_NAME specified
+# Set DATASET_NAMES = None to fall back to the single DATASET_NAME above.
+DATASET_NAMES = [
+    "weaviate/arXiv-AI-papers-multi-vector",
+    ("lmms-lab/DocVQA", "DocVQA"),  # Specify config name for datasets with multiple configs
+]
+# Load multiple splits to get more data (e.g., ["train", "test", "validation"])
+# Set to None to try loading all available splits automatically
+DATASET_SPLITS: Optional[list[str]] = ["train", "test"]  # None = auto-detect all splits
+# Image field name in the dataset (auto-detect if None)
+# Common names: "page_image", "image", "images", "img"
+IMAGE_FIELD_NAME: Optional[str] = None  # None = auto-detect
 MAX_DOCS: Optional[int] = None  # limit number of pages to index; None = all

 # Local pages (used when USE_HF_DATASET == False)
@@ -44,10 +77,13 @@ PDF: Optional[str] = None  # e.g., "./pdfs/2004.12832v2.pdf"
 PAGES_DIR: str = "./pages"

 # Index + retrieval settings
-INDEX_PATH: str = "./indexes/colvision.leann"
-TOPK: int = 1
+# Use a different index path for larger dataset to avoid overwriting existing index
+INDEX_PATH: str = "./indexes/colvision_large.leann"
+# Fast-Plaid settings (enable flag and index path) are command-line arguments;
+# see the "CLI overrides" section below.
+TOPK: int = 3
 FIRST_STAGE_K: int = 500
-REBUILD_INDEX: bool = False
+REBUILD_INDEX: bool = True

 # Artifacts
 SAVE_TOP_IMAGE: Optional[str] = "./figures/retrieved_page.png"
@@ -55,369 +91,517 @@ SIMILARITY_MAP: bool = True
 SIM_TOKEN_IDX: int = 13  # -1 means auto-select the most salient token
 SIM_OUTPUT: str = "./figures/similarity_map.png"
 ANSWER: bool = True
-MAX_NEW_TOKENS: int = 128
+MAX_NEW_TOKENS: int = 1024


 # %%
-# Helpers
-def _natural_sort_key(name: str) -> int:
-    m = re.search(r"\d+", name)
-    return int(m.group()) if m else 0
-
-
-def _load_images_from_dir(pages_dir: str) -> tuple[list[str], list[Image.Image]]:
-    filenames = [n for n in os.listdir(pages_dir) if n.lower().endswith((".png", ".jpg", ".jpeg"))]
-    filenames = sorted(filenames, key=_natural_sort_key)
-    filepaths = [os.path.join(pages_dir, n) for n in filenames]
-    images = [Image.open(p) for p in filepaths]
-    return filepaths, images
-
-
-def _maybe_convert_pdf_to_images(pdf_path: Optional[str], pages_dir: str, dpi: int = 200) -> None:
-    if not pdf_path:
-        return
-    os.makedirs(pages_dir, exist_ok=True)
-    try:
-        from pdf2image import convert_from_path
-    except Exception as e:
-        raise RuntimeError(
-            "pdf2image is required to convert PDF to images. Install via pip install pdf2image"
-        ) from e
-    images = convert_from_path(pdf_path, dpi=dpi)
-    for i, image in enumerate(images):
-        image.save(os.path.join(pages_dir, f"page_{i + 1}.png"), "PNG")
-
-
-def _select_device_and_dtype():
-    import torch
-    from colpali_engine.utils.torch_utils import get_torch_device
-
-    device_str = (
-        "cuda"
-        if torch.cuda.is_available()
-        else (
-            "mps"
-            if getattr(torch.backends, "mps", None) and torch.backends.mps.is_available()
-            else "cpu"
-        )
-    )
-    device = get_torch_device(device_str)
-    # Stable dtype selection to avoid NaNs:
-    # - CUDA: prefer bfloat16 if supported, else float16
-    # - MPS: use float32 (fp16 on MPS can produce NaNs in some ops)
-    # - CPU: float32
-    if device_str == "cuda":
-        dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
-        try:
-            torch.backends.cuda.matmul.allow_tf32 = True  # Better stability/perf on Ampere+
-        except Exception:
-            pass
-    elif device_str == "mps":
-        dtype = torch.float32
-    else:
-        dtype = torch.float32
-    return device_str, device, dtype
-
-
-def _load_colvision(model_choice: str):
-    import torch
-    from colpali_engine.models import ColPali, ColQwen2, ColQwen2Processor
-    from colpali_engine.models.paligemma.colpali.processing_colpali import ColPaliProcessor
-    from transformers.utils.import_utils import is_flash_attn_2_available
-
-    device_str, device, dtype = _select_device_and_dtype()
-
-    if model_choice == "colqwen2":
-        model_name = "vidore/colqwen2-v1.0"
-        # On CPU/MPS we must avoid flash-attn and stay eager; on CUDA prefer flash-attn if available
-        attn_implementation = (
-            "flash_attention_2"
-            if (device_str == "cuda" and is_flash_attn_2_available())
-            else "eager"
-        )
-        model = ColQwen2.from_pretrained(
-            model_name,
-            torch_dtype=torch.bfloat16,
-            device_map=device,
-            attn_implementation=attn_implementation,
-        ).eval()
-        processor = ColQwen2Processor.from_pretrained(model_name)
-    else:
-        model_name = "vidore/colpali-v1.2"
-        model = ColPali.from_pretrained(
-            model_name,
-            torch_dtype=torch.bfloat16,
-            device_map=device,
-        ).eval()
-        processor = cast(ColPaliProcessor, ColPaliProcessor.from_pretrained(model_name))
-
-    return model_name, model, processor, device_str, device, dtype
-
-
-def _embed_images(model, processor, images: list[Image.Image]) -> list[Any]:
-    import torch
-    from colpali_engine.utils.torch_utils import ListDataset
-    from torch.utils.data import DataLoader
-
-    # Ensure deterministic eval and autocast for stability
-    model.eval()
-
-    dataloader = DataLoader(
-        dataset=ListDataset[Image.Image](images),
-        batch_size=1,
-        shuffle=False,
-        collate_fn=lambda x: processor.process_images(x),
-    )
-
-    doc_vecs: list[Any] = []
-    for batch_doc in tqdm(dataloader, desc="Embedding images"):
-        with torch.no_grad():
-            batch_doc = {k: v.to(model.device) for k, v in batch_doc.items()}
-            # autocast on CUDA for bf16/fp16; on CPU/MPS stay in fp32
-            if model.device.type == "cuda":
-                with torch.autocast(
-                    device_type="cuda",
-                    dtype=model.dtype if model.dtype.is_floating_point else torch.bfloat16,
-                ):
-                    embeddings_doc = model(**batch_doc)
-            else:
-                embeddings_doc = model(**batch_doc)
-        doc_vecs.extend(list(torch.unbind(embeddings_doc.to("cpu"))))
-    return doc_vecs
-
-
-def _embed_queries(model, processor, queries: list[str]) -> list[Any]:
-    import torch
-    from colpali_engine.utils.torch_utils import ListDataset
-    from torch.utils.data import DataLoader
-
-    model.eval()
-
-    dataloader = DataLoader(
-        dataset=ListDataset[str](queries),
-        batch_size=1,
-        shuffle=False,
-        collate_fn=lambda x: processor.process_queries(x),
-    )
-
-    q_vecs: list[Any] = []
-    for batch_query in tqdm(dataloader, desc="Embedding queries"):
-        with torch.no_grad():
-            batch_query = {k: v.to(model.device) for k, v in batch_query.items()}
-            if model.device.type == "cuda":
-                with torch.autocast(
-                    device_type="cuda",
-                    dtype=model.dtype if model.dtype.is_floating_point else torch.bfloat16,
-                ):
-                    embeddings_query = model(**batch_query)
-            else:
-                embeddings_query = model(**batch_query)
-        q_vecs.extend(list(torch.unbind(embeddings_query.to("cpu"))))
-    return q_vecs
-
-
-def _build_index(index_path: str, doc_vecs: list[Any], filepaths: list[str]) -> LeannMultiVector:
-    dim = int(doc_vecs[0].shape[-1])
-    retriever = LeannMultiVector(index_path=index_path, dim=dim)
-    retriever.create_collection()
-    for i, vec in enumerate(doc_vecs):
-        data = {
-            "colbert_vecs": vec.float().numpy(),
-            "doc_id": i,
-            "filepath": filepaths[i],
-        }
-        retriever.insert(data)
-    retriever.create_index()
-    return retriever
-
-
-def _load_retriever_if_index_exists(index_path: str) -> Optional[LeannMultiVector]:
-    index_base = Path(index_path)
-    # Rough heuristic: index dir exists AND meta+labels files exist
-    meta = index_base.parent / f"{index_base.name}.meta.json"
-    labels = index_base.parent / f"{index_base.name}.labels.json"
-    if index_base.exists() and meta.exists() and labels.exists():
-        try:
-            with open(meta, "r", encoding="utf-8") as f:
-                meta_json = json.load(f)
-            dim = int(meta_json.get("dimensions", 128))
-        except Exception:
-            dim = 128
-        return LeannMultiVector(index_path=index_path, dim=dim)
-    return None
-
-
-def _generate_similarity_map(
-    model,
-    processor,
-    image: Image.Image,
-    query: str,
-    token_idx: Optional[int] = None,
-    output_path: Optional[str] = None,
-) -> tuple[int, float]:
-    import torch
-    from colpali_engine.interpretability import (
-        get_similarity_maps_from_embeddings,
-        plot_similarity_map,
-    )
-
-    batch_images = processor.process_images([image]).to(model.device)
-    batch_queries = processor.process_queries([query]).to(model.device)
-
-    with torch.no_grad():
-        image_embeddings = model.forward(**batch_images)
-        query_embeddings = model.forward(**batch_queries)
-
-    n_patches = processor.get_n_patches(
-        image_size=image.size,
-        spatial_merge_size=getattr(model, "spatial_merge_size", None),
-    )
-    image_mask = processor.get_image_mask(batch_images)
-
-    batched_similarity_maps = get_similarity_maps_from_embeddings(
-        image_embeddings=image_embeddings,
-        query_embeddings=query_embeddings,
-        n_patches=n_patches,
-        image_mask=image_mask,
-    )
-
-    similarity_maps = batched_similarity_maps[0]
-
-    # Determine token index if not provided: choose the token with highest max score
-    if token_idx is None:
-        per_token_max = similarity_maps.view(similarity_maps.shape[0], -1).max(dim=1).values
-        token_idx = int(per_token_max.argmax().item())
-
-    max_sim_score = similarity_maps[token_idx, :, :].max().item()
-
-    if output_path:
-        import matplotlib.pyplot as plt
-
-        fig, ax = plot_similarity_map(
-            image=image,
-            similarity_map=similarity_maps[token_idx],
-            figsize=(14, 14),
-            show_colorbar=False,
-        )
-        ax.set_title(f"Token #{token_idx}. MaxSim score: {max_sim_score:.2f}", fontsize=12)
-        os.makedirs(os.path.dirname(output_path), exist_ok=True)
-        plt.savefig(output_path, bbox_inches="tight")
-        plt.close(fig)
-
-    return token_idx, float(max_sim_score)
-
-
-class QwenVL:
-    def __init__(self, device: str):
-        from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
-        from transformers.utils.import_utils import is_flash_attn_2_available
-
-        attn_implementation = "flash_attention_2" if is_flash_attn_2_available() else "eager"
-        self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-            "Qwen/Qwen2.5-VL-3B-Instruct",
-            torch_dtype="auto",
-            device_map=device,
-            attn_implementation=attn_implementation,
-        )
-
-        min_pixels = 256 * 28 * 28
-        max_pixels = 1280 * 28 * 28
-        self.processor = AutoProcessor.from_pretrained(
-            "Qwen/Qwen2.5-VL-3B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels
-        )
-
-    def answer(self, query: str, images: list[Image.Image], max_new_tokens: int = 128) -> str:
-        import base64
-        from io import BytesIO
-
-        from qwen_vl_utils import process_vision_info
-
-        content = []
-        for img in images:
-            buffer = BytesIO()
-            img.save(buffer, format="jpeg")
-            img_base64 = base64.b64encode(buffer.getvalue()).decode("utf-8")
-            content.append({"type": "image", "image": f"data:image;base64,{img_base64}"})
-        content.append({"type": "text", "text": query})
-        messages = [{"role": "user", "content": content}]
-
-        text = self.processor.apply_chat_template(
-            messages, tokenize=False, add_generation_prompt=True
-        )
-        image_inputs, video_inputs = process_vision_info(messages)
-        inputs = self.processor(
-            text=[text], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt"
-        )
-        inputs = inputs.to(self.model.device)
-
-        generated_ids = self.model.generate(**inputs, max_new_tokens=max_new_tokens)
-        generated_ids_trimmed = [
-            out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
-        ]
-        return self.processor.batch_decode(
-            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
-        )[0]
-
+# CLI overrides (parse_known_args tolerates argv entries it does not recognise)
+parser = argparse.ArgumentParser(description="Multi-vector LEANN similarity map demo")
+parser.add_argument(
+    "--search-method",
+    type=str,
+    choices=["ann", "exact", "exact-all"],
+    default="ann",
+    help="Which search method to use: 'ann' (fast ANN), 'exact' (ANN + exact rerank), or 'exact-all' (exact over all docs).",
+)
+parser.add_argument(
+    "--query",
+    type=str,
+    default=QUERY,
+    help=f"Query string to search for. Default: '{QUERY}'",
+)
+parser.add_argument(
+    "--use-fast-plaid",
+    action="store_true",
+    default=False,
+    help="Use fast-plaid instead of LEANN (flag takes no value; off unless passed).",
+)
+parser.add_argument(
+    "--fast-plaid-index-path",
+    type=str,
+    default="./indexes/colvision_fastplaid",
+    help="Path to the Fast-Plaid index. Default: './indexes/colvision_fastplaid'",
+)
+parser.add_argument(
+    "--topk",
+    type=int,
+    default=TOPK,
+    help=f"Number of top results to retrieve. Default: {TOPK}",
+)
+cli_args, _unknown = parser.parse_known_args()
+SEARCH_METHOD: str = cli_args.search_method
+QUERY = cli_args.query  # Override QUERY with CLI argument if provided
+USE_FAST_PLAID: bool = cli_args.use_fast_plaid
+FAST_PLAID_INDEX_PATH: str = cli_args.fast_plaid_index_path
+TOPK = cli_args.topk  # Override TOPK (already annotated above) with the CLI value

 # %%

-# Step 1: Prepare data
-if USE_HF_DATASET:
-    from datasets import load_dataset
+# Step 1: Check if we can skip data loading (index already exists)
+retriever: Optional[Any] = None
+fast_plaid_index: Optional[Any] = None
+need_to_build_index = REBUILD_INDEX

-    dataset = load_dataset(DATASET_NAME, split=DATASET_SPLIT)
-    N = len(dataset) if MAX_DOCS is None else min(MAX_DOCS, len(dataset))
-    filepaths: list[str] = []
-    images: list[Image.Image] = []
-    for i in tqdm(range(N), desc="Loading dataset", total=N ):
-        p = dataset[i]
-        # Compose a descriptive identifier for printing later
-        identifier = f"arXiv:{p['paper_arxiv_id']}|title:{p['paper_title']}|page:{int(p['page_number'])}|id:{p['page_id']}"
-        print(identifier)
-        filepaths.append(identifier)
-        images.append(p["page_image"])  # PIL Image
+if USE_FAST_PLAID:
+    # Fast-Plaid index handling: reuse an on-disk index unless a rebuild is forced
+    if not REBUILD_INDEX:
+        try:
+            fast_plaid_index = _load_fast_plaid_index_if_exists(FAST_PLAID_INDEX_PATH)
+            if fast_plaid_index is not None:
+                print(f"✓ Fast-Plaid index found at {FAST_PLAID_INDEX_PATH}")
+                need_to_build_index = False
+            else:
+                print("Fast-Plaid index not found, will build new index")
+                need_to_build_index = True
+        except Exception as e:
+            # If loading fails (e.g., memory error, corrupted index), rebuild
+            print(f"Warning: Failed to load Fast-Plaid index: {e}")
+            print("Will rebuild the index...")
+            need_to_build_index = True
+            fast_plaid_index = None
+    else:
+        print("REBUILD_INDEX=True, will rebuild Fast-Plaid index")
+        need_to_build_index = True
 else:
-    _maybe_convert_pdf_to_images(PDF, PAGES_DIR)
-    filepaths, images = _load_images_from_dir(PAGES_DIR)
-    if not images:
-        raise RuntimeError(
-            f"No images found in {PAGES_DIR}. Provide PDF path in PDF variable or ensure images exist."
-        )
+    # Original LEANN index handling: reuse an on-disk index unless a rebuild is forced
+    if not REBUILD_INDEX:
+        retriever = _load_retriever_if_index_exists(INDEX_PATH)
+        if retriever is not None:
+            print(f"✓ Index loaded from {INDEX_PATH}")
+            print(f"✓ Images available at: {retriever._images_dir_path()}")
+            need_to_build_index = False
+        else:
+            print("Index not found, will build new index")
+            need_to_build_index = True
+    else:
+        print("REBUILD_INDEX=True, will rebuild index")
+        need_to_build_index = True
+
+# Step 2: Load data only if we need to build the index
+if need_to_build_index:
+    print("Loading dataset...")
+    if USE_HF_DATASET:
+        from datasets import load_dataset, concatenate_datasets, DatasetDict
+
+        # Determine which datasets to load
+        if DATASET_NAMES is not None:
+            dataset_names_to_load = DATASET_NAMES
+            print(f"Loading {len(dataset_names_to_load)} datasets: {dataset_names_to_load}")
+        else:
+            dataset_names_to_load = [DATASET_NAME]
+            print(f"Loading single dataset: {DATASET_NAME}")
+
+        # Load and combine datasets
+        all_datasets_to_concat = []
+
+        for dataset_entry in dataset_names_to_load:
+            # Handle both string and tuple formats
+            if isinstance(dataset_entry, tuple):
+                dataset_name, config_name = dataset_entry
+            else:
+                dataset_name = dataset_entry
+                config_name = None
+
+            print(f"\nProcessing dataset: {dataset_name}" + (f" (config: {config_name})" if config_name else ""))
+
+            # Load dataset to check available splits
+            # If config_name is provided, use it; otherwise try without config
+            try:
+                if config_name:
+                    dataset_dict = load_dataset(dataset_name, config_name)
+                else:
+                    dataset_dict = load_dataset(dataset_name)
+            except ValueError as e:
+                if "Config name is missing" not in str(e):
+                    raise
+                # Listing configs is best-effort: if the lookup itself fails,
+                # fall back to the generic hint. The lookup is kept separate
+                # from the raise so the detailed error is not swallowed.
+                from datasets import get_dataset_config_names
+
+                try:
+                    configs_hint = f"Available configs: {get_dataset_config_names(dataset_name)}. "
+                except Exception:
+                    configs_hint = ""
+                raise ValueError(
+                    f"Dataset '{dataset_name}' requires a config name. "
+                    f"{configs_hint}"
+                    f"Please specify as: ('{dataset_name}', 'config_name')"
+                ) from e
+
+            # Determine which splits to load
+            if DATASET_SPLITS is None:
+                # Auto-detect: try to load all available splits
+                available_splits = list(dataset_dict.keys())
+                print(f"  Auto-detected splits: {available_splits}")
+                splits_to_load = available_splits
+            else:
+                splits_to_load = DATASET_SPLITS
+
+            # Load and concatenate multiple splits for this dataset
+            datasets_to_concat = []
+            for split in splits_to_load:
+                if split not in dataset_dict:
+                    print(f"  Warning: Split '{split}' not found in dataset. Available splits: {list(dataset_dict.keys())}")
+                    continue
+                split_dataset = dataset_dict[split]
+                print(f"  Loaded split '{split}': {len(split_dataset)} pages")
+                datasets_to_concat.append(split_dataset)
+
+            if not datasets_to_concat:
+                print(f"  Warning: No valid splits found for {dataset_name}. Skipping.")
+                continue
+
+            # Concatenate splits for this dataset
+            if len(datasets_to_concat) > 1:
+                combined_dataset = concatenate_datasets(datasets_to_concat)
+                print(f"  Concatenated {len(datasets_to_concat)} splits into {len(combined_dataset)} pages")
+            else:
+                combined_dataset = datasets_to_concat[0]
+
+            all_datasets_to_concat.append(combined_dataset)
+
+        if not all_datasets_to_concat:
+            raise RuntimeError("No valid datasets or splits found.")
+
+        # Concatenate all datasets
+        if len(all_datasets_to_concat) > 1:
+            dataset = concatenate_datasets(all_datasets_to_concat)
+            print(f"\nConcatenated {len(all_datasets_to_concat)} datasets into {len(dataset)} total pages")
+        else:
+            dataset = all_datasets_to_concat[0]
+
+        # Apply MAX_DOCS limit if specified
+        N = len(dataset) if MAX_DOCS is None else min(MAX_DOCS, len(dataset))
+        if N < len(dataset):
+            print(f"Limiting to {N} pages (from {len(dataset)} total)")
+            dataset = dataset.select(range(N))
+
+        # Auto-detect image field name if not specified
+        if IMAGE_FIELD_NAME is None:
+            # Check multiple samples to find the most common image field
+            # (useful when datasets are merged and may have different field names)
+            possible_image_fields = ["page_image", "image", "images", "img", "page", "document_image"]
+            field_counts = {}
+
+            # Check first few samples to find image fields
+            num_samples_to_check = min(10, len(dataset))
+            for sample_idx in range(num_samples_to_check):
+                sample = dataset[sample_idx]
+                for field in possible_image_fields:
+                    if field in sample and sample[field] is not None:
+                        value = sample[field]
+                        if isinstance(value, Image.Image) or (hasattr(value, 'size') and hasattr(value, 'mode')):
+                            field_counts[field] = field_counts.get(field, 0) + 1
+
+            # Choose the most common field, or first found if tied
+            if field_counts:
+                image_field = max(field_counts.items(), key=lambda x: x[1])[0]
+                print(f"Auto-detected image field: '{image_field}' (found in {field_counts[image_field]}/{num_samples_to_check} samples)")
+            else:
+                # Fallback: check first sample only
+                sample = dataset[0]
+                image_field = None
+                for field in possible_image_fields:
+                    if field in sample:
+                        value = sample[field]
+                        if isinstance(value, Image.Image) or (hasattr(value, 'size') and hasattr(value, 'mode')):
+                            image_field = field
+                            break
+                if image_field is None:
+                    raise RuntimeError(
+                        f"Could not auto-detect image field. Available fields: {list(sample.keys())}. "
+                        f"Please specify IMAGE_FIELD_NAME manually."
+                    )
+                print(f"Auto-detected image field: '{image_field}'")
+        else:
+            image_field = IMAGE_FIELD_NAME
+            if image_field not in dataset[0]:
+                raise RuntimeError(
+                    f"Image field '{image_field}' not found. Available fields: {list(dataset[0].keys())}"
+                )
+
+        filepaths: list[str] = []
+        images: list[Image.Image] = []
+        for i in tqdm(range(len(dataset)), desc="Loading dataset", total=len(dataset)):
+            p = dataset[i]
+            # Try to compose a descriptive identifier
+            # Handle different dataset structures
+            identifier_parts = []
+
+            # Helper function to safely get field value
+            def safe_get(field_name, default=None):
+                if field_name in p and p[field_name] is not None:
+                    return p[field_name]
+                return default
+
+            # Try to get various identifier fields
+            if safe_get("paper_arxiv_id"):
+                identifier_parts.append(f"arXiv:{p['paper_arxiv_id']}")
+            if safe_get("paper_title"):
+                identifier_parts.append(f"title:{p['paper_title']}")
+            if safe_get("page_number") is not None:
+                try:
+                    identifier_parts.append(f"page:{int(p['page_number'])}")
+                except (ValueError, TypeError):
+                    # If conversion fails, use the raw value or skip
+                    if p['page_number']:
+                        identifier_parts.append(f"page:{p['page_number']}")
+            if safe_get("page_id"):
+                identifier_parts.append(f"id:{p['page_id']}")
+            elif safe_get("questionId"):
+                identifier_parts.append(f"qid:{p['questionId']}")
+            elif safe_get("docId"):
+                identifier_parts.append(f"docId:{p['docId']}")
+            elif safe_get("id"):
+                identifier_parts.append(f"id:{p['id']}")
+
+            # If no identifier parts found, create one from index
+            if identifier_parts:
+                identifier = "|".join(identifier_parts)
+            else:
+                # Create identifier from available fields or index
+                fallback_parts = []
+                # Try common fields that might exist
+                for field in ["ucsf_document_id", "docId", "questionId", "id"]:
+                    if safe_get(field):
+                        fallback_parts.append(f"{field}:{p[field]}")
+                        break
+                if fallback_parts:
+                    identifier = "|".join(fallback_parts) + f"|idx:{i}"
+                else:
+                    identifier = f"doc_{i}"
+
+            filepaths.append(identifier)
+
+            # Get image - try detected field first, then fallback to other common fields
+            img = None
+            if image_field in p and p[image_field] is not None:
+                img = p[image_field]
+            else:
+                # Fallback: try other common image field names
+                for fallback_field in ["image", "page_image", "images", "img"]:
+                    if fallback_field in p and p[fallback_field] is not None:
+                        img = p[fallback_field]
+                        break
+
+            if img is None:
+                raise RuntimeError(
+                    f"No image found for sample {i}. Available fields: {list(p.keys())}. "
+                    f"Expected field: {image_field}"
+                )
+
+            # Ensure it's a PIL Image
+            if not isinstance(img, Image.Image):
+                if hasattr(img, 'convert'):
+                    img = img.convert('RGB')
+                else:
+                    img = Image.fromarray(img) if hasattr(img, '__array__') else Image.open(img)
+            images.append(img)
+    else:
+        _maybe_convert_pdf_to_images(PDF, PAGES_DIR)
+        filepaths, images = _load_images_from_dir(PAGES_DIR)
+        if not images:
+            raise RuntimeError(
+                f"No images found in {PAGES_DIR}. Provide PDF path in PDF variable or ensure images exist."
+            )
+    print(f"Loaded {len(images)} images")
+
+    # Memory check before loading model
+    try:
+        import psutil
+        import torch
+        process = psutil.Process(os.getpid())
+        mem_info = process.memory_info()
+        print(f"Memory usage after loading images: {mem_info.rss / 1024 / 1024 / 1024:.2f} GB")
+        if torch.cuda.is_available():
+            print(f"GPU memory allocated: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
+            print(f"GPU memory reserved: {torch.cuda.memory_reserved() / 1024**3:.2f} GB")
+    except ImportError:
+        pass
+else:
+    print("Skipping dataset loading (using existing index)")
+    filepaths = []  # Not needed when using existing index
+    images = []  # Not needed when using existing index


 # %%
-# Step 2: Load model and processor
-model_name, model, processor, device_str, device, dtype = _load_colvision(MODEL)
-print(f"Using model={model_name}, device={device_str}, dtype={dtype}")
+# Step 3: Load model and processor (runs unconditionally; both indexing and query embedding use them)
+print("Step 3: Loading model and processor...")
+print(f"  Model: {MODEL}")
+try:
+    import sys
+    print(f"  Python version: {sys.version}")
+    print(f"  Python executable: {sys.executable}")
+    # _load_colvision returns a 6-tuple; device_str is reused later when creating QwenVL.
+    model_name, model, processor, device_str, device, dtype = _load_colvision(MODEL)
+    print(f"✓ Using model={model_name}, device={device_str}, dtype={dtype}")
+
+    # Memory check after loading model (best effort: silently skipped if psutil/torch cannot be imported)
+    try:
+        import psutil
+        import torch
+        process = psutil.Process(os.getpid())
+        mem_info = process.memory_info()
+        print(f"  Memory usage after loading model: {mem_info.rss / 1024 / 1024 / 1024:.2f} GB")
+        if torch.cuda.is_available():
+            print(f"  GPU memory allocated: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
+            print(f"  GPU memory reserved: {torch.cuda.memory_reserved() / 1024**3:.2f} GB")
+    except ImportError:
+        pass
+except Exception as e:
+    print(f"✗ Error loading model: {type(e).__name__}: {e}")
+    import traceback
+    traceback.print_exc()
+    raise


 # %%

 # %%
-# Step 3: Build or load index
-retriever: Optional[LeannMultiVector] = None
-if not REBUILD_INDEX:
-    retriever = _load_retriever_if_index_exists(INDEX_PATH)
+# Step 4: Build index if needed
+if need_to_build_index:
+    print("Step 4: Building index...")
+    print(f"  Number of images: {len(images)}")
+    print(f"  Number of filepaths: {len(filepaths)}")

-if retriever is None:
-    doc_vecs = _embed_images(model, processor, images)
-    retriever = _build_index(INDEX_PATH, doc_vecs, filepaths)
+    try:
+        print("  Embedding images...")
+        doc_vecs = _embed_images(model, processor, images)
+        print(f"  Embedded {len(doc_vecs)} documents")
+        print(f"  First doc vec shape: {doc_vecs[0].shape if len(doc_vecs) > 0 else 'N/A'}")
+    except Exception as e:
+        print(f"Error embedding images: {type(e).__name__}: {e}")
+        import traceback
+        traceback.print_exc()
+        raise
+
+    if USE_FAST_PLAID:
+        # Build Fast-Plaid index
+        print("  Building Fast-Plaid index...")
+        try:
+            fast_plaid_index, build_secs = _build_fast_plaid_index(
+                FAST_PLAID_INDEX_PATH, doc_vecs, filepaths, images
+            )
+            from pathlib import Path
+            print(f"✓ Fast-Plaid index built in {build_secs:.3f}s")
+            print(f"✓ Index saved to: {FAST_PLAID_INDEX_PATH}")
+            print(f"✓ Images saved to: {Path(FAST_PLAID_INDEX_PATH) / 'images'}")
+        except Exception as e:
+            print(f"Error building Fast-Plaid index: {type(e).__name__}: {e}")
+            import traceback
+            traceback.print_exc()
+            raise
+        finally:
+            # Clear memory
+            print("  Clearing memory...")
+            del images, filepaths, doc_vecs
+    else:
+        # Build original LEANN index
+        try:
+            retriever = _build_index(INDEX_PATH, doc_vecs, filepaths, images)
+            print(f"✓ Index built and images saved to: {retriever._images_dir_path()}")
+        except Exception as e:
+            print(f"Error building LEANN index: {type(e).__name__}: {e}")
+            import traceback
+            traceback.print_exc()
+            raise
+        finally:
+            # Clear memory
+            print("  Clearing memory...")
+            del images, filepaths, doc_vecs
+
+# Note: Images are now stored separately, retriever/fast_plaid_index will reference them


 # %%
-# Step 4: Embed query and search
+# Step 5: Embed query and search
+_t0 = time.perf_counter()
 q_vec = _embed_queries(model, processor, [QUERY])[0]
-results = retriever.search(q_vec.float().numpy(), topk=TOPK, first_stage_k=FIRST_STAGE_K)
+query_embed_secs = time.perf_counter() - _t0
+
+print(f"[Search] Method: {SEARCH_METHOD}")
+print(f"[Timing] Query embedding: {query_embed_secs:.3f}s")
+
+# Run the selected search method and time it (reported separately from the query embedding time)
+if USE_FAST_PLAID:
+    # Fast-Plaid search: reuse the index built in Step 4, otherwise load it from disk
+    if fast_plaid_index is None:
+        fast_plaid_index = _load_fast_plaid_index_if_exists(FAST_PLAID_INDEX_PATH)
+        if fast_plaid_index is None:
+            raise RuntimeError(f"Fast-Plaid index not found at {FAST_PLAID_INDEX_PATH}")
+    # _search_fast_plaid returns its own timing alongside the results
+    results, search_secs = _search_fast_plaid(fast_plaid_index, q_vec, TOPK)
+    print(f"[Timing] Fast-Plaid Search: {search_secs:.3f}s")
+else:
+    # Original LEANN search
+    query_np = q_vec.float().numpy()
+    _search_t0 = time.perf_counter()  # fresh timer: _t0 was started before the query embedding
+    if SEARCH_METHOD == "ann":
+        results = retriever.search(query_np, topk=TOPK, first_stage_k=FIRST_STAGE_K)
+        search_secs = time.perf_counter() - _search_t0
+        print(f"[Timing] Search (ANN): {search_secs:.3f}s (first_stage_k={FIRST_STAGE_K})")
+    elif SEARCH_METHOD == "exact":
+        results = retriever.search_exact(query_np, topk=TOPK, first_stage_k=FIRST_STAGE_K)
+        search_secs = time.perf_counter() - _search_t0
+        print(f"[Timing] Search (Exact rerank): {search_secs:.3f}s (first_stage_k={FIRST_STAGE_K})")
+    elif SEARCH_METHOD == "exact-all":
+        results = retriever.search_exact_all(query_np, topk=TOPK)
+        search_secs = time.perf_counter() - _search_t0
+        print(f"[Timing] Search (Exact all): {search_secs:.3f}s")
+    else:
+        results = []  # unknown SEARCH_METHOD: reported as "No results found." below
 if not results:
    print("No results found.")
 else:
    print(f'Top {len(results)} results for query: "{QUERY}"')
+    print("\n[DEBUG] Retrieval details:")
    top_images: list[Image.Image] = []
+    image_hashes = {}  # Track image hashes to detect duplicates
+
    for rank, (score, doc_id) in enumerate(results, start=1):
-        path = filepaths[doc_id]
-        # For HF dataset, path is a descriptive identifier, not a real file path
-        print(f"{rank}) MaxSim: {score:.4f}, Page: {path}")
-        top_images.append(images[doc_id])
+        # Retrieve image and metadata based on index type
+        if USE_FAST_PLAID:
+            # Fast-Plaid: load image and get metadata
+            image = _get_fast_plaid_image(FAST_PLAID_INDEX_PATH, doc_id)
+            if image is None:
+                print(f"Warning: Could not find image for doc_id {doc_id}")
+                continue
+
+            metadata = _get_fast_plaid_metadata(FAST_PLAID_INDEX_PATH, doc_id)
+            path = metadata.get("filepath", f"doc_{doc_id}") if metadata else f"doc_{doc_id}"
+            top_images.append(image)
+        else:
+            # Original LEANN: retrieve from retriever
+            image = retriever.get_image(doc_id)
+            if image is None:
+                print(f"Warning: Could not retrieve image for doc_id {doc_id}")
+                continue
+
+            metadata = retriever.get_metadata(doc_id)
+            path = metadata.get("filepath", "unknown") if metadata else "unknown"
+            top_images.append(image)
+
+        # Calculate image hash to detect duplicates
+        import hashlib
+        import io
+        # Convert image to bytes for hashing
+        img_bytes = io.BytesIO()
+        image.save(img_bytes, format='PNG')
+        image_bytes = img_bytes.getvalue()
+        image_hash = hashlib.md5(image_bytes).hexdigest()[:8]
+
+        # Check if this image was already seen
+        duplicate_info = ""
+        if image_hash in image_hashes:
+            duplicate_info = f" [DUPLICATE of rank {image_hashes[image_hash]}]"
+        else:
+            image_hashes[image_hash] = rank
+
+        # Print detailed information
+        print(f"{rank}) doc_id={doc_id}, MaxSim={score:.4f}, Page={path}, ImageHash={image_hash}{duplicate_info}")
+        if metadata:
+            print(f"   Metadata: {metadata}")

    if SAVE_TOP_IMAGE:
        from pathlib import Path as _Path
@@ -430,12 +614,17 @@ else:
            else:
                out_path = base / f"retrieved_page_rank{rank}.png"
            img.save(str(out_path))
-            print(f"Saved retrieved page (rank {rank}) to: {out_path}")
+            # Print the retrieval score (document-level MaxSim) alongside the saved path
+            try:
+                score, _doc_id = results[rank - 1]
+                print(f"Saved retrieved page (rank {rank}) [MaxSim={score:.4f}] to: {out_path}")
+            except Exception:
+                print(f"Saved retrieved page (rank {rank}) to: {out_path}")

 ## TODO stange results of second page of DeepSeek-V2 rather than the first page

 # %%
-# Step 5: Similarity maps for top-K results
+# Step 6: Similarity maps for top-K results
 if results and SIMILARITY_MAP:
    token_idx = None if SIM_TOKEN_IDX < 0 else int(SIM_TOKEN_IDX)
    from pathlib import Path as _Path
@@ -472,9 +661,12 @@ if results and SIMILARITY_MAP:


 # %%
-# Step 6: Optional answer generation
+# Step 7: Optional answer generation
 if results and ANSWER:
    qwen = QwenVL(device=device_str)
+    _t0 = time.perf_counter()
    response = qwen.answer(QUERY, top_images[:TOPK], max_new_tokens=MAX_NEW_TOKENS)
+    gen_secs = time.perf_counter() - _t0
+    print(f"[Timing] Generation: {gen_secs:.3f}s")
    print("\nAnswer:")
    print(response)
--- a/apps/multimodal/vision-based-pdf-multi-vector/vidore_v1_benchmark.py
+++ b/apps/multimodal/vision-based-pdf-multi-vector/vidore_v1_benchmark.py
@@ -0,0 +1,399 @@
+#!/usr/bin/env python3
+"""
+Modular script to reproduce NDCG results for ViDoRe v1 benchmark.
+
+This script uses the interface from leann_multi_vector.py to:
+1. Download ViDoRe v1 datasets
+2. Build indexes (LEANN or Fast-Plaid)
+3. Perform retrieval
+4. Evaluate using NDCG metrics
+
+Usage:
+    # Evaluate all ViDoRe v1 tasks
+    python vidore_v1_benchmark.py --model colqwen2 --tasks all
+
+    # Evaluate specific task
+    python vidore_v1_benchmark.py --model colqwen2 --task VidoreArxivQARetrieval
+
+    # Use Fast-Plaid index
+    python vidore_v1_benchmark.py --model colqwen2 --use-fast-plaid --fast-plaid-index-path ./indexes/vidore_fastplaid
+
+    # Rebuild index
+    python vidore_v1_benchmark.py --model colqwen2 --rebuild-index
+"""
+
+import argparse
+import json
+import os
+from typing import Optional
+
+from datasets import load_dataset
+from leann_multi_vector import (
+    ViDoReBenchmarkEvaluator,
+    _ensure_repo_paths_importable,
+)
+
+_ensure_repo_paths_importable(__file__)
+
+# ViDoRe v1 task configurations: task name -> dataset path, pinned dataset revision, query prompt.
+# Prompts match MTEB task metadata prompts (keep their wording verbatim).
+VIDORE_V1_TASKS = {
+    "VidoreArxivQARetrieval": {
+        "dataset_path": "vidore/arxivqa_test_subsampled_beir",
+        "revision": "7d94d570960eac2408d3baa7a33f9de4822ae3e4",
+        "prompt": {"query": "Find a screenshot that relevant to the user's question."},
+    },
+    "VidoreDocVQARetrieval": {
+        "dataset_path": "vidore/docvqa_test_subsampled_beir",
+        "revision": "162ba2fc1a8437eda8b6c37b240bc1c0f0deb092",
+        "prompt": {"query": "Find a screenshot that relevant to the user's question."},
+    },
+    "VidoreInfoVQARetrieval": {
+        "dataset_path": "vidore/infovqa_test_subsampled_beir",
+        "revision": "b802cc5fd6c605df2d673a963667d74881d2c9a4",
+        "prompt": {"query": "Find a screenshot that relevant to the user's question."},
+    },
+    "VidoreTabfquadRetrieval": {
+        "dataset_path": "vidore/tabfquad_test_subsampled_beir",
+        "revision": "61a2224bcd29b7b261a4892ff4c8bea353527a31",
+        "prompt": {"query": "Find a screenshot that relevant to the user's question."},
+    },
+    "VidoreTatdqaRetrieval": {
+        "dataset_path": "vidore/tatdqa_test_beir",
+        "revision": "5feb5630fdff4d8d189ffedb2dba56862fdd45c0",
+        "prompt": {"query": "Find a screenshot that relevant to the user's question."},
+    },
+    "VidoreShiftProjectRetrieval": {
+        "dataset_path": "vidore/shiftproject_test_beir",
+        "revision": "84a382e05c4473fed9cff2bbae95fe2379416117",
+        "prompt": {"query": "Find a screenshot that relevant to the user's question."},
+    },
+    "VidoreSyntheticDocQAAIRetrieval": {
+        "dataset_path": "vidore/syntheticDocQA_artificial_intelligence_test_beir",
+        "revision": "2d9ebea5a1c6e9ef4a3b902a612f605dca11261c",
+        "prompt": {"query": "Find a screenshot that relevant to the user's question."},
+    },
+    "VidoreSyntheticDocQAEnergyRetrieval": {
+        "dataset_path": "vidore/syntheticDocQA_energy_test_beir",
+        "revision": "9935aadbad5c8deec30910489db1b2c7133ae7a7",
+        "prompt": {"query": "Find a screenshot that relevant to the user's question."},
+    },
+    "VidoreSyntheticDocQAGovernmentReportsRetrieval": {
+        "dataset_path": "vidore/syntheticDocQA_government_reports_test_beir",
+        "revision": "b4909afa930f81282fd20601e860668073ad02aa",
+        "prompt": {"query": "Find a screenshot that relevant to the user's question."},
+    },
+    "VidoreSyntheticDocQAHealthcareIndustryRetrieval": {
+        "dataset_path": "vidore/syntheticDocQA_healthcare_industry_test_beir",
+        "revision": "f9e25d5b6e13e1ad9f5c3cce202565031b3ab164",
+        "prompt": {"query": "Find a screenshot that relevant to the user's question."},
+    },
+}
+
+
+def load_vidore_v1_data(
+    dataset_path: str,
+    revision: Optional[str] = None,
+    split: str = "test",
+):
+    """
+    Load the queries, corpus images and relevance judgments of a ViDoRe v1 dataset.
+
+    Args:
+        dataset_path: dataset identifier passed to ``load_dataset``.
+        revision: optional dataset revision passed to ``load_dataset``.
+        split: split to load; it is also embedded in every generated id.
+
+    Returns:
+        corpus: dict mapping corpus_id to the image stored in that corpus row
+        queries: dict mapping query_id to query text (only queries kept in qrels)
+        qrels: dict mapping query_id to dict of {corpus_id: relevance_score}
+
+    Raises:
+        ValueError: if a corpus row has neither an "image" nor a "page_image" field.
+    """
+    print(f"Loading dataset: {dataset_path} (split={split})")
+
+    # All three subsets are read from the same dataset, split and revision.
+    def _subset(name):
+        # The second positional argument selects "queries", "corpus" or "qrels".
+        return load_dataset(dataset_path, name, split=split, revision=revision)
+
+    # Queries: query id -> query text
+    queries = {
+        f"query-{split}-{row['query-id']}": row["query"] for row in _subset("queries")
+    }
+
+    # Corpus: corpus id -> image, preferring "image" over "page_image"
+    corpus = {}
+    for row in _subset("corpus"):
+        corpus_id = f"corpus-{split}-{row['corpus-id']}"
+        image_key = next((key for key in ("image", "page_image") if key in row), None)
+        if image_key is None:
+            raise ValueError(
+                f"No image field found in corpus. Available fields: {list(row.keys())}"
+            )
+        corpus[corpus_id] = row[image_key]
+
+    # Qrels (relevance judgments): query id -> {corpus id: integer score}
+    qrels = {}
+    for row in _subset("qrels"):
+        query_id = f"query-{split}-{row['query-id']}"
+        corpus_id = f"corpus-{split}-{row['corpus-id']}"
+        qrels.setdefault(query_id, {})[corpus_id] = int(row["score"])
+
+    print(
+        f"Loaded {len(queries)} queries, {len(corpus)} corpus items, {len(qrels)} query-relevance mappings"
+    )
+
+    # Keep only qrels whose query exists and which hold at least one judgment
+    # (matching MTEB behavior); this is important for correct NDCG calculation.
+    qrels_filtered = {
+        qid: judged for qid, judged in qrels.items() if qid in queries and judged
+    }
+    queries_filtered = {
+        qid: text for qid, text in queries.items() if qid in qrels_filtered
+    }
+
+    print(
+        f"After filtering queries without positives: {len(queries_filtered)} queries, {len(qrels_filtered)} query-relevance mappings"
+    )
+
+    return corpus, queries_filtered, qrels_filtered
+
+
+def evaluate_task(
+    task_name: str,
+    model_name: str,
+    index_path: str,
+    use_fast_plaid: bool = False,
+    fast_plaid_index_path: Optional[str] = None,
+    rebuild_index: bool = False,
+    top_k: int = 1000,
+    first_stage_k: int = 500,
+    k_values: Optional[list[int]] = None,
+    output_dir: Optional[str] = None,
+):
+    """
+    Run one ViDoRe v1 task end to end: load data, build or load the index, search, score.
+
+    Args:
+        task_name: key into VIDORE_V1_TASKS.
+        model_name: forwarded to ViDoReBenchmarkEvaluator.
+        index_path: index location used when use_fast_plaid is False.
+        use_fast_plaid: use fast_plaid_index_path instead of index_path.
+        fast_plaid_index_path: index location used when use_fast_plaid is True.
+        rebuild_index: forwarded as ``rebuild`` to build_index_from_corpus.
+        top_k, first_stage_k: forwarded to ViDoReBenchmarkEvaluator.
+        k_values: metric cutoffs; defaults to [1, 3, 5, 10, 20, 100, 1000].
+        output_dir: when truthy, results and scores are saved there as JSON.
+
+    Returns:
+        The scores mapping (every metric is 0.0 when the task has no queries).
+
+    Raises:
+        ValueError: if task_name is not in VIDORE_V1_TASKS.
+    """
+    banner = "=" * 80
+    print(f"\n{banner}")
+    print(f"Evaluating task: {task_name}")
+    print(f"{banner}")
+
+    if task_name not in VIDORE_V1_TASKS:
+        raise ValueError(f"Unknown task: {task_name}. Available: {list(VIDORE_V1_TASKS.keys())}")
+
+    task_config = VIDORE_V1_TASKS[task_name]
+    corpus, queries, qrels = load_vidore_v1_data(
+        dataset_path=task_config["dataset_path"],
+        revision=task_config["revision"],
+        split="test",
+    )
+
+    # Fall back to the default cutoffs when none were provided
+    if k_values is None:
+        k_values = [1, 3, 5, 10, 20, 100, 1000]
+
+    # Nothing to evaluate: report zero for every metric at every cutoff
+    if not queries:
+        print(f"\nWarning: No queries found for task {task_name}. Skipping evaluation.")
+        metric_names = ("ndcg", "map", "recall", "precision", "mrr")
+        return {f"{name}_at_{k}": 0.0 for k in k_values for name in metric_names}
+
+    evaluator = ViDoReBenchmarkEvaluator(
+        model_name=model_name,
+        use_fast_plaid=use_fast_plaid,
+        top_k=top_k,
+        first_stage_k=first_stage_k,
+        k_values=k_values,
+    )
+
+    # Pick the index location for the selected backend, or derive one per task/model
+    index_path_full = fast_plaid_index_path if use_fast_plaid else index_path
+    if index_path_full is None:
+        suffix = "_fastplaid" if use_fast_plaid else ""
+        index_path_full = f"./indexes/{task_name}_{model_name}{suffix}"
+
+    index_or_retriever, corpus_ids_ordered = evaluator.build_index_from_corpus(
+        corpus=corpus,
+        index_path=index_path_full,
+        rebuild=rebuild_index,
+    )
+
+    # Search with the task's prompt; the Fast-Plaid path is passed through as given
+    results = evaluator.search_queries(
+        queries=queries,
+        corpus_ids=corpus_ids_ordered,
+        index_or_retriever=index_or_retriever,
+        fast_plaid_index_path=fast_plaid_index_path,
+        task_prompt=task_config.get("prompt"),
+    )
+
+    scores = evaluator.evaluate_results(results, qrels, k_values=k_values)
+
+    # Report only the numeric entries of the scores
+    print(f"\n{banner}")
+    print(f"Results for {task_name}:")
+    print(f"{banner}")
+    for metric, value in scores.items():
+        if isinstance(value, (int, float)):
+            print(f"  {metric}: {value:.5f}")
+
+    # Persist raw results and scores when an output directory is given
+    if output_dir:
+        os.makedirs(output_dir, exist_ok=True)
+        results_file = os.path.join(output_dir, f"{task_name}_results.json")
+        with open(results_file, "w") as f:
+            json.dump(results, f, indent=2)
+        print(f"\nSaved results to: {results_file}")
+        scores_file = os.path.join(output_dir, f"{task_name}_scores.json")
+        with open(scores_file, "w") as f:
+            json.dump(scores, f, indent=2)
+        print(f"Saved scores to: {scores_file}")
+
+    return scores
+
+
+def main():
+    """Parse CLI arguments, evaluate the selected ViDoRe v1 tasks and print a summary."""
+    parser = argparse.ArgumentParser(
+        description="Evaluate ViDoRe v1 benchmark using LEANN/Fast-Plaid indexing"
+    )
+    parser.add_argument(
+        "--model",
+        type=str,
+        default="colqwen2",
+        choices=["colqwen2", "colpali"],
+        help="Model to use",
+    )
+    parser.add_argument(
+        "--task",
+        type=str,
+        default=None,
+        help="Specific task to evaluate (or 'all' for all tasks)",
+    )
+    parser.add_argument(
+        "--tasks",
+        type=str,
+        default="all",
+        help="Tasks to evaluate: 'all' or comma-separated list",
+    )
+    parser.add_argument(
+        "--index-path",
+        type=str,
+        default=None,
+        help="Path to LEANN index (auto-generated if not provided)",
+    )
+    parser.add_argument(
+        "--use-fast-plaid",
+        action="store_true",
+        help="Use Fast-Plaid instead of LEANN",
+    )
+    parser.add_argument(
+        "--fast-plaid-index-path",
+        type=str,
+        default=None,
+        help="Path to Fast-Plaid index (auto-generated if not provided)",
+    )
+    parser.add_argument(
+        "--rebuild-index",
+        action="store_true",
+        help="Rebuild index even if it exists",
+    )
+    parser.add_argument(
+        "--top-k",
+        type=int,
+        default=1000,
+        help="Top-k results to retrieve (MTEB default is max(k_values)=1000)",
+    )
+    parser.add_argument(
+        "--first-stage-k",
+        type=int,
+        default=500,
+        help="First stage k for LEANN search",
+    )
+    parser.add_argument(
+        "--k-values",
+        type=str,
+        default="1,3,5,10,20,100,1000",
+        help="Comma-separated k values for evaluation (e.g., '1,3,5,10,100')",
+    )
+    parser.add_argument(
+        "--output-dir",
+        type=str,
+        default="./vidore_v1_results",
+        help="Output directory for results",
+    )
+
+    args = parser.parse_args()
+
+    # Parse k_values
+    k_values = [int(k.strip()) for k in args.k_values.split(",")]
+
+    # Determine tasks to evaluate. ``--task all`` is advertised by the option's help text,
+    # so it selects every task instead of being looked up as a task literally named "all".
+    if args.task and args.task.lower() != "all":
+        tasks_to_eval = [args.task]
+    elif args.task or args.tasks.lower() == "all":
+        tasks_to_eval = list(VIDORE_V1_TASKS.keys())
+    else:
+        tasks_to_eval = [t.strip() for t in args.tasks.split(",")]
+
+    print(f"Tasks to evaluate: {tasks_to_eval}")
+
+    # Evaluate each task
+    all_scores = {}
+    for task_name in tasks_to_eval:
+        try:
+            scores = evaluate_task(
+                task_name=task_name,
+                model_name=args.model,
+                index_path=args.index_path,
+                use_fast_plaid=args.use_fast_plaid,
+                fast_plaid_index_path=args.fast_plaid_index_path,
+                rebuild_index=args.rebuild_index,
+                top_k=args.top_k,
+                first_stage_k=args.first_stage_k,
+                k_values=k_values,
+                output_dir=args.output_dir,
+            )
+            all_scores[task_name] = scores
+        except Exception as e:  # best effort: report the failure and move on to the next task
+            print(f"\nError evaluating {task_name}: {e}")
+            import traceback
+            traceback.print_exc()
+
+    # Print summary
+    if all_scores:
+        print(f"\n{'=' * 80}")
+        print("SUMMARY")
+        print(f"{'=' * 80}")
+        for task_name, scores in all_scores.items():
+            print(f"\n{task_name}:")
+            # Print main metrics
+            for metric in ["ndcg_at_5", "ndcg_at_10", "ndcg_at_100", "map_at_10", "recall_at_10"]:
+                if metric in scores:
+                    print(f"  {metric}: {scores[metric]:.5f}")
+
+
+if __name__ == "__main__":
+    main()
--- a/apps/multimodal/vision-based-pdf-multi-vector/vidore_v2_benchmark.py
+++ b/apps/multimodal/vision-based-pdf-multi-vector/vidore_v2_benchmark.py
@@ -0,0 +1,439 @@
+#!/usr/bin/env python3
+"""
+Modular script to reproduce NDCG results for ViDoRe v2 benchmark.
+
+This script uses the interface from leann_multi_vector.py to:
+1. Download ViDoRe v2 datasets
+2. Build indexes (LEANN or Fast-Plaid)
+3. Perform retrieval
+4. Evaluate using NDCG metrics
+
+Usage:
+    # Evaluate all ViDoRe v2 tasks
+    python vidore_v2_benchmark.py --model colqwen2 --tasks all
+
+    # Evaluate specific task
+    python vidore_v2_benchmark.py --model colqwen2 --task Vidore2ESGReportsRetrieval
+
+    # Use Fast-Plaid index
+    python vidore_v2_benchmark.py --model colqwen2 --use-fast-plaid --fast-plaid-index-path ./indexes/vidore_fastplaid
+
+    # Rebuild index
+    python vidore_v2_benchmark.py --model colqwen2 --rebuild-index
+"""
+
+import argparse
+import json
+import os
+from typing import Optional
+
+from datasets import load_dataset
+from leann_multi_vector import (
+    ViDoReBenchmarkEvaluator,
+    _ensure_repo_paths_importable,
+)
+
+_ensure_repo_paths_importable(__file__)
+
+# Language name to dataset language field value mapping
+# Dataset uses ISO 639-3 + ISO 15924 format (e.g., "eng-Latn")
+LANGUAGE_MAPPING = {
+    "english": "eng-Latn",
+    "french": "fra-Latn",
+    "spanish": "spa-Latn",
+    "german": "deu-Latn",
+}
+
+# ViDoRe v2 task configurations
+# Prompts match MTEB task metadata prompts
+VIDORE_V2_TASKS = {
+    "Vidore2ESGReportsRetrieval": {
+        "dataset_path": "vidore/esg_reports_v2",
+        "revision": "0542c0d03da0ec1c8cbc517c8d78e7e95c75d3d3",
+        "languages": ["french", "spanish", "english", "german"],
+        "prompt": {"query": "Find a screenshot that relevant to the user's question."},
+    },
+    "Vidore2EconomicsReportsRetrieval": {
+        "dataset_path": "vidore/economics_reports_v2",
+        "revision": "b3e3a04b07fbbaffe79be49dabf92f691fbca252",
+        "languages": ["french", "spanish", "english", "german"],
+        "prompt": {"query": "Find a screenshot that relevant to the user's question."},
+    },
+    "Vidore2BioMedicalLecturesRetrieval": {
+        "dataset_path": "vidore/biomedical_lectures_v2",
+        "revision": "a29202f0da409034d651614d87cd8938d254e2ea",
+        "languages": ["french", "spanish", "english", "german"],
+        "prompt": {"query": "Find a screenshot that relevant to the user's question."},
+    },
+    "Vidore2ESGReportsHLRetrieval": {
+        "dataset_path": "vidore/esg_reports_human_labeled_v2",
+        "revision": "6d467dedb09a75144ede1421747e47cf036857dd",
+        # Note: This dataset doesn't have language filtering - all queries are English
+        "languages": None,  # No language filtering needed
+        "prompt": {"query": "Find a screenshot that relevant to the user's question."},
+    },
+}
+
+
+def load_vidore_v2_data(
+    dataset_path: str,
+    revision: Optional[str] = None,
+    split: str = "test",
+    language: Optional[str] = None,
+):
+    """
+    Load ViDoRe v2 dataset.
+
+    Returns:
+        corpus: dict mapping corpus_id to PIL Image
+        queries: dict mapping query_id to query text
+        qrels: dict mapping query_id to dict of {corpus_id: relevance_score}
+    """
+    print(f"Loading dataset: {dataset_path} (split={split}, language={language})")
+
+    # Load queries
+    query_ds = load_dataset(dataset_path, "queries", split=split, revision=revision)
+
+    # Check if dataset has language field before filtering
+    has_language_field = len(query_ds) > 0 and "language" in query_ds.column_names
+
+    if language and has_language_field:
+        # Map language name to dataset language field value (e.g., "english" -> "eng-Latn")
+        dataset_language = LANGUAGE_MAPPING.get(language, language)
+        query_ds_filtered = query_ds.filter(lambda x: x.get("language") == dataset_language)
+        # Check if filtering resulted in empty dataset
+        if len(query_ds_filtered) == 0:
+            print(
+                f"Warning: No queries found after filtering by language '{language}' (mapped to '{dataset_language}')."
+            )
+            # Try with original language value (dataset might use simple names like 'english')
+            print(f"Trying with original language value '{language}'...")
+            query_ds_filtered = query_ds.filter(lambda x: x.get("language") == language)
+            if len(query_ds_filtered) == 0:
+                # Reload the queries split to report the language values actually present
+                try:
+                    sample_ds = load_dataset(
+                        dataset_path, "queries", split=split, revision=revision
+                    )
+                    if len(sample_ds) > 0 and "language" in sample_ds.column_names:
+                        sample_langs = set(sample_ds["language"])
+                        print(f"Available language values in dataset: {sample_langs}")
+                except Exception:
+                    pass
+            else:
+                print(
+                    f"Found {len(query_ds_filtered)} queries using original language value '{language}'"
+                )
+        query_ds = query_ds_filtered
+
+    queries = {}
+    for row in query_ds:
+        query_id = f"query-{split}-{row['query-id']}"
+        queries[query_id] = row["query"]
+
+    # Load corpus (images)
+    corpus_ds = load_dataset(dataset_path, "corpus", split=split, revision=revision)
+
+    corpus = {}
+    for row in corpus_ds:
+        corpus_id = f"corpus-{split}-{row['corpus-id']}"
+        # Extract image from the dataset row
+        if "image" in row:
+            corpus[corpus_id] = row["image"]
+        elif "page_image" in row:
+            corpus[corpus_id] = row["page_image"]
+        else:
+            raise ValueError(
+                f"No image field found in corpus. Available fields: {list(row.keys())}"
+            )
+
+    # Load qrels (relevance judgments)
+    qrels_ds = load_dataset(dataset_path, "qrels", split=split, revision=revision)
+
+    qrels = {}
+    for row in qrels_ds:
+        query_id = f"query-{split}-{row['query-id']}"
+        corpus_id = f"corpus-{split}-{row['corpus-id']}"
+        if query_id not in qrels:
+            qrels[query_id] = {}
+        qrels[query_id][corpus_id] = int(row["score"])
+
+    print(
+        f"Loaded {len(queries)} queries, {len(corpus)} corpus items, {len(qrels)} query-relevance mappings"
+    )
+
+    # Filter qrels to only include queries that exist
+    qrels = {qid: rel_docs for qid, rel_docs in qrels.items() if qid in queries}
+
+    # Filter out queries without any relevant documents (matching MTEB behavior)
+    # This is important for correct NDCG calculation
+    qrels_filtered = {qid: rel_docs for qid, rel_docs in qrels.items() if len(rel_docs) > 0}
+    queries_filtered = {
+        qid: query_text for qid, query_text in queries.items() if qid in qrels_filtered
+    }
+
+    print(
+        f"After filtering queries without positives: {len(queries_filtered)} queries, {len(qrels_filtered)} query-relevance mappings"
+    )
+
+    return corpus, queries_filtered, qrels_filtered
+
+
+def evaluate_task(
+    task_name: str,
+    model_name: str,
+    index_path: str,
+    use_fast_plaid: bool = False,
+    fast_plaid_index_path: Optional[str] = None,
+    language: Optional[str] = None,
+    rebuild_index: bool = False,
+    top_k: int = 100,
+    first_stage_k: int = 500,
+    k_values: Optional[list[int]] = None,
+    output_dir: Optional[str] = None,
+):
+    """
+    Evaluate a single ViDoRe v2 task.
+    """
+    print(f"\n{'=' * 80}")
+    print(f"Evaluating task: {task_name}")
+    print(f"{'=' * 80}")
+
+    # Get task config
+    if task_name not in VIDORE_V2_TASKS:
+        raise ValueError(f"Unknown task: {task_name}. Available: {list(VIDORE_V2_TASKS.keys())}")
+
+    task_config = VIDORE_V2_TASKS[task_name]
+    dataset_path = task_config["dataset_path"]
+    revision = task_config["revision"]
+
+    # Determine language
+    if language is None:
+        # Filter only when the task lists exactly one language; otherwise keep all
+        languages = task_config.get("languages")
+        if languages is None:
+            # Task doesn't support language filtering (e.g., Vidore2ESGReportsHLRetrieval)
+            language = None
+        elif len(languages) == 1:
+            language = languages[0]
+        else:
+            language = None
+
+    # Initialize k_values if not provided
+    if k_values is None:
+        k_values = [1, 3, 5, 10, 100]
+
+    # Load data
+    corpus, queries, qrels = load_vidore_v2_data(
+        dataset_path=dataset_path,
+        revision=revision,
+        split="test",
+        language=language,
+    )
+
+    # Check if we have any queries
+    if len(queries) == 0:
+        print(
+            f"\nWarning: No queries found for task {task_name} with language {language}. Skipping evaluation."
+        )
+        # Return zero scores
+        scores = {}
+        for k in k_values:
+            scores[f"ndcg_at_{k}"] = 0.0
+            scores[f"map_at_{k}"] = 0.0
+            scores[f"recall_at_{k}"] = 0.0
+            scores[f"precision_at_{k}"] = 0.0
+            scores[f"mrr_at_{k}"] = 0.0
+        return scores
+
+    # Initialize evaluator
+    evaluator = ViDoReBenchmarkEvaluator(
+        model_name=model_name,
+        use_fast_plaid=use_fast_plaid,
+        top_k=top_k,
+        first_stage_k=first_stage_k,
+        k_values=k_values,
+    )
+
+    # Build or load index
+    index_path_full = index_path if not use_fast_plaid else fast_plaid_index_path
+    if index_path_full is None:
+        index_path_full = f"./indexes/{task_name}_{model_name}"
+        if use_fast_plaid:
+            index_path_full = f"./indexes/{task_name}_{model_name}_fastplaid"
+
+    index_or_retriever, corpus_ids_ordered = evaluator.build_index_from_corpus(
+        corpus=corpus,
+        index_path=index_path_full,
+        rebuild=rebuild_index,
+    )
+
+    # Search queries
+    task_prompt = task_config.get("prompt")
+    results = evaluator.search_queries(
+        queries=queries,
+        corpus_ids=corpus_ids_ordered,
+        index_or_retriever=index_or_retriever,
+        fast_plaid_index_path=fast_plaid_index_path,
+        task_prompt=task_prompt,
+    )
+
+    # Evaluate
+    scores = evaluator.evaluate_results(results, qrels, k_values=k_values)
+
+    # Print results
+    print(f"\n{'=' * 80}")
+    print(f"Results for {task_name}:")
+    print(f"{'=' * 80}")
+    for metric, value in scores.items():
+        if isinstance(value, (int, float)):
+            print(f"  {metric}: {value:.5f}")
+
+    # Save results
+    if output_dir:
+        os.makedirs(output_dir, exist_ok=True)
+        results_file = os.path.join(output_dir, f"{task_name}_results.json")
+        scores_file = os.path.join(output_dir, f"{task_name}_scores.json")
+
+        with open(results_file, "w") as f:
+            json.dump(results, f, indent=2)
+        print(f"\nSaved results to: {results_file}")
+
+        with open(scores_file, "w") as f:
+            json.dump(scores, f, indent=2)
+        print(f"Saved scores to: {scores_file}")
+
+    return scores
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Evaluate ViDoRe v2 benchmark using LEANN/Fast-Plaid indexing"
+    )
+    parser.add_argument(
+        "--model",
+        type=str,
+        default="colqwen2",
+        choices=["colqwen2", "colpali"],
+        help="Model to use",
+    )
+    parser.add_argument(
+        "--task",
+        type=str,
+        default=None,
+        help="Specific task to evaluate (or 'all' for all tasks)",
+    )
+    parser.add_argument(
+        "--tasks",
+        type=str,
+        default="all",
+        help="Tasks to evaluate: 'all' or comma-separated list",
+    )
+    parser.add_argument(
+        "--index-path",
+        type=str,
+        default=None,
+        help="Path to LEANN index (auto-generated if not provided)",
+    )
+    parser.add_argument(
+        "--use-fast-plaid",
+        action="store_true",
+        help="Use Fast-Plaid instead of LEANN",
+    )
+    parser.add_argument(
+        "--fast-plaid-index-path",
+        type=str,
+        default=None,
+        help="Path to Fast-Plaid index (auto-generated if not provided)",
+    )
+    parser.add_argument(
+        "--rebuild-index",
+        action="store_true",
+        help="Rebuild index even if it exists",
+    )
+    parser.add_argument(
+        "--language",
+        type=str,
+        default=None,
+        help="Language to evaluate (default: first available)",
+    )
+    parser.add_argument(
+        "--top-k",
+        type=int,
+        default=100,
+        help="Top-k results to retrieve",
+    )
+    parser.add_argument(
+        "--first-stage-k",
+        type=int,
+        default=500,
+        help="First stage k for LEANN search",
+    )
+    parser.add_argument(
+        "--k-values",
+        type=str,
+        default="1,3,5,10,100",
+        help="Comma-separated k values for evaluation (e.g., '1,3,5,10,100')",
+    )
+    parser.add_argument(
+        "--output-dir",
+        type=str,
+        default="./vidore_v2_results",
+        help="Output directory for results",
+    )
+
+    args = parser.parse_args()
+
+    # Parse k_values
+    k_values = [int(k.strip()) for k in args.k_values.split(",")]
+
+    # Determine tasks to evaluate
+    if args.task:
+        tasks_to_eval = [args.task]
+    elif args.tasks.lower() == "all":
+        tasks_to_eval = list(VIDORE_V2_TASKS.keys())
+    else:
+        tasks_to_eval = [t.strip() for t in args.tasks.split(",")]
+
+    print(f"Tasks to evaluate: {tasks_to_eval}")
+
+    # Evaluate each task
+    all_scores = {}
+    for task_name in tasks_to_eval:
+        try:
+            scores = evaluate_task(
+                task_name=task_name,
+                model_name=args.model,
+                index_path=args.index_path,
+                use_fast_plaid=args.use_fast_plaid,
+                fast_plaid_index_path=args.fast_plaid_index_path,
+                language=args.language,
+                rebuild_index=args.rebuild_index,
+                top_k=args.top_k,
+                first_stage_k=args.first_stage_k,
+                k_values=k_values,
+                output_dir=args.output_dir,
+            )
+            all_scores[task_name] = scores
+        except Exception as e:
+            print(f"\nError evaluating {task_name}: {e}")
+            import traceback
+
+            traceback.print_exc()
+            continue
+
+    # Print summary
+    if all_scores:
+        print(f"\n{'=' * 80}")
+        print("SUMMARY")
+        print(f"{'=' * 80}")
+        for task_name, scores in all_scores.items():
+            print(f"\n{task_name}:")
+            # Print main metrics
+            for metric in ["ndcg_at_5", "ndcg_at_10", "ndcg_at_100", "map_at_10", "recall_at_10"]:
+                if metric in scores:
+                    print(f"  {metric}: {scores[metric]:.5f}")
+
+
+if __name__ == "__main__":
+    main()
--- a/packages/leann-core/src/leann/chunking_utils.py
+++ b/packages/leann-core/src/leann/chunking_utils.py
@@ -5,12 +5,15 @@ Packaged within leann-core so installed wheels can import it reliably.

 import logging
 from pathlib import Path
-from typing import Optional
+from typing import Any, Optional

 from llama_index.core.node_parser import SentenceSplitter

 logger = logging.getLogger(__name__)

+# Flag ensuring the AST token warning is shown only once per session
+_ast_token_warning_shown = False
+

 def estimate_token_count(text: str) -> int:
    """
@@ -174,37 +177,44 @@ def create_ast_chunks(
    max_chunk_size: int = 512,
    chunk_overlap: int = 64,
    metadata_template: str = "default",
-) -> list[str]:
+) -> list[dict[str, Any]]:
    """Create AST-aware chunks from code documents using astchunk.

    Falls back to traditional chunking if astchunk is unavailable.
+
+    Returns:
+        List of dicts with {"text": str, "metadata": dict}
    """
    try:
        from astchunk import ASTChunkBuilder  # optional dependency
    except ImportError as e:
        logger.error(f"astchunk not available: {e}")
        logger.info("Falling back to traditional chunking for code files")
-        return create_traditional_chunks(documents, max_chunk_size, chunk_overlap)
+        return _traditional_chunks_as_dicts(documents, max_chunk_size, chunk_overlap)

    all_chunks = []
    for doc in documents:
        language = doc.metadata.get("language")
        if not language:
            logger.warning("No language detected; falling back to traditional chunking")
-            all_chunks.extend(create_traditional_chunks([doc], max_chunk_size, chunk_overlap))
+            all_chunks.extend(_traditional_chunks_as_dicts([doc], max_chunk_size, chunk_overlap))
            continue

        try:
-            # Warn if AST chunk size + overlap might exceed common token limits
+            # Warn once if AST chunk size + overlap might exceed common token limits
+            # Note: Actual truncation happens at embedding time with dynamic model limits
+            global _ast_token_warning_shown
            estimated_max_tokens = int(
                (max_chunk_size + chunk_overlap) * 1.2
            )  # Conservative estimate
-            if estimated_max_tokens > 512:
+            if estimated_max_tokens > 512 and not _ast_token_warning_shown:
                logger.warning(
                    f"AST chunk size ({max_chunk_size}) + overlap ({chunk_overlap}) = {max_chunk_size + chunk_overlap} chars "
                    f"may exceed 512 token limit (~{estimated_max_tokens} tokens estimated). "
-                    f"Consider reducing --ast-chunk-size to {int(400 / 1.2)} or --ast-chunk-overlap to {int(50 / 1.2)}"
+                    f"Consider reducing --ast-chunk-size to {int(400 / 1.2)} or --ast-chunk-overlap to {int(50 / 1.2)}. "
+                    f"Note: Chunks will be auto-truncated at embedding time based on your model's actual token limit."
                )
+                _ast_token_warning_shown = True

            configs = {
                "max_chunk_size": max_chunk_size,
@@ -229,17 +239,40 @@ def create_ast_chunks(

            chunks = chunk_builder.chunkify(code_content)
            for chunk in chunks:
+                chunk_text = None
+                astchunk_metadata = {}
+
                if hasattr(chunk, "text"):
                    chunk_text = chunk.text
-                elif isinstance(chunk, dict) and "text" in chunk:
-                    chunk_text = chunk["text"]
                elif isinstance(chunk, str):
                    chunk_text = chunk
+                elif isinstance(chunk, dict):
+                    # Handle astchunk format: {"content": "...", "metadata": {...}}
+                    if "content" in chunk:
+                        chunk_text = chunk["content"]
+                        astchunk_metadata = chunk.get("metadata", {})
+                    elif "text" in chunk:
+                        chunk_text = chunk["text"]
+                    else:
+                        chunk_text = str(chunk)  # Last resort
                else:
                    chunk_text = str(chunk)

                if chunk_text and chunk_text.strip():
-                    all_chunks.append(chunk_text.strip())
+                    # Extract document-level metadata
+                    doc_metadata = {
+                        "file_path": doc.metadata.get("file_path", ""),
+                        "file_name": doc.metadata.get("file_name", ""),
+                    }
+                    if "creation_date" in doc.metadata:
+                        doc_metadata["creation_date"] = doc.metadata["creation_date"]
+                    if "last_modified_date" in doc.metadata:
+                        doc_metadata["last_modified_date"] = doc.metadata["last_modified_date"]
+
+                    # Merge document metadata + astchunk metadata
+                    combined_metadata = {**doc_metadata, **astchunk_metadata}
+
+                    all_chunks.append({"text": chunk_text.strip(), "metadata": combined_metadata})

            logger.info(
                f"Created {len(chunks)} AST chunks from {language} file: {doc.metadata.get('file_name', 'unknown')}"
@@ -247,15 +280,19 @@ def create_ast_chunks(
        except Exception as e:
            logger.warning(f"AST chunking failed for {language} file: {e}")
            logger.info("Falling back to traditional chunking")
-            all_chunks.extend(create_traditional_chunks([doc], max_chunk_size, chunk_overlap))
+            all_chunks.extend(_traditional_chunks_as_dicts([doc], max_chunk_size, chunk_overlap))

    return all_chunks


 def create_traditional_chunks(
    documents, chunk_size: int = 256, chunk_overlap: int = 128
-) -> list[str]:
-    """Create traditional text chunks using LlamaIndex SentenceSplitter."""
+) -> list[dict[str, Any]]:
+    """Create traditional text chunks using LlamaIndex SentenceSplitter.
+
+    Returns:
+        List of dicts with {"text": str, "metadata": dict}
+    """
    if chunk_size <= 0:
        logger.warning(f"Invalid chunk_size={chunk_size}, using default value of 256")
        chunk_size = 256
@@ -271,19 +308,40 @@ def create_traditional_chunks(
        paragraph_separator="\n\n",
    )

-    all_texts = []
+    result = []
    for doc in documents:
+        # Extract document-level metadata
+        doc_metadata = {
+            "file_path": doc.metadata.get("file_path", ""),
+            "file_name": doc.metadata.get("file_name", ""),
+        }
+        if "creation_date" in doc.metadata:
+            doc_metadata["creation_date"] = doc.metadata["creation_date"]
+        if "last_modified_date" in doc.metadata:
+            doc_metadata["last_modified_date"] = doc.metadata["last_modified_date"]
+
        try:
            nodes = node_parser.get_nodes_from_documents([doc])
            if nodes:
-                all_texts.extend(node.get_content() for node in nodes)
+                for node in nodes:
+                    result.append({"text": node.get_content(), "metadata": doc_metadata})
        except Exception as e:
            logger.error(f"Traditional chunking failed for document: {e}")
            content = doc.get_content()
            if content and content.strip():
-                all_texts.append(content.strip())
+                result.append({"text": content.strip(), "metadata": doc_metadata})

-    return all_texts
+    return result
+
+
+def _traditional_chunks_as_dicts(
+    documents, chunk_size: int = 256, chunk_overlap: int = 128
+) -> list[dict[str, Any]]:
+    """Helper: Traditional chunking that returns dict format for consistency.
+
+    This is now just an alias for create_traditional_chunks for backwards compatibility.
+    """
+    return create_traditional_chunks(documents, chunk_size, chunk_overlap)


 def create_text_chunks(
@@ -295,8 +353,12 @@ def create_text_chunks(
    ast_chunk_overlap: int = 64,
    code_file_extensions: Optional[list[str]] = None,
    ast_fallback_traditional: bool = True,
-) -> list[str]:
-    """Create text chunks from documents with optional AST support for code files."""
+) -> list[dict[str, Any]]:
+    """Create text chunks from documents with optional AST support for code files.
+
+    Returns:
+        List of dicts with {"text": str, "metadata": dict}
+    """
    if not documents:
        logger.warning("No documents provided for chunking")
        return []
@@ -331,24 +393,17 @@ def create_text_chunks(
                logger.error(f"AST chunking failed: {e}")
                if ast_fallback_traditional:
                    all_chunks.extend(
-                        create_traditional_chunks(code_docs, chunk_size, chunk_overlap)
+                        _traditional_chunks_as_dicts(code_docs, chunk_size, chunk_overlap)
                    )
                else:
                    raise
        if text_docs:
-            all_chunks.extend(create_traditional_chunks(text_docs, chunk_size, chunk_overlap))
+            all_chunks.extend(_traditional_chunks_as_dicts(text_docs, chunk_size, chunk_overlap))
    else:
-        all_chunks = create_traditional_chunks(documents, chunk_size, chunk_overlap)
+        all_chunks = _traditional_chunks_as_dicts(documents, chunk_size, chunk_overlap)

    logger.info(f"Total chunks created: {len(all_chunks)}")

-    # Validate chunk token limits (default to 512 for safety)
-    # This provides a safety net for embedding models with token limits
-    validated_chunks, num_truncated = validate_chunk_token_limits(all_chunks, max_tokens=512)
-
-    if num_truncated > 0:
-        logger.info(
-            f"Post-chunking validation: {num_truncated} chunks were truncated to fit 512 token limit"
-        )
-
-    return validated_chunks
+    # Note: Token truncation is now handled at embedding time with dynamic model limits
+    # See get_model_token_limit() and truncate_to_token_limit() in embedding_compute.py
+    return all_chunks
--- a/packages/leann-core/src/leann/cli.py
+++ b/packages/leann-core/src/leann/cli.py
@@ -1279,13 +1279,8 @@ Examples:
                    ast_fallback_traditional=getattr(args, "ast_fallback_traditional", True),
                )

-                # Note: AST chunking currently returns plain text chunks without metadata
-                # We preserve basic file info by associating chunks with their source documents
-                # For better metadata preservation, documents list order should be maintained
-                for chunk_text in chunk_texts:
-                    # TODO: Enhance create_text_chunks to return metadata alongside text
-                    # For now, we store chunks with empty metadata
-                    all_texts.append({"text": chunk_text, "metadata": {}})
+                # create_text_chunks now returns list[dict] with metadata preserved
+                all_texts.extend(chunk_texts)

            except ImportError as e:
                print(
--- a/packages/leann-core/src/leann/embedding_compute.py
+++ b/packages/leann-core/src/leann/embedding_compute.py
@@ -10,72 +10,63 @@ import time
 from typing import Any, Optional

 import numpy as np
+import tiktoken
 import torch

 from .settings import resolve_ollama_host, resolve_openai_api_key, resolve_openai_base_url

+# Set up logger with proper level
+logger = logging.getLogger(__name__)
+LOG_LEVEL = os.getenv("LEANN_LOG_LEVEL", "WARNING").upper()
+log_level = getattr(logging, LOG_LEVEL, logging.WARNING)
+logger.setLevel(log_level)

-def truncate_to_token_limit(texts: list[str], max_tokens: int = 512) -> list[str]:
-    """
-    Truncate texts to token limit using tiktoken or conservative character truncation.
-
-    Args:
-        texts: List of texts to truncate
-        max_tokens: Maximum tokens allowed per text
-
-    Returns:
-        List of truncated texts that should fit within token limit
-    """
-    try:
-        import tiktoken
-
-        encoder = tiktoken.get_encoding("cl100k_base")
-        truncated = []
-
-        for text in texts:
-            tokens = encoder.encode(text)
-            if len(tokens) > max_tokens:
-                # Truncate to max_tokens and decode back to text
-                truncated_tokens = tokens[:max_tokens]
-                truncated_text = encoder.decode(truncated_tokens)
-                truncated.append(truncated_text)
-                logger.warning(
-                    f"Truncated text from {len(tokens)} to {max_tokens} tokens "
-                    f"(from {len(text)} to {len(truncated_text)} characters)"
-                )
-            else:
-                truncated.append(text)
-        return truncated
-
-    except ImportError:
-        # Fallback: Conservative character truncation
-        # Assume worst case: 1.5 tokens per character for code content
-        char_limit = int(max_tokens / 1.5)
-        truncated = []
-
-        for text in texts:
-            if len(text) > char_limit:
-                truncated_text = text[:char_limit]
-                truncated.append(truncated_text)
-                logger.warning(
-                    f"Truncated text from {len(text)} to {char_limit} characters "
-                    f"(conservative estimate for {max_tokens} tokens)"
-                )
-            else:
-                truncated.append(text)
-        return truncated
+# Token limit registry for embedding models
+# Used as fallback when dynamic discovery fails (e.g., LM Studio, OpenAI)
+# Ollama models use dynamic discovery via /api/show
+EMBEDDING_MODEL_LIMITS = {
+    # Nomic models (common across servers)
+    "nomic-embed-text": 2048,  # Corrected from 512 - verified via /api/show
+    "nomic-embed-text-v1.5": 2048,
+    "nomic-embed-text-v2": 512,
+    # Other embedding models
+    "mxbai-embed-large": 512,
+    "all-minilm": 512,
+    "bge-m3": 8192,
+    "snowflake-arctic-embed": 512,
+    # OpenAI models
+    "text-embedding-3-small": 8192,
+    "text-embedding-3-large": 8192,
+    "text-embedding-ada-002": 8192,
+}


-def get_model_token_limit(model_name: str) -> int:
+def get_model_token_limit(
+    model_name: str,
+    base_url: Optional[str] = None,
+    default: int = 2048,
+) -> int:
    """
    Get token limit for a given embedding model.
+    Uses hybrid approach: dynamic discovery for Ollama, registry fallback for others.

    Args:
        model_name: Name of the embedding model
+        base_url: Base URL of the embedding server (for dynamic discovery)
+        default: Default token limit if model not found

    Returns:
-        Token limit for the model, defaults to 512 if unknown
+        Token limit for the model in tokens
    """
+    # Try Ollama dynamic discovery if base_url provided
+    if base_url:
+        # Detect Ollama servers by port or "ollama" in URL
+        if "11434" in base_url or "ollama" in base_url.lower():
+            limit = _query_ollama_context_limit(model_name, base_url)
+            if limit:
+                return limit
+
+    # Fallback to known model registry with version handling (from PR #154)
    # Handle versioned model names (e.g., "nomic-embed-text:latest" -> "nomic-embed-text")
    base_model_name = model_name.split(":")[0]

@@ -92,31 +83,111 @@ def get_model_token_limit(model_name: str) -> int:
        if known_model in base_model_name or base_model_name in known_model:
            return limit

-    # Default to conservative 512 token limit
-    logger.warning(f"Unknown model '{model_name}', using default 512 token limit")
-    return 512
+    # Default fallback
+    logger.warning(f"Unknown model '{model_name}', using default {default} token limit")
+    return default


-# Set up logger with proper level
-logger = logging.getLogger(__name__)
-LOG_LEVEL = os.getenv("LEANN_LOG_LEVEL", "WARNING").upper()
-log_level = getattr(logging, LOG_LEVEL, logging.WARNING)
-logger.setLevel(log_level)
+def truncate_to_token_limit(texts: list[str], token_limit: int) -> list[str]:
+    """
+    Truncate texts to fit within token limit using tiktoken.
+
+    Tokens are always counted with the cl100k_base encoding, whatever the target model.
+    Special-token markers in the input (e.g. "<|endoftext|>") are encoded as plain text.
+
+    Args:
+        texts: List of text strings to truncate
+        token_limit: Maximum number of tokens allowed
+
+    Returns:
+        List of truncated texts (same length as input)
+    """
+    if not texts:
+        return []
+
+    enc = tiktoken.get_encoding("cl100k_base")
+
+    truncated_texts = []
+    truncation_count = 0
+    total_tokens_removed = 0
+    max_original_length = 0
+
+    for i, text in enumerate(texts):
+        # disallowed_special=() stops tiktoken raising ValueError when a document
+        # merely contains special-token text such as "<|endoftext|>".
+        tokens = enc.encode(text, disallowed_special=())
+        original_length = len(tokens)
+
+        if original_length <= token_limit:
+            truncated_texts.append(text)
+        else:
+            truncated_texts.append(enc.decode(tokens[:token_limit]))
+
+            # Track truncation statistics
+            truncation_count += 1
+            tokens_removed = original_length - token_limit
+            total_tokens_removed += tokens_removed
+            max_original_length = max(max_original_length, original_length)
+
+            # Log individual truncation at WARNING level (first few only)
+            if truncation_count <= 3:
+                logger.warning(
+                    f"Text {i + 1} truncated: {original_length} → {token_limit} tokens "
+                    f"({tokens_removed} tokens removed)"
+                )
+            elif truncation_count == 4:
+                logger.warning("Further truncation warnings suppressed...")
+
+    # Summary: WARNING if anything was truncated, DEBUG otherwise
+    if truncation_count > 0:
+        logger.warning(
+            f"Truncation summary: {truncation_count}/{len(texts)} texts truncated "
+            f"(removed {total_tokens_removed} tokens total, longest was {max_original_length} tokens)"
+        )
+    else:
+        logger.debug(
+            f"No truncation needed - all {len(texts)} texts within {token_limit} token limit"
+        )
+
+    return truncated_texts
+
+
+def _query_ollama_context_limit(model_name: str, base_url: str) -> Optional[int]:
+    """
+    Query Ollama /api/show for model context limit.
+
+    Args:
+        model_name: Name of the Ollama model
+        base_url: Base URL of the Ollama server (a trailing "/" is tolerated)
+
+    Returns:
+        Positive context limit in tokens if found, None otherwise (errors are only logged)
+    """
+    try:
+        import requests
+
+        response = requests.post(
+            f"{base_url.rstrip('/')}/api/show",  # never request "//api/show"
+            json={"name": model_name},
+            timeout=5,
+        )
+        if response.status_code == 200:
+            data = response.json()
+            if "model_info" in data:
+                # Look for a positive *.context_length entry in model_info
+                for key, value in data["model_info"].items():
+                    if "context_length" in key and isinstance(value, int) and value > 0:
+                        logger.info(f"Detected {model_name} context limit: {value} tokens")
+                        return value
+    except Exception as e:
+        logger.debug(f"Failed to query Ollama context limit: {e}")
+
+    return None
+

 # Global model cache to avoid repeated loading
 _model_cache: dict[str, Any] = {}

-# Known embedding model token limits
-EMBEDDING_MODEL_LIMITS = {
-    "nomic-embed-text": 512,
-    "nomic-embed-text-v2": 512,
-    "mxbai-embed-large": 512,
-    "all-minilm": 512,
-    "bge-m3": 8192,
-    "snowflake-arctic-embed": 512,
-    # Add more models as needed
-}
-

 def compute_embeddings(
    texts: list[str],
@@ -814,15 +885,13 @@ def compute_embeddings_ollama(

    logger.info(f"Using batch size: {batch_size} for true batch processing")

-    # Get model token limit and apply truncation
-    token_limit = get_model_token_limit(model_name)
+    # Get model token limit and apply truncation before batching
+    token_limit = get_model_token_limit(model_name, base_url=resolved_host)
    logger.info(f"Model '{model_name}' token limit: {token_limit}")

-    # Apply token-aware truncation to all texts
-    truncated_texts = truncate_to_token_limit(texts, token_limit)
-    if len(truncated_texts) != len(texts):
-        logger.error("Truncation failed - text count mismatch")
-        truncated_texts = texts  # Fallback to original texts
+    # Apply truncation to all texts before batch processing
+    # Function logs truncation details internally
+    texts = truncate_to_token_limit(texts, token_limit)

    def get_batch_embeddings(batch_texts):
        """Get embeddings for a batch of texts using /api/embed endpoint."""
@@ -880,12 +949,12 @@ def compute_embeddings_ollama(

        return None, list(range(len(batch_texts)))

-    # Process truncated texts in batches
+    # Process texts in batches
    all_embeddings = []
    all_failed_indices = []

    # Setup progress bar if needed
-    show_progress = is_build or len(truncated_texts) > 10
+    show_progress = is_build or len(texts) > 10
    try:
        if show_progress:
            from tqdm import tqdm
@@ -893,7 +962,7 @@ def compute_embeddings_ollama(
        show_progress = False

    # Process batches
-    num_batches = (len(truncated_texts) + batch_size - 1) // batch_size
+    num_batches = (len(texts) + batch_size - 1) // batch_size

    if show_progress:
        batch_iterator = tqdm(range(num_batches), desc="Computing Ollama embeddings (batched)")
@@ -902,8 +971,8 @@ def compute_embeddings_ollama(

    for batch_idx in batch_iterator:
        start_idx = batch_idx * batch_size
-        end_idx = min(start_idx + batch_size, len(truncated_texts))
-        batch_texts = truncated_texts[start_idx:end_idx]
+        end_idx = min(start_idx + batch_size, len(texts))
+        batch_texts = texts[start_idx:end_idx]

        batch_embeddings, batch_failed = get_batch_embeddings(batch_texts)

@@ -918,11 +987,11 @@ def compute_embeddings_ollama(

    # Handle failed embeddings
    if all_failed_indices:
-        if len(all_failed_indices) == len(truncated_texts):
+        if len(all_failed_indices) == len(texts):
            raise RuntimeError("Failed to compute any embeddings")

        logger.warning(
-            f"Failed to compute embeddings for {len(all_failed_indices)}/{len(truncated_texts)} texts"
+            f"Failed to compute embeddings for {len(all_failed_indices)}/{len(texts)} texts"
        )

        # Use zero embeddings as fallback for failed ones
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -57,6 +57,8 @@ dependencies = [
    "tree-sitter-c-sharp>=0.20.0",
    "tree-sitter-typescript>=0.20.0",
    "torchvision>=0.23.0",
+    "einops",
+    "seaborn",
 ]

 [project.optional-dependencies]
--- a/tests/test_astchunk_integration.py
+++ b/tests/test_astchunk_integration.py
@@ -8,7 +8,7 @@ import subprocess
 import sys
 import tempfile
 from pathlib import Path
-from unittest.mock import patch
+from unittest.mock import Mock, patch

 import pytest

@@ -116,8 +116,10 @@ class TestChunkingFunctions:
        chunks = create_traditional_chunks(docs, chunk_size=50, chunk_overlap=10)

        assert len(chunks) > 0
-        assert all(isinstance(chunk, str) for chunk in chunks)
-        assert all(len(chunk.strip()) > 0 for chunk in chunks)
+        # Traditional chunks now return dict format for consistency
+        assert all(isinstance(chunk, dict) for chunk in chunks)
+        assert all("text" in chunk and "metadata" in chunk for chunk in chunks)
+        assert all(len(chunk["text"].strip()) > 0 for chunk in chunks)

    def test_create_traditional_chunks_empty_docs(self):
        """Test traditional chunking with empty documents."""
@@ -158,11 +160,22 @@ class Calculator:

            # Should have multiple chunks due to different functions/classes
            assert len(chunks) > 0
-            assert all(isinstance(chunk, str) for chunk in chunks)
-            assert all(len(chunk.strip()) > 0 for chunk in chunks)
+            # R3: Expect dict format with "text" and "metadata" keys
+            assert all(isinstance(chunk, dict) for chunk in chunks), "All chunks should be dicts"
+            assert all("text" in chunk and "metadata" in chunk for chunk in chunks), (
+                "Each chunk should have 'text' and 'metadata' keys"
+            )
+            assert all(len(chunk["text"].strip()) > 0 for chunk in chunks), (
+                "Each chunk text should be non-empty"
+            )
+
+            # Check metadata is present
+            assert all("file_path" in chunk["metadata"] for chunk in chunks), (
+                "Each chunk should have file_path metadata"
+            )

            # Check that code structure is somewhat preserved
-            combined_content = " ".join(chunks)
+            combined_content = " ".join([c["text"] for c in chunks])
            assert "def hello_world" in combined_content
            assert "class Calculator" in combined_content

@@ -194,7 +207,11 @@ class Calculator:
        chunks = create_text_chunks(docs, use_ast_chunking=False, chunk_size=50, chunk_overlap=10)

        assert len(chunks) > 0
-        assert all(isinstance(chunk, str) for chunk in chunks)
+        # R3: Traditional chunking should also return dict format for consistency
+        assert all(isinstance(chunk, dict) for chunk in chunks), "All chunks should be dicts"
+        assert all("text" in chunk and "metadata" in chunk for chunk in chunks), (
+            "Each chunk should have 'text' and 'metadata' keys"
+        )

    def test_create_text_chunks_ast_mode(self):
        """Test text chunking in AST mode."""
@@ -213,7 +230,11 @@ class Calculator:
        )

        assert len(chunks) > 0
-        assert all(isinstance(chunk, str) for chunk in chunks)
+        # R3: AST mode should also return dict format
+        assert all(isinstance(chunk, dict) for chunk in chunks), "All chunks should be dicts"
+        assert all("text" in chunk and "metadata" in chunk for chunk in chunks), (
+            "Each chunk should have 'text' and 'metadata' keys"
+        )

    def test_create_text_chunks_custom_extensions(self):
        """Test text chunking with custom code file extensions."""
@@ -353,6 +374,552 @@ class MathUtils:
                pytest.skip("Test timed out - likely due to model download in CI")


+class TestASTContentExtraction:
+    """Test AST content extraction bug fix.
+
+    These tests verify that astchunk's dict format with 'content' key is handled correctly,
+    and that the extraction logic doesn't fall through to stringifying entire dicts.
+    """
+
+    def test_extract_content_from_astchunk_dict(self):
+        """Test that astchunk dict format with 'content' key is handled correctly.
+
+        Bug: the old code checked for chunk["text"] but astchunk returns chunk["content"].
+        That caused fallthrough to str(chunk), stringifying the entire dict.
+
+        Regression test: it FAILS whenever the bug is present because:
+        - Buggy code stringifies the dict: "{'content': '...', 'metadata': {...}}"
+        - Fixed code extracts just the content value
+        """
+        # Mock the ASTChunkBuilder class
+        mock_builder = Mock()
+
+        # Astchunk returns this format
+        astchunk_format_chunk = {
+            "content": "def hello():\n    print('world')",
+            "metadata": {
+                "filepath": "test.py",
+                "line_count": 2,
+                "start_line_no": 0,
+                "end_line_no": 1,
+                "node_count": 1,
+            },
+        }
+        mock_builder.chunkify.return_value = [astchunk_format_chunk]
+
+        # Create mock document
+        doc = MockDocument(
+            "def hello():\n    print('world')", "/test/test.py", {"language": "python"}
+        )
+
+        # Mock the astchunk module and its ASTChunkBuilder class
+        mock_astchunk = Mock()
+        mock_astchunk.ASTChunkBuilder = Mock(return_value=mock_builder)
+
+        # Patch sys.modules to inject our mock before the import
+        with patch.dict("sys.modules", {"astchunk": mock_astchunk}):
+            # Call create_ast_chunks
+            chunks = create_ast_chunks([doc])
+
+        # R3: Should return dict format with proper metadata
+        assert len(chunks) > 0, "Should return at least one chunk"
+
+        # R3: Each chunk should be a dict
+        chunk = chunks[0]
+        assert isinstance(chunk, dict), "Chunk should be a dict"
+        assert "text" in chunk, "Chunk should have 'text' key"
+        assert "metadata" in chunk, "Chunk should have 'metadata' key"
+
+        chunk_text = chunk["text"]
+
+        # CRITICAL: Should NOT contain stringified dict markers in the text field
+        # These assertions will FAIL with current buggy code
+        assert "'content':" not in chunk_text, (
+            f"Chunk text contains stringified dict - extraction failed! Got: {chunk_text[:100]}..."
+        )
+        assert "'metadata':" not in chunk_text, (
+            "Chunk text contains stringified metadata - extraction failed! "
+            f"Got: {chunk_text[:100]}..."
+        )
+        assert "{" not in chunk_text or "def hello" in chunk_text.split("{")[0], (
+            "Chunk text appears to be a stringified dict"
+        )
+
+        # Should contain actual content
+        assert "def hello()" in chunk_text, "Should extract actual code content"
+        assert "print('world')" in chunk_text, "Should extract complete code content"
+
+        # R3: Should preserve astchunk metadata
+        assert "filepath" in chunk["metadata"] or "file_path" in chunk["metadata"], (
+            "Should preserve file path metadata"
+        )
+
+    def test_extract_text_key_fallback(self):
+        """Test that 'text' key still works for backward compatibility.
+
+        Some chunks might use 'text' instead of 'content' - ensure backward compatibility.
+        This test should PASS even with current code.
+        """
+        mock_builder = Mock()
+
+        # Some chunks might use "text" key
+        text_key_chunk = {"text": "def legacy_function():\n    return True"}
+        mock_builder.chunkify.return_value = [text_key_chunk]
+
+        # Create mock document
+        doc = MockDocument(
+            "def legacy_function():\n    return True", "/test/legacy.py", {"language": "python"}
+        )
+
+        # Mock the astchunk module
+        mock_astchunk = Mock()
+        mock_astchunk.ASTChunkBuilder = Mock(return_value=mock_builder)
+
+        with patch.dict("sys.modules", {"astchunk": mock_astchunk}):
+            # Call create_ast_chunks
+            chunks = create_ast_chunks([doc])
+
+        # R3: Should extract text correctly as dict format
+        assert len(chunks) > 0
+        chunk = chunks[0]
+        assert isinstance(chunk, dict), "Chunk should be a dict"
+        assert "text" in chunk, "Chunk should have 'text' key"
+
+        chunk_text = chunk["text"]
+
+        # Should NOT be stringified
+        assert "'text':" not in chunk_text, "Should not stringify dict with 'text' key"
+
+        # Should contain actual content
+        assert "def legacy_function()" in chunk_text
+        assert "return True" in chunk_text
+
+    def test_handles_string_chunks(self):
+        """Test that plain string chunks still work.
+
+        Some chunkers might return plain strings - verify these are preserved.
+        This test should PASS with current code.
+        """
+        mock_builder = Mock()
+
+        # Plain string chunk
+        plain_string_chunk = "def simple_function():\n    pass"
+        mock_builder.chunkify.return_value = [plain_string_chunk]
+
+        # Create mock document
+        doc = MockDocument(
+            "def simple_function():\n    pass", "/test/simple.py", {"language": "python"}
+        )
+
+        # Mock the astchunk module
+        mock_astchunk = Mock()
+        mock_astchunk.ASTChunkBuilder = Mock(return_value=mock_builder)
+
+        with patch.dict("sys.modules", {"astchunk": mock_astchunk}):
+            # Call create_ast_chunks
+            chunks = create_ast_chunks([doc])
+
+        # R3: Should wrap string in dict format
+        assert len(chunks) > 0
+        chunk = chunks[0]
+        assert isinstance(chunk, dict), "Even string chunks should be wrapped in dict"
+        assert "text" in chunk, "Chunk should have 'text' key"
+
+        chunk_text = chunk["text"]
+
+        assert chunk_text == plain_string_chunk.strip(), (
+            "Should preserve plain string chunk content"
+        )
+        assert "def simple_function()" in chunk_text
+        assert "pass" in chunk_text
+
+    def test_multiple_chunks_with_mixed_formats(self):
+        """Test handling of multiple chunks with different formats.
+
+        Real-world scenario: astchunk might return a mix of formats.
+        This test will FAIL if any chunk with 'content' key gets stringified.
+        """
+        mock_builder = Mock()
+
+        # Mix of formats
+        mixed_chunks = [
+            {"content": "def first():\n    return 1", "metadata": {"line_count": 2}},
+            "def second():\n    return 2",  # Plain string
+            {"text": "def third():\n    return 3"},  # Old format
+            {"content": "class MyClass:\n    pass", "metadata": {"node_count": 1}},
+        ]
+        mock_builder.chunkify.return_value = mixed_chunks
+
+        # Create mock document
+        code = "def first():\n    return 1\n\ndef second():\n    return 2\n\ndef third():\n    return 3\n\nclass MyClass:\n    pass"
+        doc = MockDocument(code, "/test/mixed.py", {"language": "python"})
+
+        # Mock the astchunk module
+        mock_astchunk = Mock()
+        mock_astchunk.ASTChunkBuilder = Mock(return_value=mock_builder)
+
+        with patch.dict("sys.modules", {"astchunk": mock_astchunk}):
+            # Call create_ast_chunks
+            chunks = create_ast_chunks([doc])
+
+        # R3: Should extract all chunks correctly as dicts
+        assert len(chunks) == 4, "Should extract all 4 chunks"
+
+        # Check each chunk
+        for i, chunk in enumerate(chunks):
+            assert isinstance(chunk, dict), f"Chunk {i} should be a dict"
+            assert "text" in chunk, f"Chunk {i} should have 'text' key"
+            assert "metadata" in chunk, f"Chunk {i} should have 'metadata' key"
+
+            chunk_text = chunk["text"]
+            # None should be stringified dicts
+            assert "'content':" not in chunk_text, f"Chunk {i} text is stringified (has 'content':)"
+            assert "'metadata':" not in chunk_text, (
+                f"Chunk {i} text is stringified (has 'metadata':)"
+            )
+            assert "'text':" not in chunk_text, f"Chunk {i} text is stringified (has 'text':)"
+
+        # Verify actual content is present
+        combined = "\n".join([c["text"] for c in chunks])
+        assert "def first()" in combined
+        assert "def second()" in combined
+        assert "def third()" in combined
+        assert "class MyClass:" in combined
+
+    def test_empty_content_value_handling(self):
+        """Test handling of chunks with empty content values.
+
+        Edge case: chunk has 'content' key but value is empty.
+        Should skip these chunks, not stringify them.
+        """
+        mock_builder = Mock()
+
+        chunks_with_empty = [
+            {"content": "", "metadata": {"line_count": 0}},  # Empty content
+            {"content": "   ", "metadata": {"line_count": 1}},  # Whitespace only
+            {"content": "def valid():\n    return True", "metadata": {"line_count": 2}},  # Valid
+        ]
+        mock_builder.chunkify.return_value = chunks_with_empty
+
+        doc = MockDocument(
+            "def valid():\n    return True", "/test/empty.py", {"language": "python"}
+        )
+
+        # Mock the astchunk module
+        mock_astchunk = Mock()
+        mock_astchunk.ASTChunkBuilder = Mock(return_value=mock_builder)
+
+        with patch.dict("sys.modules", {"astchunk": mock_astchunk}):
+            chunks = create_ast_chunks([doc])
+
+        # R3: Should only have the valid chunk (empty ones filtered out)
+        assert len(chunks) == 1, "Should filter out empty content chunks"
+
+        chunk = chunks[0]
+        assert isinstance(chunk, dict), "Chunk should be a dict"
+        assert "text" in chunk, "Chunk should have 'text' key"
+        assert "def valid()" in chunk["text"]
+
+        # Should not have stringified the empty dict
+        assert "'content': ''" not in chunk["text"]
+
+
+class TestASTMetadataPreservation:
+    """Test metadata preservation in AST chunk dictionaries.
+
+    R3: These tests define the contract for metadata preservation when returning
+    chunk dictionaries instead of plain strings. Each chunk dict should have:
+    - "text": str - the actual chunk content
+    - "metadata": dict - all metadata from document AND astchunk
+
+    These tests FAIL against any implementation that still returns list[str] (pre-G3).
+    """
+
+    def test_ast_chunks_preserve_file_metadata(self):
+        """Test that document metadata is preserved in chunk metadata.
+
+        This test verifies that all document-level metadata (file_path, file_name,
+        creation_date, last_modified_date) is included in each chunk's metadata dict.
+
+        This FAILS if create_ast_chunks returns list[str] instead of list[dict].
+        """
+        # Create mock document with rich metadata
+        python_code = '''
+def calculate_sum(numbers):
+    """Calculate sum of numbers."""
+    return sum(numbers)
+
+class DataProcessor:
+    """Process data records."""
+
+    def process(self, data):
+        return [x * 2 for x in data]
+'''
+        doc = MockDocument(
+            python_code,
+            file_path="/project/src/utils.py",
+            metadata={
+                "language": "python",
+                "file_path": "/project/src/utils.py",
+                "file_name": "utils.py",
+                "creation_date": "2024-01-15T10:30:00",
+                "last_modified_date": "2024-10-31T15:45:00",
+            },
+        )
+
+        # Mock astchunk to return chunks with metadata
+        mock_builder = Mock()
+        astchunk_chunks = [
+            {
+                "content": "def calculate_sum(numbers):\n    return sum(numbers)",
+                "metadata": {
+                    "filepath": "/project/src/utils.py",
+                    "line_count": 2,
+                    "start_line_no": 1,
+                    "end_line_no": 2,
+                    "node_count": 1,
+                },
+            },
+            {
+                "content": "class DataProcessor:\n    def process(self, data):\n        return [x * 2 for x in data]",
+                "metadata": {
+                    "filepath": "/project/src/utils.py",
+                    "line_count": 3,
+                    "start_line_no": 5,
+                    "end_line_no": 7,
+                    "node_count": 2,
+                },
+            },
+        ]
+        mock_builder.chunkify.return_value = astchunk_chunks
+
+        mock_astchunk = Mock()
+        mock_astchunk.ASTChunkBuilder = Mock(return_value=mock_builder)
+
+        with patch.dict("sys.modules", {"astchunk": mock_astchunk}):
+            chunks = create_ast_chunks([doc])
+
+        # CRITICAL: These assertions will FAIL with current list[str] return type
+        assert len(chunks) == 2, "Should return 2 chunks"
+
+        for i, chunk in enumerate(chunks):
+            # Structure assertions - WILL FAIL: current code returns strings
+            assert isinstance(chunk, dict), f"Chunk {i} should be dict, got {type(chunk)}"
+            assert "text" in chunk, f"Chunk {i} must have 'text' key"
+            assert "metadata" in chunk, f"Chunk {i} must have 'metadata' key"
+            assert isinstance(chunk["metadata"], dict), f"Chunk {i} metadata should be dict"
+
+            # Document metadata preservation - WILL FAIL
+            metadata = chunk["metadata"]
+            assert "file_path" in metadata, f"Chunk {i} should preserve file_path"
+            assert metadata["file_path"] == "/project/src/utils.py", (
+                f"Chunk {i} file_path incorrect"
+            )
+
+            assert "file_name" in metadata, f"Chunk {i} should preserve file_name"
+            assert metadata["file_name"] == "utils.py", f"Chunk {i} file_name incorrect"
+
+            assert "creation_date" in metadata, f"Chunk {i} should preserve creation_date"
+            assert metadata["creation_date"] == "2024-01-15T10:30:00", (
+                f"Chunk {i} creation_date incorrect"
+            )
+
+            assert "last_modified_date" in metadata, f"Chunk {i} should preserve last_modified_date"
+            assert metadata["last_modified_date"] == "2024-10-31T15:45:00", (
+                f"Chunk {i} last_modified_date incorrect"
+            )
+
+        # Verify metadata is consistent across chunks from same document
+        assert chunks[0]["metadata"]["file_path"] == chunks[1]["metadata"]["file_path"], (
+            "All chunks from same document should have same file_path"
+        )
+
+        # Verify text content is present and not stringified
+        assert "def calculate_sum" in chunks[0]["text"]
+        assert "class DataProcessor" in chunks[1]["text"]
+
+    def test_ast_chunks_include_astchunk_metadata(self):
+        """Test that astchunk-specific metadata is merged into chunk metadata.
+
+        This test verifies that astchunk's metadata (line_count, start_line_no,
+        end_line_no, node_count) is merged with document metadata.
+
+        This FAILS if create_ast_chunks returns list[str] instead of list[dict].
+        """
+        python_code = '''
+def function_one():
+    """First function."""
+    x = 1
+    y = 2
+    return x + y
+
+def function_two():
+    """Second function."""
+    return 42
+'''
+        doc = MockDocument(
+            python_code,
+            file_path="/test/code.py",
+            metadata={
+                "language": "python",
+                "file_path": "/test/code.py",
+                "file_name": "code.py",
+            },
+        )
+
+        # Mock astchunk with detailed metadata
+        mock_builder = Mock()
+        astchunk_chunks = [
+            {
+                "content": "def function_one():\n    x = 1\n    y = 2\n    return x + y",
+                "metadata": {
+                    "filepath": "/test/code.py",
+                    "line_count": 4,
+                    "start_line_no": 1,
+                    "end_line_no": 4,
+                    "node_count": 5,  # function, assignments, return
+                },
+            },
+            {
+                "content": "def function_two():\n    return 42",
+                "metadata": {
+                    "filepath": "/test/code.py",
+                    "line_count": 2,
+                    "start_line_no": 7,
+                    "end_line_no": 8,
+                    "node_count": 2,  # function, return
+                },
+            },
+        ]
+        mock_builder.chunkify.return_value = astchunk_chunks
+
+        mock_astchunk = Mock()
+        mock_astchunk.ASTChunkBuilder = Mock(return_value=mock_builder)
+
+        with patch.dict("sys.modules", {"astchunk": mock_astchunk}):
+            chunks = create_ast_chunks([doc])
+
+        # CRITICAL: These will FAIL with current list[str] return
+        assert len(chunks) == 2
+
+        # First chunk - function_one
+        chunk1 = chunks[0]
+        assert isinstance(chunk1, dict), "Chunk should be dict"
+        assert "metadata" in chunk1
+
+        metadata1 = chunk1["metadata"]
+
+        # Check astchunk metadata is present
+        assert "line_count" in metadata1, "Should include astchunk line_count"
+        assert metadata1["line_count"] == 4, "line_count should be 4"
+
+        assert "start_line_no" in metadata1, "Should include astchunk start_line_no"
+        assert metadata1["start_line_no"] == 1, "start_line_no should be 1"
+
+        assert "end_line_no" in metadata1, "Should include astchunk end_line_no"
+        assert metadata1["end_line_no"] == 4, "end_line_no should be 4"
+
+        assert "node_count" in metadata1, "Should include astchunk node_count"
+        assert metadata1["node_count"] == 5, "node_count should be 5"
+
+        # Second chunk - function_two
+        chunk2 = chunks[1]
+        metadata2 = chunk2["metadata"]
+
+        assert metadata2["line_count"] == 2, "line_count should be 2"
+        assert metadata2["start_line_no"] == 7, "start_line_no should be 7"
+        assert metadata2["end_line_no"] == 8, "end_line_no should be 8"
+        assert metadata2["node_count"] == 2, "node_count should be 2"
+
+        # Verify document metadata is ALSO present (merged, not replaced)
+        assert metadata1["file_path"] == "/test/code.py"
+        assert metadata1["file_name"] == "code.py"
+        assert metadata2["file_path"] == "/test/code.py"
+        assert metadata2["file_name"] == "code.py"
+
+        # Verify text content is correct
+        assert "def function_one" in chunk1["text"]
+        assert "def function_two" in chunk2["text"]
+
+    def test_traditional_chunks_as_dicts_helper(self):
+        """Test the helper function that wraps traditional chunks as dicts.
+
+        This test verifies that when create_traditional_chunks is called,
+        its plain string chunks are wrapped into dict format with metadata.
+
+        This FAILS if create_traditional_chunks returns list[str], i.e. when its output
+        is not wrapped into dicts (presumably via _traditional_chunks_as_dicts()).
+        """
+        # Create documents with various metadata
+        docs = [
+            MockDocument(
+                "This is the first paragraph of text. It contains multiple sentences. "
+                "This should be split into chunks based on size.",
+                file_path="/docs/readme.txt",
+                metadata={
+                    "file_path": "/docs/readme.txt",
+                    "file_name": "readme.txt",
+                    "creation_date": "2024-01-01",
+                },
+            ),
+            MockDocument(
+                "Second document with different metadata. It also has content that needs chunking.",
+                file_path="/docs/guide.md",
+                metadata={
+                    "file_path": "/docs/guide.md",
+                    "file_name": "guide.md",
+                    "last_modified_date": "2024-10-31",
+                },
+            ),
+        ]
+
+        # Call create_traditional_chunks (which should now return list[dict])
+        chunks = create_traditional_chunks(docs, chunk_size=50, chunk_overlap=10)
+
+        # CRITICAL: Will FAIL - current code returns list[str]
+        assert len(chunks) > 0, "Should return chunks"
+
+        for i, chunk in enumerate(chunks):
+            # Structure assertions - WILL FAIL
+            assert isinstance(chunk, dict), f"Chunk {i} should be dict, got {type(chunk)}"
+            assert "text" in chunk, f"Chunk {i} must have 'text' key"
+            assert "metadata" in chunk, f"Chunk {i} must have 'metadata' key"
+
+            # Text should be non-empty
+            assert len(chunk["text"].strip()) > 0, f"Chunk {i} text should be non-empty"
+
+            # Metadata should include document info
+            metadata = chunk["metadata"]
+            assert "file_path" in metadata, f"Chunk {i} should have file_path in metadata"
+            assert "file_name" in metadata, f"Chunk {i} should have file_name in metadata"
+
+        # Verify metadata tracking works correctly
+        # At least one chunk should be from readme.txt
+        readme_chunks = [c for c in chunks if "readme.txt" in c["metadata"]["file_name"]]
+        assert len(readme_chunks) > 0, "Should have chunks from readme.txt"
+
+        # At least one chunk should be from guide.md
+        guide_chunks = [c for c in chunks if "guide.md" in c["metadata"]["file_name"]]
+        assert len(guide_chunks) > 0, "Should have chunks from guide.md"
+
+        # Verify creation_date is preserved for readme chunks
+        for chunk in readme_chunks:
+            assert chunk["metadata"].get("creation_date") == "2024-01-01", (
+                "readme.txt chunks should preserve creation_date"
+            )
+
+        # Verify last_modified_date is preserved for guide chunks
+        for chunk in guide_chunks:
+            assert chunk["metadata"].get("last_modified_date") == "2024-10-31", (
+                "guide.md chunks should preserve last_modified_date"
+            )
+
+        # Verify text content is present
+        all_text = " ".join([c["text"] for c in chunks])
+        assert "first paragraph" in all_text
+        assert "Second document" in all_text
+
+
 class TestErrorHandling:
    """Test error handling and edge cases."""

--- a/tests/test_token_truncation.py
+++ b/tests/test_token_truncation.py
@@ -0,0 +1,268 @@
+"""Unit tests for token-aware truncation functionality.
+
+This test suite defines the contract for token truncation functions that prevent
+500 errors from Ollama when text exceeds model token limits. These tests verify:
+
+1. Model token limit retrieval (known and unknown models)
+2. Text truncation behavior for single and multiple texts
+3. Token counting and truncation accuracy using tiktoken
+
+All tests were written test-first (Red Phase); they exercise the implementation
+imported below from leann.embedding_compute.
+"""
+
+import pytest
+import tiktoken
+from leann.embedding_compute import (
+    EMBEDDING_MODEL_LIMITS,
+    get_model_token_limit,
+    truncate_to_token_limit,
+)
+
+
+class TestModelTokenLimits:
+    """Tests for retrieving model-specific token limits."""
+
+    def test_get_model_token_limit_known_model(self):
+        """Verify correct token limit is returned for known models.
+
+        Known models should return their specific token limits from
+        the EMBEDDING_MODEL_LIMITS dictionary.
+        """
+        # Test nomic-embed-text (2048 tokens)
+        limit = get_model_token_limit("nomic-embed-text")
+        assert limit == 2048, "nomic-embed-text should have 2048 token limit"
+
+        # Test nomic-embed-text-v1.5 (2048 tokens)
+        limit = get_model_token_limit("nomic-embed-text-v1.5")
+        assert limit == 2048, "nomic-embed-text-v1.5 should have 2048 token limit"
+
+        # Test nomic-embed-text-v2 (512 tokens)
+        limit = get_model_token_limit("nomic-embed-text-v2")
+        assert limit == 512, "nomic-embed-text-v2 should have 512 token limit"
+
+        # Test an OpenAI model (8192 tokens)
+        limit = get_model_token_limit("text-embedding-3-small")
+        assert limit == 8192, "text-embedding-3-small should have 8192 token limit"
+
+    def test_get_model_token_limit_unknown_model(self):
+        """Verify default token limit is returned for unknown models.
+
+        Unknown models should return the default limit (2048) to allow
+        operation with reasonable safety margin.
+        """
+        # Test with completely unknown model
+        limit = get_model_token_limit("unknown-model-xyz")
+        assert limit == 2048, "Unknown models should return default 2048"
+
+        # Test with empty string (treated like any other unknown name)
+        limit = get_model_token_limit("")
+        assert limit == 2048, "Empty model name should return default 2048"
+
+    def test_get_model_token_limit_custom_default(self):
+        """Verify custom default can be specified for unknown models.
+
+        Allow callers to specify their own default token limit when
+        model is not in the known models dictionary.
+        """
+        limit = get_model_token_limit("unknown-model", default=4096)
+        assert limit == 4096, "Should return custom default for unknown models"
+
+        # Known model should ignore custom default
+        limit = get_model_token_limit("nomic-embed-text", default=4096)
+        assert limit == 2048, "Known model should ignore custom default"
+
+    def test_embedding_model_limits_dictionary_exists(self):
+        """Verify EMBEDDING_MODEL_LIMITS dictionary contains expected models.
+
+        The dictionary should be importable and contain at least the
+        known nomic and OpenAI models with correct token limits.
+        """
+        assert isinstance(EMBEDDING_MODEL_LIMITS, dict), "Should be a dictionary"
+        assert "nomic-embed-text" in EMBEDDING_MODEL_LIMITS, "Should contain nomic-embed-text"
+        assert "nomic-embed-text-v1.5" in EMBEDDING_MODEL_LIMITS, (
+            "Should contain nomic-embed-text-v1.5"
+        )
+        assert EMBEDDING_MODEL_LIMITS["nomic-embed-text"] == 2048
+        assert EMBEDDING_MODEL_LIMITS["nomic-embed-text-v1.5"] == 2048
+        assert EMBEDDING_MODEL_LIMITS["nomic-embed-text-v2"] == 512
+        # OpenAI model
+        assert EMBEDDING_MODEL_LIMITS["text-embedding-3-small"] == 8192
+
+
+class TestTokenTruncation:
+    """Tests for truncating texts to token limits."""
+
+    @pytest.fixture
+    def tokenizer(self):
+        """Provide tiktoken tokenizer for token counting verification."""
+        return tiktoken.get_encoding("cl100k_base")
+
+    def test_truncate_single_text_under_limit(self, tokenizer):
+        """Verify text under token limit remains unchanged.
+
+        When text is already within the token limit, it should be
+        returned unchanged with no truncation.
+        """
+        text = "This is a short text that is well under the token limit."
+        token_count = len(tokenizer.encode(text))
+        assert token_count < 100, f"Test setup: text should be short (has {token_count} tokens)"
+
+        # Truncate with generous limit
+        result = truncate_to_token_limit([text], token_limit=512)
+
+        assert len(result) == 1, "Should return same number of texts"
+        assert result[0] == text, "Text under limit should be unchanged"
+
+    def test_truncate_single_text_over_limit(self, tokenizer):
+        """Verify text over token limit is truncated correctly.
+
+        When text exceeds the token limit, it should be truncated to
+        fit within the limit while maintaining valid token boundaries.
+        """
+        # Create a text that definitely exceeds limit
+        text = "word " * 200  # repeated word; the setup assert below confirms > 50 tokens
+        original_token_count = len(tokenizer.encode(text))
+        assert original_token_count > 50, (
+            f"Test setup: text should be long (has {original_token_count} tokens)"
+        )
+
+        # Truncate to 50 tokens
+        result = truncate_to_token_limit([text], token_limit=50)
+
+        assert len(result) == 1, "Should return same number of texts"
+        assert result[0] != text, "Text over limit should be truncated"
+        assert len(result[0]) < len(text), "Truncated text should be shorter"
+
+        # Verify truncated text is within token limit
+        truncated_token_count = len(tokenizer.encode(result[0]))
+        assert truncated_token_count <= 50, (
+            f"Truncated text should be ≤50 tokens, got {truncated_token_count}"
+        )
+
+    def test_truncate_multiple_texts_mixed_lengths(self, tokenizer):
+        """Verify multiple texts with mixed lengths are handled correctly.
+
+        When processing multiple texts:
+        - Texts under limit should remain unchanged
+        - Texts over limit should be truncated independently
+        - Output list should maintain same order and length
+        """
+        texts = [
+            "Short text.",  # Under limit
+            "word " * 200,  # Over limit
+            "Another short one.",  # Under limit
+            "token " * 150,  # Over limit
+        ]
+
+        # Verify test setup: indices 1 and 3 are the long texts
+        for i, text in enumerate(texts):
+            token_count = len(tokenizer.encode(text))
+            if i in [1, 3]:
+                assert token_count > 50, f"Text {i} should be over limit (has {token_count} tokens)"
+            else:
+                assert token_count < 50, (
+                    f"Text {i} should be under limit (has {token_count} tokens)"
+                )
+
+        # Truncate with 50 token limit
+        result = truncate_to_token_limit(texts, token_limit=50)
+
+        assert len(result) == len(texts), "Should return same number of texts"
+
+        # Verify each text individually
+        for i, (original, truncated) in enumerate(zip(texts, result)):
+            token_count = len(tokenizer.encode(truncated))
+            assert token_count <= 50, f"Text {i} should be ≤50 tokens, got {token_count}"
+
+            # Short texts should be unchanged
+            if i in [0, 2]:
+                assert truncated == original, f"Short text {i} should be unchanged"
+            # Long texts should be truncated
+            else:
+                assert len(truncated) < len(original), f"Long text {i} should be truncated"
+
+    def test_truncate_empty_list(self):
+        """Verify empty input list returns empty output list.
+
+        Edge case: empty list should return empty list without errors.
+        """
+        result = truncate_to_token_limit([], token_limit=512)
+        assert result == [], "Empty input should return empty output"
+
+    def test_truncate_preserves_order(self, tokenizer):
+        """Verify truncation preserves original text order.
+
+        Output list should maintain the same order as input list,
+        regardless of which texts were truncated (tokenizer fixture is unused).
+        """
+        texts = [
+            "First text " * 50,  # Will be truncated
+            "Second text.",  # Won't be truncated
+            "Third text " * 50,  # Will be truncated
+        ]
+
+        result = truncate_to_token_limit(texts, token_limit=20)
+
+        assert len(result) == 3, "Should preserve list length"
+        # Check that order is maintained by looking for distinctive words
+        assert "First" in result[0], "First text should remain in first position"
+        assert "Second" in result[1], "Second text should remain in second position"
+        assert "Third" in result[2], "Third text should remain in third position"
+
+    def test_truncate_extremely_long_text(self, tokenizer):
+        """Verify extremely long texts are truncated to the token limit.
+
+        Test with text that far exceeds token limit; checks the result fits
+        the limit and is much shorter than the input. No timing is measured.
+        """
+        # Create very long text (simulate real-world scenario)
+        text = "token " * 5000  # ~5000+ tokens
+        original_token_count = len(tokenizer.encode(text))
+        assert original_token_count > 1000, "Test setup: text should be very long"
+
+        # Truncate to small limit
+        result = truncate_to_token_limit([text], token_limit=100)
+
+        assert len(result) == 1
+        truncated_token_count = len(tokenizer.encode(result[0]))
+        assert truncated_token_count <= 100, (
+            f"Should truncate to ≤100 tokens, got {truncated_token_count}"
+        )
+        assert len(result[0]) < len(text) // 10, "Should significantly reduce text length"
+
+    def test_truncate_exact_token_limit(self, tokenizer):
+        """Verify text at exactly token limit is handled correctly.
+
+        Edge case: text with exactly the token limit. Only the upper bound
+        (result ≤ limit tokens) is asserted; unchanged output is not required.
+        """
+        # Create text with approximately 50 tokens
+        # We'll adjust to get exactly 50
+        target_tokens = 50
+        text = "word " * 50
+        tokens = tokenizer.encode(text)
+
+        # Adjust to get exactly target_tokens
+        if len(tokens) > target_tokens:
+            tokens = tokens[:target_tokens]
+            text = tokenizer.decode(tokens)
+        elif len(tokens) < target_tokens:
+            # Add more words until at least target_tokens, then trim to exactly that many
+            while len(tokenizer.encode(text)) < target_tokens:
+                text += "word "
+            tokens = tokenizer.encode(text)[:target_tokens]
+            text = tokenizer.decode(tokens)
+
+        # Verify we have exactly target_tokens
+        assert len(tokenizer.encode(text)) == target_tokens, (
+            "Test setup: should have exactly 50 tokens"
+        )
+
+        result = truncate_to_token_limit([text], token_limit=target_tokens)
+
+        assert len(result) == 1
+        result_tokens = len(tokenizer.encode(result[0]))
+        assert result_tokens <= target_tokens, (
+            f"Should be ≤{target_tokens} tokens, got {result_tokens}"
+        )
--- a/uv.lock
+++ b/uv.lock
Author	SHA1	Message	Date
yichuan-w	5637270f02	fix: format colqwen_forward.py to pass pre-commit checks	2025-12-03 09:06:29 +00:00
yichuan-w	1c690e4a8a	reproduce docvqa results and add debug file	2025-12-03 08:54:55 +00:00
yichuan-w	07afe546ea	reproduce docvqa results	2025-11-14 10:22:42 +00:00
yichuan-w	ae3b8af3df	update vidore	2025-11-14 07:31:24 +00:00
yichuan-w	a9c014df9e	Add timing instrumentation and multi-dataset support for multi-vector retrieval - Add timing measurements for search operations (load and core time) - Increase embedding batch size from 1 to 32 for better performance - Add explicit memory cleanup with del all_embeddings - Support loading and merging multiple datasets with different splits - Add CLI arguments for search method selection (ann/exact/exact-all) - Auto-detect image field names across different dataset structures - Print candidate doc counts for performance monitoring 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>	2025-11-10 21:13:17 +00:00
yichuan-w	3766ad1fd2	robust multi-vector	2025-11-09 02:34:53 +00:00
ww26	c3aceed1e0	metadata reveal for ast-chunking; smart detection of seq length in ollama; auto adjust chunk length for ast to prevent silent truncation (#157 ) * feat: enhance token limits with dynamic discovery + AST metadata Improves upon upstream PR #154 with two major enhancements: 1. Hybrid Token Limit Discovery - Dynamic: Query Ollama /api/show for context limits - Fallback: Registry for LM Studio/OpenAI - Zero maintenance for Ollama users - Respects custom num_ctx settings 2. AST Metadata Preservation - create_ast_chunks() returns dict format with metadata - Preserves file_path, file_name, timestamps - Includes astchunk metadata (line numbers, node counts) - Fixes content extraction bug (checks "content" key) - Enables --show-metadata flag 3. Better Token Limits - nomic-embed-text: 2048 tokens (vs 512) - nomic-embed-text-v1.5: 2048 tokens - Added OpenAI models: 8192 tokens 4. Comprehensive Tests - 11 tests for token truncation - 545 new lines in test_astchunk_integration.py - All metadata preservation tests passing * fix: merge EMBEDDING_MODEL_LIMITS and remove redundant validation - Merged upstream's model list with our corrected token limits - Kept our corrected nomic-embed-text: 2048 (not 512) - Removed post-chunking validation (redundant with embedding-time truncation) - All tests passing except 2 pre-existing integration test failures * style: apply ruff formatting and restore PR #154 version handling - Remove duplicate truncate_to_token_limit and get_model_token_limit functions - Restore version handling logic (model:latest -> model) from PR #154 - Restore partial matching fallback for model name variations - Apply ruff formatting to all modified files - All 11 token truncation tests passing * style: sort imports alphabetically (pre-commit auto-fix) * fix: show AST token limit warning only once per session - Add module-level flag to track if warning shown - Prevents spam when processing multiple files - Add clarifying note that auto-truncation happens 
at embedding time - Addresses issue where warning appeared for every code file * enhance: add detailed logging for token truncation - Track and report truncation statistics (count, tokens removed, max length) - Show first 3 individual truncations with exact token counts - Provide comprehensive summary when truncation occurs - Use WARNING level for data loss visibility - Silent (DEBUG level only) when no truncation needed Replaces misleading "truncated where necessary" message that appeared even when nothing was truncated.	2025-11-08 17:37:31 -08:00