LEANN/apps/multimodal/vision-based-pdf-multi-vector/multi-vector-leann-similarity-map.py

## Jupyter-style notebook script
# %%
# uv pip install matplotlib qwen_vl_utils
import os
from typing import Any, Optional

from PIL import Image
from tqdm import tqdm


from leann_multi_vector import (  # utility functions/classes
    _ensure_repo_paths_importable,
    _load_images_from_dir,
    _maybe_convert_pdf_to_images,
    _load_colvision,
    _embed_images,
    _embed_queries,
    _build_index,
    _load_retriever_if_index_exists,
    _generate_similarity_map,
    QwenVL,
)

_ensure_repo_paths_importable(__file__)

# %%
# Notebook configuration: edit the values in this cell before running the rest.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Question that is embedded and searched for (also reused for the similarity
# maps and for answer generation further down).
QUERY = (
    "The paper talk about the latent video generative model "
    "and data curation in the related work part?"
)
# Model name handed to _load_colvision: "colpali" or "colqwen2".
MODEL: str = "colqwen2"

# --- Data source -----------------------------------------------------------
# True  -> pull pages from the Hugging Face dataset below (recommended).
# False -> use the local PDF / page images configured further down.
USE_HF_DATASET: bool = True
DATASET_NAME: str = "weaviate/arXiv-AI-papers-multi-vector"
DATASET_SPLIT: str = "train"
# Upper bound on the number of pages to index; None indexes every page.
MAX_DOCS: Optional[int] = None

# --- Local pages (only consulted when USE_HF_DATASET is False) -------------
# Optional PDF to convert into page images, e.g. "./pdfs/2004.12832v2.pdf".
PDF: Optional[str] = None
PAGES_DIR: str = "./pages"

# --- Index + retrieval -----------------------------------------------------
INDEX_PATH: str = "./indexes/colvision.leann"
TOPK: int = 3
# NOTE(review): FIRST_STAGE_K is not referenced in the visible part of this script.
FIRST_STAGE_K: int = 500
# True forces a rebuild even when an index already exists at INDEX_PATH.
REBUILD_INDEX: bool = False

# --- Artifacts -------------------------------------------------------------
# Where retrieved pages are written; a falsy value disables saving.
SAVE_TOP_IMAGE: Optional[str] = "./figures/retrieved_page.png"
SIMILARITY_MAP: bool = True
# Query token to visualise; any negative value (e.g. -1) means auto-select.
SIM_TOKEN_IDX: int = 13
SIM_OUTPUT: str = "./figures/similarity_map.png"
# Whether to generate an answer from the retrieved pages, and its token budget.
ANSWER: bool = True
MAX_NEW_TOKENS: int = 1024


# %%

# Step 1: Reuse an existing index when allowed, so data loading can be skipped.
# A forced rebuild never touches the index on disk; otherwise try to open it.
retriever: Optional[Any] = (
    None if REBUILD_INDEX else _load_retriever_if_index_exists(INDEX_PATH)
)
# Without a loaded retriever (forced rebuild or nothing on disk) we must build.
need_to_build_index = retriever is None

if retriever is not None:
    print(f"✓ Index loaded from {INDEX_PATH}")
    print(f"✓ Images available at: {retriever._images_dir_path()}")
elif not REBUILD_INDEX:
    # We looked for an index and did not find one.
    print("Index not found, will build new index")

# Step 2: Gather page images and their identifiers, but only when an index
# actually has to be built.
if not need_to_build_index:
    print("Skipping dataset loading (using existing index)")
    # Empty placeholders; nothing is embedded when the index already exists.
    filepaths = []
    images = []
else:
    print("Loading dataset...")
    if not USE_HF_DATASET:
        # Local mode: optionally render the PDF, then read the page images.
        _maybe_convert_pdf_to_images(PDF, PAGES_DIR)
        filepaths, images = _load_images_from_dir(PAGES_DIR)
        if not images:
            raise RuntimeError(
                f"No images found in {PAGES_DIR}. Provide PDF path in PDF variable or ensure images exist."
            )
    else:
        from datasets import load_dataset

        dataset = load_dataset(DATASET_NAME, split=DATASET_SPLIT)
        # Cap the number of pages at MAX_DOCS when a limit is configured.
        num_pages = len(dataset)
        if MAX_DOCS is not None:
            num_pages = min(MAX_DOCS, num_pages)

        filepaths: list[str] = []
        images: list[Image.Image] = []
        for idx in tqdm(range(num_pages), desc="Loading dataset", total=num_pages):
            record = dataset[idx]
            # Human-readable identifier stored in place of a file path and
            # printed next to each search hit later on.
            identifier = "|".join(
                (
                    f"arXiv:{record['paper_arxiv_id']}",
                    f"title:{record['paper_title']}",
                    f"page:{int(record['page_number'])}",
                    f"id:{record['page_id']}",
                )
            )
            filepaths.append(identifier)
            images.append(record["page_image"])  # PIL Image
    print(f"Loaded {len(images)} images")


# %%
# Step 3: Load model and processor via _load_colvision.
# This cell runs unconditionally: `model` and `processor` are needed both to
# embed page images when an index is built and to embed the query for search,
# and `device_str` is passed on to the answer-generation model.
model_name, model, processor, device_str, device, dtype = _load_colvision(MODEL)
print(f"Using model={model_name}, device={device_str}, dtype={dtype}")


# %%

# %%
# Step 4: Embed the loaded pages and build the index, unless a retriever was
# already obtained from an existing index.
if retriever is None and need_to_build_index:
    print("Building index...")
    page_embeddings = _embed_images(model, processor, images)
    retriever = _build_index(INDEX_PATH, page_embeddings, filepaths, images)
    print(f"✓ Index built and images saved to: {retriever._images_dir_path()}")
    # Drop the in-memory copies; later steps fetch pages through the retriever.
    del page_embeddings
    del filepaths
    del images

# Note: Images are now stored in the index, retriever will load them on-demand from disk


# %%
# Step 5: Embed the query, search the index, print the hits and optionally
# save the retrieved page images.
q_vec = _embed_queries(model, processor, [QUERY])[0]
results = retriever.search(q_vec.float().numpy(), topk=TOPK)

# Defined up front so later cells can always reference `top_images`, even when
# the search returns nothing.
top_images: list[Image.Image] = []
# Retrieval score of each entry in `top_images`, kept in lockstep with it.
# Indexing `results` by position instead would pair the wrong score with an
# image whenever an earlier hit was skipped because its image was missing.
top_scores: list[Any] = []

if not results:
    print("No results found.")
else:
    print(f'Top {len(results)} results for query: "{QUERY}"')
    for rank, (score, doc_id) in enumerate(results, start=1):
        # Retrieve image from index instead of memory
        image = retriever.get_image(doc_id)
        if image is None:
            print(f"Warning: Could not retrieve image for doc_id {doc_id}")
            continue

        metadata = retriever.get_metadata(doc_id)
        path = metadata.get("filepath", "unknown") if metadata else "unknown"
        # For HF dataset, path is a descriptive identifier, not a real file path
        print(f"{rank}) MaxSim: {score:.4f}, Page: {path}")
        top_images.append(image)
        top_scores.append(score)

    if SAVE_TOP_IMAGE:
        from pathlib import Path as _Path

        base = _Path(SAVE_TOP_IMAGE)
        # A path with a suffix is a file-name template written next to it;
        # a path without one is the output directory itself. Create whichever
        # directory the files will actually land in.
        save_dir = base.parent if base.suffix else base
        save_dir.mkdir(parents=True, exist_ok=True)
        for rank, (img, score) in enumerate(zip(top_images[:TOPK], top_scores), start=1):
            if base.suffix:
                out_path = save_dir / f"{base.stem}_rank{rank}{base.suffix}"
            else:
                out_path = save_dir / f"retrieved_page_rank{rank}.png"
            img.save(str(out_path))
            # Print the retrieval score (document-level MaxSim) alongside the saved path
            print(f"Saved retrieved page (rank {rank}) [MaxSim={score:.4f}] to: {out_path}")

## TODO: strange results: the second page of DeepSeek-V2 is returned rather than the first page

# %%
# Step 6: Similarity maps for top-K results
if results and SIMILARITY_MAP:
    from pathlib import Path as _Path

    # Any negative index means "let _generate_similarity_map pick the token".
    token_idx = None if SIM_TOKEN_IDX < 0 else int(SIM_TOKEN_IDX)

    output_base = _Path(SIM_OUTPUT) if SIM_OUTPUT else None
    if output_base is not None:
        # A path with a suffix is a file-name template written next to it;
        # a path without one is the output directory itself. Create the target
        # directory in both cases so this step does not rely on an earlier
        # step having created it.
        # NOTE(review): _generate_similarity_map may also create it — harmless
        # either way because of exist_ok=True.
        out_dir = output_base.parent if output_base.suffix else output_base
        out_dir.mkdir(parents=True, exist_ok=True)

    for rank, img in enumerate(top_images[:TOPK], start=1):
        if output_base is None:
            # No output configured: compute the map without writing a file.
            out_path = None
        elif output_base.suffix:
            out_path = str(out_dir / f"{output_base.stem}_rank{rank}{output_base.suffix}")
        else:
            out_path = str(out_dir / f"similarity_map_rank{rank}.png")

        chosen_idx, max_sim = _generate_similarity_map(
            model=model,
            processor=processor,
            image=img,
            query=QUERY,
            token_idx=token_idx,
            output_path=out_path,
        )
        if out_path:
            print(
                f"Saved similarity map for rank {rank}, token #{chosen_idx} (max={max_sim:.2f}) to: {out_path}"
            )
        else:
            print(
                f"Computed similarity map for rank {rank}, token #{chosen_idx} (max={max_sim:.2f})"
            )


# %%
# Step 7: Optionally ask the vision-language model to answer QUERY from the
# retrieved pages.
if ANSWER and results:
    qwen = QwenVL(device=device_str)
    response = qwen.answer(
        QUERY,
        top_images[:TOPK],
        max_new_tokens=MAX_NEW_TOKENS,
    )
    # Blank line, the "Answer:" header, then the generated text on its own line.
    print("\nAnswer:", response, sep="\n")