add colqwen stuff

yichuan-w
2025-09-22 20:12:02 +00:00
parent d034e2195b
commit 72455bb269
6 changed files with 100057 additions and 0 deletions

.gitignore

@@ -18,6 +18,7 @@ demo/experiment_results/**/*.json
*.eml
*.emlx
*.json
*.png
!.vscode/*.json
*.sh
*.txt


@@ -0,0 +1,179 @@
from __future__ import annotations
import sys
from pathlib import Path
from typing import List, Tuple
import numpy as np
def _ensure_repo_paths_importable(current_file: str) -> None:
_repo_root = Path(current_file).resolve().parents[3]
_leann_core_src = _repo_root / "packages" / "leann-core" / "src"
_leann_hnsw_pkg = _repo_root / "packages" / "leann-backend-hnsw"
if str(_leann_core_src) not in sys.path:
sys.path.append(str(_leann_core_src))
if str(_leann_hnsw_pkg) not in sys.path:
sys.path.append(str(_leann_hnsw_pkg))
_ensure_repo_paths_importable(__file__)
from leann_backend_hnsw.hnsw_backend import HNSWBuilder, HNSWSearcher # noqa: E402
class LeannMultiVector:
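    """Minimal multi-vector (ColBERT/ColVision-style) index on top of LEANN's HNSW backend.

    Each document contributes one embedding per token/patch. ``create_index`` flattens all
    token embeddings into a single HNSW index, and ``search`` aggregates token-level hits
    back into per-document scores (an approximation of MaxSim).
    """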
def __init__(
self,
index_path: str,
dim: int = 128,
distance_metric: str = "mips",
M: int = 16,
efConstruction: int = 500,
is_compact: bool = False,
is_recompute: bool = False,
embedding_model_name: str = "colvision",
) -> None:
self.index_path = index_path
self.dim = dim
self.embedding_model_name = embedding_model_name
self._pending_items: List[dict] = []
self._backend_kwargs = {
"distance_metric": distance_metric,
"M": M,
"efConstruction": efConstruction,
"is_compact": is_compact,
"is_recompute": is_recompute,
}
self._labels_meta: List[dict] = []
def _meta_dict(self) -> dict:
return {
"version": "1.0",
"backend_name": "hnsw",
"embedding_model": self.embedding_model_name,
"embedding_mode": "custom",
"dimensions": self.dim,
"backend_kwargs": self._backend_kwargs,
"is_compact": self._backend_kwargs.get("is_compact", True),
"is_pruned": self._backend_kwargs.get("is_compact", True)
and self._backend_kwargs.get("is_recompute", True),
}
def create_collection(self) -> None:
path = Path(self.index_path)
path.parent.mkdir(parents=True, exist_ok=True)
def insert(self, data: dict) -> None:
self._pending_items.append(
{
"doc_id": int(data["doc_id"]),
"filepath": data.get("filepath", ""),
"colbert_vecs": [np.asarray(v, dtype=np.float32) for v in data["colbert_vecs"]],
}
)
def _labels_path(self) -> Path:
index_path_obj = Path(self.index_path)
return index_path_obj.parent / f"{index_path_obj.name}.labels.json"
def _meta_path(self) -> Path:
index_path_obj = Path(self.index_path)
return index_path_obj.parent / f"{index_path_obj.name}.meta.json"
def create_index(self) -> None:
if not self._pending_items:
return
embeddings: List[np.ndarray] = []
labels_meta: List[dict] = []
for item in self._pending_items:
doc_id = int(item["doc_id"])
filepath = item.get("filepath", "")
colbert_vecs = item["colbert_vecs"]
for seq_id, vec in enumerate(colbert_vecs):
vec_np = np.asarray(vec, dtype=np.float32)
embeddings.append(vec_np)
labels_meta.append(
{
"id": f"{doc_id}:{seq_id}",
"doc_id": doc_id,
"seq_id": int(seq_id),
"filepath": filepath,
}
)
if not embeddings:
return
embeddings_np = np.vstack(embeddings).astype(np.float32)
builder = HNSWBuilder(**{**self._backend_kwargs, "dimensions": self.dim})
ids = [str(i) for i in range(embeddings_np.shape[0])]
builder.build(embeddings_np, ids, self.index_path)
import json as _json
with open(self._meta_path(), "w", encoding="utf-8") as f:
_json.dump(self._meta_dict(), f, indent=2)
with open(self._labels_path(), "w", encoding="utf-8") as f:
_json.dump(labels_meta, f)
self._labels_meta = labels_meta
def _load_labels_meta_if_needed(self) -> None:
if self._labels_meta:
return
labels_path = self._labels_path()
if labels_path.exists():
import json as _json
with open(labels_path, encoding="utf-8") as f:
self._labels_meta = _json.load(f)
def search(self, data: np.ndarray, topk: int, first_stage_k: int = 50) -> List[Tuple[float, int]]:
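        """Return up to ``topk`` (score, doc_id) pairs for a multi-vector query.

        ``data`` holds one row per query token; ``first_stage_k`` token-level candidates
        are retrieved per query row before the per-document aggregation below.
        """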
if data.ndim == 1:
data = data.reshape(1, -1)
if data.dtype != np.float32:
data = data.astype(np.float32)
self._load_labels_meta_if_needed()
searcher = HNSWSearcher(self.index_path, meta=self._meta_dict())
raw = searcher.search(
data,
first_stage_k,
recompute_embeddings=False,
complexity=128,
beam_width=1,
prune_ratio=0.0,
batch_size=0,
)
labels = raw.get("labels")
distances = raw.get("distances")
if labels is None or distances is None:
return []
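        # Approximate MaxSim: for each query token (row b), keep only the best token-level
        # score per document among its retrieved candidates, then sum those per-document
        # maxima across all query tokens to rank documents.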
doc_scores: dict[int, float] = {}
B = len(labels)
for b in range(B):
per_doc_best: dict[int, float] = {}
for k, sid in enumerate(labels[b]):
try:
idx = int(sid)
except Exception:
continue
if 0 <= idx < len(self._labels_meta):
doc_id = int(self._labels_meta[idx]["doc_id"]) # type: ignore[index]
else:
continue
score = float(distances[b][k])
if (doc_id not in per_doc_best) or (score > per_doc_best[doc_id]):
per_doc_best[doc_id] = score
for doc_id, best_score in per_doc_best.items():
doc_scores[doc_id] = doc_scores.get(doc_id, 0.0) + best_score
scores = sorted(((v, k) for k, v in doc_scores.items()), key=lambda x: x[0], reverse=True)
return scores[:topk] if len(scores) >= topk else scores


File diff suppressed because one or more lines are too long


@@ -0,0 +1,611 @@
# %% [markdown]
# [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/weaviate/recipes/blob/main/weaviate-features/multi-vector/multi-vector-colipali-rag.ipynb)
# %% [markdown]
# # Multimodal RAG over PDFs using ColQwen2, Qwen2.5, and Weaviate
#
# This notebook demonstrates [Multimodal Retrieval-Augmented Generation (RAG)](https://weaviate.io/blog/multimodal-rag) over PDF documents.
# We will be performing retrieval against a collection of PDF documents by embedding both the individual pages of the documents and our queries into the same multi-vector space, reducing the problem to approximate nearest-neighbor search on ColBERT-style multi-vector embeddings under the MaxSim similarity measure.
#
# For this purpose, we will use
#
# - **A multimodal [late-interaction model](https://weaviate.io/blog/late-interaction-overview)**, like ColPali and ColQwen2, to generate
# embeddings. This tutorial uses the publicly available model
# [ColQwen2-v1.0](https://huggingface.co/vidore/colqwen2-v1.0) with a permissive Apache 2.0 license.
# - **A Weaviate [vector database](https://weaviate.io/blog/what-is-a-vector-database)**, which has a [multi-vector feature](https://docs.weaviate.io/weaviate/tutorials/multi-vector-embeddings) to effectively index a collection of PDF documents and support textual queries against the contents of the documents, including both text and figures.
# - **A vision language model (VLM)**, specifically [Qwen/Qwen2.5-VL-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct), to support multimodal Retrieval-Augmented Generation (RAG).
#
# Below, you can see the multimodal RAG system overview:
#
# <img src="https://github.com/weaviate/recipes/blob/main/weaviate-features/multi-vector/figures/multimodal-rag-diagram.png?raw=1" width="700px"/>
#
# First, the ingestion pipeline processes the PDF documents as images with the multimodal late-interaction model. The multi-vector embeddings are stored in a vector database.
# Then at query time, the text query is processed by the same multimodal late-interaction model to retrieve the relevant documents.
# The retrieved PDF files are then passed as visual context together with the original user query to the vision language model, which generates a response based on this information.
#
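# %% [markdown]
# To make the MaxSim measure concrete, here is a small, self-contained sketch (not part of
# the original recipe): for a multi-vector query and a multi-vector document page, each
# query token is matched against its best-scoring document token, and those per-token
# maxima are summed. The array shapes below are only illustrative.
# %%
import numpy as np
def maxsim(query_vecs: np.ndarray, doc_vecs: np.ndarray) -> float:
    """MaxSim: sum over query tokens of the best dot product with any document token."""
    sim = query_vecs @ doc_vecs.T  # (num_query_tokens, num_doc_tokens)
    return float(sim.max(axis=1).sum())  # best document token per query token, then sum
rng = np.random.default_rng(0)
toy_query = rng.normal(size=(20, 128)).astype(np.float32)  # e.g., a 20-token query embedding
toy_page = rng.normal(size=(755, 128)).astype(np.float32)  # e.g., a 755-patch page embedding
print(maxsim(toy_query, toy_page))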
# %% [markdown]
# ## Prerequisites
#
# To run this notebook, you will need a machine capable of running neural networks using 5-10 GB of memory.
# The demonstration uses two different vision language models that both require several gigabytes of memory.
# See the documentation for each individual model and the general PyTorch docs to figure out how to best run the models on your hardware.
#
# For example, you can run it on:
#
# - Google Colab (using the free-tier T4 GPU)
# - or locally (tested on an M2 Pro Mac).
#
# Furthermore, you will need an instance of Weaviate version >= `1.29.0`.
#
# %% [markdown]
# ## Step 1: Install required libraries
#
# Let's begin by installing and importing the required libraries.
#
# Note that you'll need Python `3.13`.
# %%
%%capture
%pip install colpali_engine weaviate-client qwen_vl_utils
%pip install -q -U "colpali-engine[interpretability]>=0.3.2,<0.4.0"
# %%
import os
import torch
import numpy as np
try:
    from google.colab import userdata  # only available when running in Google Colab
except ImportError:
    userdata = None  # running locally; use environment variables instead
from datasets import load_dataset
from transformers.utils.import_utils import is_flash_attn_2_available
from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from colpali_engine.models import ColQwen2, ColQwen2Processor
#from colpali_engine.models import ColPali, ColPaliProcessor # uncomment if you prefer to use ColPali models instead of ColQwen2 models
import weaviate
from weaviate.classes.init import Auth
import weaviate.classes.config as wc
from weaviate.classes.config import Configure
from weaviate.classes.query import MetadataQuery
from qwen_vl_utils import process_vision_info
import base64
from io import BytesIO
import matplotlib.pyplot as plt
from colpali_engine.interpretability import (
get_similarity_maps_from_embeddings,
plot_all_similarity_maps,
plot_similarity_map,
)
# %% [markdown]
# ## Step 2: Load the PDF dataset
#
# Let's start with the data.
# We're going to first load a PDF document dataset of the [top-40 most
# cited AI papers on arXiv](https://arxiv.org/abs/2412.12121) (covering the period 2023-01-01 to 2024-09-30) from Hugging Face.
# %%
dataset = load_dataset("weaviate/arXiv-AI-papers-multi-vector", split="train")
# %%
dataset
# %%
dataset[398]
# %% [markdown]
# Let's take a look at a sample document page from the loaded PDF dataset.
# %%
display(dataset[289]["page_image"])
# %% [markdown]
# ![Retrieved page](./figures/retrieved_page.png)
# %% [markdown]
# ## Step 3: Load the ColVision (ColPali or ColQwen2) model
#
# The approach to generate embeddings for this tutorial is outlined in the paper [ColPali: Efficient Document Retrieval with Vision Language Models](https://arxiv.org/abs/2407.01449). The paper demonstrates that it is possible to simplify traditional approaches to preprocessing PDF documents for retrieval:
#
# Traditional PDF processing in RAG systems involves using OCR (Optical Character Recognition) and layout detection software, and separate processing of text, tables, figures, and charts. Additionally, after text extraction, text processing also requires a chunking step. Instead, the ColPali method feeds images (screenshots) of entire PDF pages to a Vision Language Model that produces a ColBERT-style multi-vector embedding.
#
# <img src="https://github.com/weaviate/recipes/blob/main/weaviate-features/multi-vector/figures/colipali_pipeline.jpeg?raw=1" width="700px"/>
#
# There are different ColVision models available, such as ColPali and ColQwen2, which mainly differ in the underlying encoder (contextualized late interaction over Qwen2 vs. PaliGemma-3B). You can read more about the differences between ColPali and ColQwen2 in our [overview of late-interaction models](https://weaviate.io/blog/late-interaction-overview).
#
# Let's load the [ColQwen2-v1.0](https://huggingface.co/vidore/colqwen2-v1.0) model for this tutorial.
# %%
# Get rid of process forking deadlock warnings.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# %%
if torch.cuda.is_available(): # If GPU available
device = "cuda:0"
elif torch.backends.mps.is_available(): # If Apple Silicon available
device = "mps"
else:
device = "cpu"
if is_flash_attn_2_available():
attn_implementation = "flash_attention_2"
else:
attn_implementation = "eager"
print(f"Using device: {device}")
print(f"Using attention implementation: {attn_implementation}")
# %%
model_name = "vidore/colqwen2-v1.0"
# About a 5 GB download and similar memory usage.
model = ColQwen2.from_pretrained(
model_name,
torch_dtype=torch.bfloat16,
device_map=device,
attn_implementation=attn_implementation,
).eval()
# Load processor
processor = ColQwen2Processor.from_pretrained(model_name)
# %% [markdown]
# This notebook uses the ColQwen2 model because it has a permissive Apache 2.0 license.
# Alternatively, you can also use [ColPali](https://huggingface.co/vidore/colpali-v1.2), which has a Gemma license, or check out other available [ColVision models](https://github.com/illuin-tech/colpali). For a detailed comparison, you can also refer to [ViDoRe: The Visual Document Retrieval Benchmark](https://huggingface.co/spaces/vidore/vidore-leaderboard).
#
# If you want to use ColPali instead of ColQwen2, you can comment out the above code cell and uncomment the code cell below.
# %%
#model_name = "vidore/colpali-v1.2"
# Load model
#colpali_model = ColPali.from_pretrained(
# model_name,
# torch_dtype=torch.bfloat16,
# device_map=device,
# attn_implementation=attn_implementation,
#).eval()
# Load processor
#colpali_processor = ColPaliProcessor.from_pretrained(model_name)
# %% [markdown]
# Before we go further, let's familiarize ourselves with the ColQwen2 model. It can create multi-vector embeddings from both images and text queries. Below you can see examples of each.
#
# %%
# Sample image inputs
images = [
dataset[0]["page_image"],
dataset[1]["page_image"],
]
# Process the inputs
batch_images = processor.process_images(images).to(model.device)
# Forward pass
with torch.no_grad():
    image_embeddings = model(**batch_images)
print(image_embeddings)
print(image_embeddings.shape)
# %%
# Sample query inputs
queries = [
"A table with LLM benchmark results.",
"A figure detailing the architecture of a neural network.",
]
# Process the inputs
batch_queries = processor.process_queries(queries).to(model.device)
# Forward pass
with torch.no_grad():
query_embedding = model(**batch_queries)
print(query_embedding)
print(query_embedding.shape)
# %% [markdown]
# Let's write a class to wrap the multimodal late-interaction model and its embedding functionalities for convenience.
#
#
# %%
# A convenience class to wrap the embedding functionality
# of ColVision models like ColPali and ColQwen2
class ColVision:
def __init__(self, model, processor):
"""Initialize with a loaded model and processor."""
self.model = model
self.processor = processor
# A batch size of one appears to be most performant when running on an M4.
# Note: Reducing the image resolution speeds up the vectorizer and produces
# fewer multi-vectors.
def multi_vectorize_image(self, img):
"""Return the multi-vector image of the supplied PIL image."""
image_batch = self.processor.process_images([img]).to(self.model.device)
with torch.no_grad():
image_embedding = self.model(**image_batch)
return image_embedding[0]
def multi_vectorize_text(self, query):
"""Return the multi-vector embedding of the query text string."""
query_batch = self.processor.process_queries([query]).to(self.model.device)
with torch.no_grad():
query_embedding = self.model(**query_batch)
return query_embedding[0]
# Instantiate the embedder with the model and processor loaded above.
colvision_embedder = ColVision(model, processor)
# %% [markdown]
# Let's verify that the embedding of images and queries works as intended.
#
# %%
# Sample image input
image = dataset[0]["page_image"]
page_embedding = colvision_embedder.multi_vectorize_image(image)
print(page_embedding.shape) # torch.Size([755, 128])
queries = [
"A table with LLM benchmark results.",
"A figure detailing the architecture of a neural network.",
]
query_embeddings = [colvision_embedder.multi_vectorize_text(q) for q in queries]
print(query_embeddings[0].shape) # torch.Size([20, 128])
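# %% [markdown]
# As an optional sanity check (not part of the original recipe), we can compute the MaxSim
# score between each query embedding and the page embedding directly with `torch.einsum`.
# (Recent `colpali_engine` versions also ship a batched late-interaction scorer, e.g.
# `processor.score_multi_vector`, which computes the same quantity.)
# %%
for q_text, q_emb in zip(queries, query_embeddings):
    # Token-level similarity matrix (query tokens x page patches), then MaxSim aggregation.
    sim = torch.einsum("qd,pd->qp", q_emb.float(), page_embedding.float())
    print(f"MaxSim({q_text!r}) = {sim.max(dim=1).values.sum().item():.2f}")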
# %% [markdown]
# ## Step 4: Connect to a Weaviate vector database instance
#
# Now, you will need to connect to a running Weaviate vector database cluster.
#
# You can choose one of the following options:
#
# 1. **Option 1:** You can create a 14-day free sandbox on the managed service [Weaviate Cloud (WCD)](https://console.weaviate.cloud/)
# 2. **Option 2:** [Embedded Weaviate](https://docs.weaviate.io/deploy/installation-guides/embedded)
# 3. **Option 3:** [Local deployment](https://docs.weaviate.io/deploy/installation-guides/docker-installation)
# 4. [Other options](https://docs.weaviate.io/deploy)
# %%
# Option 1: Weaviate Cloud
WCD_URL = os.environ["WEAVIATE_URL"] # Replace with your Weaviate cluster URL
WCD_AUTH_KEY = os.environ["WEAVIATE_API_KEY"] # Replace with your cluster auth key
# Uncomment if you are working in a Google Colab environment
#WCD_URL = userdata.get("WEAVIATE_URL")
#WCD_AUTH_KEY = userdata.get("WEAVIATE_API_KEY")
# Weaviate Cloud Deployment
client = weaviate.connect_to_weaviate_cloud(
cluster_url=WCD_URL,
auth_credentials=weaviate.auth.AuthApiKey(WCD_AUTH_KEY),
)
# Option 2: Embedded Weaviate instance
# use if you want to explore Weaviate without any additional setup
#client = weaviate.connect_to_embedded()
# Option 3: Locally hosted instance of Weaviate via Docker or Kubernetes
#!docker run --detach -p 8080:8080 -p 50051:50051 cr.weaviate.io/semitechnologies/weaviate:1.29.0
#client = weaviate.connect_to_local()
print(client.is_ready())
# %% [markdown]
# For this tutorial, you will need Weaviate `v1.29.0` or higher.
# Let's make sure we have the required version:
# %%
client.get_meta()['version']
# %% [markdown]
# ## Step 5: Create a collection
#
# Next, we will create a collection that will hold the embeddings of the images of the PDF document pages.
#
# We will not define a built-in vectorizer but use the [Bring Your Own Vectors (BYOV) approach](https://docs.weaviate.io/weaviate/starter-guides/custom-vectors), where we manually embed the PDF documents at ingestion time and the queries at query time.
#
# Additionally, if you are interested in using the [MUVERA encoding algorithm](https://weaviate.io/blog/muvera) for multi-vector embeddings, you can uncomment it in the code below.
# %%
collection_name = "PDFDocuments"
# %%
# Delete the collection if it already exists
# Note: in practice, you shouldn't rerun this cell, as it deletes your data
# in "PDFDocuments", and you would then need to re-import it.
#if client.collections.exists(collection_name):
# client.collections.delete(collection_name)
# Create a collection
collection = client.collections.create(
name=collection_name,
properties=[
wc.Property(name="page_id", data_type=wc.DataType.INT),
wc.Property(name="dataset_index", data_type=wc.DataType.INT),
wc.Property(name="paper_title", data_type=wc.DataType.TEXT),
wc.Property(name="paper_arxiv_id", data_type=wc.DataType.TEXT),
wc.Property(name="page_number", data_type=wc.DataType.INT),
],
vector_config=[
Configure.MultiVectors.self_provided(
name="colqwen",
#encoding=Configure.VectorIndex.MultiVector.Encoding.muvera(),
vector_index_config=Configure.VectorIndex.hnsw(
multi_vector=Configure.VectorIndex.MultiVector.multi_vector()
)
)]
)
# %% [markdown]
# ## Step 6: Uploading the vectors to Weaviate
#
# In this step, we're indexing the vectors into our Weaviate Collection in batches.
#
# For each batch, the page images are processed and encoded using the ColVision model loaded above (ColQwen2 in this tutorial), turning them into multi-vector embeddings.
# These embeddings are then converted from tensors into lists of vectors, creating a multi-vector representation for each document page.
# This setup works well with Weaviate's multi-vector capabilities.
#
# After processing, the vectors and any metadata are uploaded to Weaviate, gradually building up the index.
# You can lower or increase the `batch_size` depending on your available GPU resources.
# %%
# Map of page ids to images to support displaying the image corresponding to a
# particular page id.
page_images = {}
with collection.batch.dynamic() as batch:
for i in range(len(dataset)):
p = dataset[i]
page_images[p["page_id"]] = p["page_image"]
batch.add_object(
properties={
"page_id": p["page_id"],
"paper_title": p["paper_title"],
"paper_arxiv_id": p["paper_arxiv_id"],
"page_number": p["page_number"],
},
vector={"colqwen": colvision_embedder.multi_vectorize_image(p["page_image"]).cpu().float().numpy().tolist()})
if i % 25 == 0:
print(f"Added {i+1}/{len(dataset)} Page objects to Weaviate.")
batch.flush()
# Delete dataset after creating page_images dict to hold the images
del dataset
# %%
len(collection)
# %% [markdown]
# ## Step 7: Multimodal Retrieval Query
#
# As an example of what we are going to build, consider the following actual demo query and resulting PDF page from our collection (nearest neighbor):
#
# - Query: "How does DeepSeek-V2 compare against the LLaMA family of LLMs?"
# - Nearest neighbor: "DeepSeek-V2: A Strong Economical and Efficient Mixture-of-Experts Language Model" (arXiv: 2405.04434), Page: 1.
#
# %%
query = "How does DeepSeek-V2 compare against the LLaMA family of LLMs?"
# %% [markdown]
# By inspecting the first page of the [DeepSeek-V2 paper](https://arxiv.org/abs/2405.04434), we see that it does indeed contain a figure that is relevant for answering our query:
#
# <img src="https://github.com/weaviate/recipes/blob/main/weaviate-features/multi-vector/figures/deepseek_efficiency.jpeg?raw=1" width="700px"/>
# %% [markdown]
# Note: To avoid `OutOfMemoryError` on freely available resources like Google Colab, we will only retrieve a single document. If you have more memory available, you can set the `limit` parameter to a higher value, e.g., `limit=3`, to increase the number of retrieved PDF pages.
# %%
response = collection.query.near_vector(
near_vector=colvision_embedder.multi_vectorize_text(query).cpu().float().numpy(),
target_vector="colqwen",
limit=1,
return_metadata=MetadataQuery(distance=True), # Needed to return MaxSim score
)
print(f"The most relevant documents for the query \"{query}\" by order of relevance:\n")
result_images = []
for i, o in enumerate(response.objects):
p = o.properties
print(
f"{i+1}) MaxSim: {-o.metadata.distance:.2f}, "
+ f"Title: \"{p['paper_title']}\" "
+ f"(arXiv: {p['paper_arxiv_id']}), "
+ f"Page: {int(p['page_number'])}"
)
result_images.append(page_images[p["page_id"]])
# %% [markdown]
# The retrieved page with the highest MaxSim score is indeed the page with the figure we mentioned earlier.
# %%
closest_page_id = response.objects[0].properties['page_id']
image = page_images[closest_page_id]
display(image)
# %% [markdown]
# ![Retrieved page](./figures/retrieved_page.png)
#
# Let's visualize the similarity maps for the retrieved PDF document page to see the semantic similarity between each token in the user query and the image patches. This is an optional step.
# %%
# Preprocess inputs
batch_images = processor.process_images([image]).to(device)
batch_queries = processor.process_queries([query]).to(device)
# Forward passes
with torch.no_grad():
image_embeddings = model.forward(**batch_images)
query_embeddings = model.forward(**batch_queries)
# Get the number of image patches
n_patches = processor.get_n_patches(
image_size=image.size,
spatial_merge_size=model.spatial_merge_size,
)
# Get the tensor mask to filter out the embeddings that are not related to the image
image_mask = processor.get_image_mask(batch_images)
# Generate the similarity maps
batched_similarity_maps = get_similarity_maps_from_embeddings(
image_embeddings=image_embeddings,
query_embeddings=query_embeddings,
n_patches=n_patches,
image_mask=image_mask,
)
# Get the similarity map for our (only) input image
similarity_maps = batched_similarity_maps[0] # (query_length, n_patches_x, n_patches_y)
print(f"Similarity map shape: (query_length, n_patches_x, n_patches_y) = {tuple(similarity_maps.shape)}")
# %%
# Remove the padding tokens and the query augmentation tokens
query_content = processor.decode(batch_queries.input_ids[0])
query_content = query_content.replace(processor.tokenizer.pad_token, "")
query_content = query_content.replace(processor.query_augmentation_token, "").strip()
# Retokenize the cleaned query
query_tokens = processor.tokenizer.tokenize(query_content)
# Use this cell output to choose a token using its index
for idx, token in enumerate(query_tokens):
    print(f"{idx}: {token}")
# %% [markdown]
# Let's check the similarity plot for the token "MA" in "LLaMA". (Note that similarity maps are created for each token separately.)
# %%
token_idx = 13
fig, ax = plot_similarity_map(
image=image,
similarity_map=similarity_maps[token_idx],
figsize=(18, 18),
show_colorbar=False,
)
max_sim_score = similarity_maps[token_idx, :, :].max().item()
ax.set_title(f"Token #{token_idx}: `{query_tokens[token_idx]}`. MaxSim score: {max_sim_score:.2f}", fontsize=14)
plt.show()
# %% [markdown]
# ![Similarity map](./figures/similarity_map.png)
# %%
# Delete variables used for visualization
del batched_similarity_maps, similarity_maps, n_patches, query_content, query_tokens, token_idx
# %% [markdown]
# ## Step 8: Extension to Multimodal RAG using Qwen2.5
#
# The above example gives us the most relevant pages to begin looking at to answer our query. Let's extend this multimodal document retrieval pipeline to a multimodal RAG pipeline.
#
# Vision language models (VLMs) are Large Language Models with vision capabilities. They are now powerful enough that we can give the query and relevant pages to such a model and have it produce an answer to our query in plain text.
#
# To accomplish this we are going to feed the top results into the
# state-of-the-art VLM [Qwen/Qwen2.5-VL-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct).
# %%
# Setting up Qwen2.5-VL-3B-Instruct for generating answers from a query string
# plus a collection of (images of) PDF pages.
class QwenVL:
def __init__(self):
# Adjust the settings to your available architecture, see the link
# https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct for examples.
self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
"Qwen/Qwen2.5-VL-3B-Instruct",
torch_dtype=torch.bfloat16,
device_map=device,
attn_implementation=attn_implementation,
)
min_pixels = 256*28*28
max_pixels = 1280*28*28
self.processor = AutoProcessor.from_pretrained(
"Qwen/Qwen2.5-VL-3B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels)
def query_images(self, query, images):
"""Generate a textual response to the query (text) based on the information in the supplied list of PIL images."""
# Preparation for inference.
# Convert the images to base64 strings.
content = []
for img in images:
buffer = BytesIO()
img.save(buffer, format="jpeg")
img_base64 = base64.b64encode(buffer.getvalue()).decode("utf-8")
content.append({"type": "image", "image": f"data:image;base64,{img_base64}"})
content.append({"type": "text", "text": query})
messages = [{"role": "user", "content": content}]
text = self.processor.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = self.processor(
text=[text],
images=image_inputs,
videos=video_inputs,
padding=True,
return_tensors="pt",
)
inputs = inputs.to(device)
# Inference: Generation of the output.
generated_ids = self.model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [
out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
return self.processor.batch_decode(
generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]
# Instantiate the model to be used below.
qwenvl = QwenVL()
# %% [markdown]
# The response from `Qwen2.5-VL-3B-Instruct` based on the retrieved PDF pages:
# %%
qwenvl.query_images(query, result_images)
# %% [markdown]
# As you can see, the multimodal RAG pipeline was able to answer the original query: "How does DeepSeek-V2 compare against the LLaMA family of LLMs?". The ColQwen2 retrieval model retrieved the correct PDF page from the
# "DeepSeek-V2: A Strong Economical and Efficient Mixture-of-Experts Language Model" paper, and the vision language model then used both the text and the figures on that page to answer the question.
# %% [markdown]
# ## Summary
#
# This notebook demonstrates a multimodal RAG pipeline over PDF documents using ColQwen2 for multi-vector embeddings, a Weaviate vector database for storage and retrieval, and Qwen2.5-VL-3B-Instruct for generating answers.
# %% [markdown]
# ## References
#
# - Faysse, M., Sibille, H., Wu, T., Omrani, B., Viaud, G., Hudelot, C., Colombo, P. (2024). ColPali: Efficient Document Retrieval with Vision Language Models. arXiv. https://doi.org/10.48550/arXiv.2407.01449
# - [ColPali GitHub repository](https://github.com/illuin-tech/colpali)
# - [ColPali Cookbook](https://github.com/tonywu71/colpali-cookbooks)


@@ -0,0 +1,449 @@
## Jupyter-style notebook script
#%%
# uv pip install matplotlib qwen_vl_utils
import os
import re
import sys
from pathlib import Path
from typing import List, Optional, Tuple, cast, Any
import numpy as np
from PIL import Image
from tqdm import tqdm
def _ensure_repo_paths_importable(current_file: str) -> None:
"""Make local leann packages importable without installing (mirrors multi-vector-leann.py)."""
_repo_root = Path(current_file).resolve().parents[3]
_leann_core_src = _repo_root / "packages" / "leann-core" / "src"
_leann_hnsw_pkg = _repo_root / "packages" / "leann-backend-hnsw"
if str(_leann_core_src) not in sys.path:
sys.path.append(str(_leann_core_src))
if str(_leann_hnsw_pkg) not in sys.path:
sys.path.append(str(_leann_hnsw_pkg))
_ensure_repo_paths_importable(__file__)
from leann_multi_vector import LeannMultiVector # noqa: E402
#%%
# Config
os.environ["TOKENIZERS_PARALLELISM"] = "false"
QUERY = "How does DeepSeek-V2 compare against the LLaMA family of LLMs?"
MODEL: str = "colqwen2" # "colpali" or "colqwen2"
# Data source: set to True to use the Hugging Face dataset example (recommended)
USE_HF_DATASET: bool = True
DATASET_NAME: str = "weaviate/arXiv-AI-papers-multi-vector"
DATASET_SPLIT: str = "train"
MAX_DOCS: Optional[int] = None # limit number of pages to index; None = all
# Local pages (used when USE_HF_DATASET == False)
PDF: Optional[str] = None # e.g., "./pdfs/2004.12832v2.pdf"
PAGES_DIR: str = "./pages"
# Index + retrieval settings
INDEX_PATH: str = "./indexes/colvision.leann"
TOPK: int = 1
FIRST_STAGE_K: int = 50
REBUILD_INDEX: bool = False
# Artifacts
SAVE_TOP_IMAGE: Optional[str] = "./figures/retrieved_page.png"
SIMILARITY_MAP: bool = True
SIM_TOKEN_IDX: int = -1 # -1 means auto-select the most salient token
SIM_OUTPUT: str = "./figures/similarity_map.png"
ANSWER: bool = True
MAX_NEW_TOKENS: int = 128
#%%
# Helpers
def _natural_sort_key(name: str) -> int:
m = re.search(r"\d+", name)
return int(m.group()) if m else 0
def _load_images_from_dir(pages_dir: str) -> Tuple[List[str], List[Image.Image]]:
filenames = [n for n in os.listdir(pages_dir) if n.lower().endswith((".png", ".jpg", ".jpeg"))]
filenames = sorted(filenames, key=_natural_sort_key)
filepaths = [os.path.join(pages_dir, n) for n in filenames]
images = [Image.open(p) for p in filepaths]
return filepaths, images
def _maybe_convert_pdf_to_images(pdf_path: Optional[str], pages_dir: str, dpi: int = 200) -> None:
if not pdf_path:
return
os.makedirs(pages_dir, exist_ok=True)
try:
from pdf2image import convert_from_path
except Exception as e:
raise RuntimeError("pdf2image is required to convert PDF to images. Install via pip install pdf2image") from e
images = convert_from_path(pdf_path, dpi=dpi)
for i, image in enumerate(images):
image.save(os.path.join(pages_dir, f"page_{i + 1}.png"), "PNG")
def _select_device_and_dtype():
import torch
from colpali_engine.utils.torch_utils import get_torch_device
device_str = (
"cuda"
if torch.cuda.is_available()
else ("mps" if getattr(torch.backends, "mps", None) and torch.backends.mps.is_available() else "cpu")
)
device = get_torch_device(device_str)
# Stable dtype selection to avoid NaNs:
# - CUDA: prefer bfloat16 if supported, else float16
# - MPS: use float32 (fp16 on MPS can produce NaNs in some ops)
# - CPU: float32
if device_str == "cuda":
dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
try:
torch.backends.cuda.matmul.allow_tf32 = True # Better stability/perf on Ampere+
except Exception:
pass
elif device_str == "mps":
dtype = torch.float32
else:
dtype = torch.float32
return device_str, device, dtype
def _load_colvision(model_choice: str):
import torch
from colpali_engine.models import ColPali
from colpali_engine.models.paligemma.colpali.processing_colpali import ColPaliProcessor
from transformers.utils.import_utils import is_flash_attn_2_available
from colpali_engine.models import ColQwen2, ColQwen2Processor
device_str, device, dtype = _select_device_and_dtype()
if model_choice == "colqwen2":
model_name = "vidore/colqwen2-v1.0"
# On CPU/MPS we must avoid flash-attn and stay eager; on CUDA prefer flash-attn if available
attn_implementation = "flash_attention_2" if (device_str == "cuda" and is_flash_attn_2_available()) else "eager"
model = ColQwen2.from_pretrained(
model_name,
            torch_dtype=dtype,  # use the dtype selected above (bf16/fp16 on CUDA, fp32 on MPS/CPU)
device_map=device,
attn_implementation=attn_implementation,
).eval()
processor = ColQwen2Processor.from_pretrained(model_name)
else:
model_name = "vidore/colpali-v1.2"
model = ColPali.from_pretrained(
model_name,
            torch_dtype=dtype,  # use the dtype selected above
device_map=device,
).eval()
processor = cast(ColPaliProcessor, ColPaliProcessor.from_pretrained(model_name))
return model_name, model, processor, device_str, device, dtype
def _embed_images(model, processor, images: List[Image.Image]) -> List[Any]:
import torch
from colpali_engine.utils.torch_utils import ListDataset
from torch.utils.data import DataLoader
# Ensure deterministic eval and autocast for stability
model.eval()
dataloader = DataLoader(
dataset=ListDataset[Image.Image](images),
batch_size=1,
shuffle=False,
collate_fn=lambda x: processor.process_images(x),
)
doc_vecs: List[Any] = []
for batch_doc in dataloader:
with torch.no_grad():
batch_doc = {k: v.to(model.device) for k, v in batch_doc.items()}
# autocast on CUDA for bf16/fp16; on CPU/MPS stay in fp32
if model.device.type == "cuda":
with torch.autocast(device_type="cuda", dtype=model.dtype if model.dtype.is_floating_point else torch.bfloat16):
embeddings_doc = model(**batch_doc)
else:
embeddings_doc = model(**batch_doc)
doc_vecs.extend(list(torch.unbind(embeddings_doc.to("cpu"))))
return doc_vecs
def _embed_queries(model, processor, queries: List[str]) -> List[Any]:
import torch
from colpali_engine.utils.torch_utils import ListDataset
from torch.utils.data import DataLoader
model.eval()
dataloader = DataLoader(
dataset=ListDataset[str](queries),
batch_size=1,
shuffle=False,
collate_fn=lambda x: processor.process_queries(x),
)
q_vecs: List[Any] = []
for batch_query in dataloader:
with torch.no_grad():
batch_query = {k: v.to(model.device) for k, v in batch_query.items()}
if model.device.type == "cuda":
with torch.autocast(device_type="cuda", dtype=model.dtype if model.dtype.is_floating_point else torch.bfloat16):
embeddings_query = model(**batch_query)
else:
embeddings_query = model(**batch_query)
q_vecs.extend(list(torch.unbind(embeddings_query.to("cpu"))))
return q_vecs
def _build_index(index_path: str, doc_vecs: List[Any], filepaths: List[str]) -> LeannMultiVector:
dim = int(doc_vecs[0].shape[-1])
retriever = LeannMultiVector(index_path=index_path, dim=dim)
retriever.create_collection()
for i, vec in enumerate(doc_vecs):
data = {
"colbert_vecs": vec.float().numpy(),
"doc_id": i,
"filepath": filepaths[i],
}
retriever.insert(data)
retriever.create_index()
return retriever
def _load_retriever_if_index_exists(index_path: str, dim: int) -> Optional[LeannMultiVector]:
index_base = Path(index_path)
# Rough heuristic: index dir exists AND meta+labels files exist
meta = index_base.parent / f"{index_base.name}.meta.json"
labels = index_base.parent / f"{index_base.name}.labels.json"
if index_base.exists() and meta.exists() and labels.exists():
return LeannMultiVector(index_path=index_path, dim=dim)
return None
def _generate_similarity_map(
model,
processor,
image: Image.Image,
query: str,
token_idx: Optional[int] = None,
output_path: Optional[str] = None,
) -> Tuple[int, float]:
import torch
from colpali_engine.interpretability import (
get_similarity_maps_from_embeddings,
plot_similarity_map,
)
batch_images = processor.process_images([image]).to(model.device)
batch_queries = processor.process_queries([query]).to(model.device)
with torch.no_grad():
image_embeddings = model.forward(**batch_images)
query_embeddings = model.forward(**batch_queries)
n_patches = processor.get_n_patches(
image_size=image.size,
spatial_merge_size=getattr(model, "spatial_merge_size", None),
)
image_mask = processor.get_image_mask(batch_images)
batched_similarity_maps = get_similarity_maps_from_embeddings(
image_embeddings=image_embeddings,
query_embeddings=query_embeddings,
n_patches=n_patches,
image_mask=image_mask,
)
similarity_maps = batched_similarity_maps[0]
# Determine token index if not provided: choose the token with highest max score
if token_idx is None:
per_token_max = similarity_maps.view(similarity_maps.shape[0], -1).max(dim=1).values
token_idx = int(per_token_max.argmax().item())
max_sim_score = similarity_maps[token_idx, :, :].max().item()
if output_path:
import matplotlib.pyplot as plt
fig, ax = plot_similarity_map(
image=image,
similarity_map=similarity_maps[token_idx],
figsize=(14, 14),
show_colorbar=False,
)
ax.set_title(f"Token #{token_idx}. MaxSim score: {max_sim_score:.2f}", fontsize=12)
os.makedirs(os.path.dirname(output_path), exist_ok=True)
plt.savefig(output_path, bbox_inches="tight")
plt.close(fig)
return token_idx, float(max_sim_score)
class QwenVL:
def __init__(self, device: str):
from transformers.utils.import_utils import is_flash_attn_2_available
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
attn_implementation = "flash_attention_2" if is_flash_attn_2_available() else "eager"
self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
"Qwen/Qwen2.5-VL-3B-Instruct",
torch_dtype="auto",
device_map=device,
attn_implementation=attn_implementation,
)
min_pixels = 256 * 28 * 28
max_pixels = 1280 * 28 * 28
self.processor = AutoProcessor.from_pretrained(
"Qwen/Qwen2.5-VL-3B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels
)
def answer(self, query: str, images: List[Image.Image], max_new_tokens: int = 128) -> str:
from qwen_vl_utils import process_vision_info
import base64
from io import BytesIO
content = []
for img in images:
buffer = BytesIO()
img.save(buffer, format="jpeg")
img_base64 = base64.b64encode(buffer.getvalue()).decode("utf-8")
content.append({"type": "image", "image": f"data:image;base64,{img_base64}"})
content.append({"type": "text", "text": query})
messages = [{"role": "user", "content": content}]
text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
image_inputs, video_inputs = process_vision_info(messages)
inputs = self.processor(text=[text], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt")
inputs = inputs.to(self.model.device)
generated_ids = self.model.generate(**inputs, max_new_tokens=max_new_tokens)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return self.processor.batch_decode(
generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]
#%%
# Step 1: Prepare data
if USE_HF_DATASET:
from datasets import load_dataset
dataset = load_dataset(DATASET_NAME, split=DATASET_SPLIT)
N = len(dataset) if MAX_DOCS is None else min(MAX_DOCS, len(dataset))
filepaths: List[str] = []
images: List[Image.Image] = []
for i in tqdm(range(N), desc="Loading dataset"):
p = dataset[i]
# Compose a descriptive identifier for printing later
identifier = (
f"arXiv:{p['paper_arxiv_id']}|title:{p['paper_title']}|page:{int(p['page_number'])}|id:{p['page_id']}"
)
filepaths.append(identifier)
images.append(p["page_image"]) # PIL Image
else:
_maybe_convert_pdf_to_images(PDF, PAGES_DIR)
filepaths, images = _load_images_from_dir(PAGES_DIR)
if not images:
raise RuntimeError(f"No images found in {PAGES_DIR}. Provide PDF path in PDF variable or ensure images exist.")
#%%
# Step 2: Load model and processor
model_name, model, processor, device_str, device, dtype = _load_colvision(MODEL)
print(f"Using model={model_name}, device={device_str}, dtype={dtype}")
#%%
# Step 3: Build or load index
retriever: Optional[LeannMultiVector] = None
if not REBUILD_INDEX:
try:
one_vec = _embed_images(model, processor, [images[0]])[0]
retriever = _load_retriever_if_index_exists(INDEX_PATH, dim=int(one_vec.shape[-1]))
except Exception:
retriever = None
if retriever is None:
doc_vecs = _embed_images(model, processor, images)
retriever = _build_index(INDEX_PATH, doc_vecs, filepaths)
#%%
# Step 4: Embed query and search
q_vec = _embed_queries(model, processor, [QUERY])[0]
results = retriever.search(q_vec.float().numpy(), topk=TOPK, first_stage_k=FIRST_STAGE_K)
if not results:
print("No results found.")
else:
print(f'Top {len(results)} results for query: "{QUERY}"')
top_images: List[Image.Image] = []
for rank, (score, doc_id) in enumerate(results, start=1):
path = filepaths[doc_id]
# For HF dataset, path is a descriptive identifier, not a real file path
print(f"{rank}) MaxSim: {score:.4f}, Page: {path}")
top_images.append(images[doc_id])
if SAVE_TOP_IMAGE:
from pathlib import Path as _Path
base = _Path(SAVE_TOP_IMAGE)
base.parent.mkdir(parents=True, exist_ok=True)
for rank, img in enumerate(top_images[: TOPK], start=1):
if base.suffix:
out_path = base.parent / f"{base.stem}_rank{rank}{base.suffix}"
else:
out_path = base / f"retrieved_page_rank{rank}.png"
img.save(str(out_path))
print(f"Saved retrieved page (rank {rank}) to: {out_path}")
#%%
# Step 5: Similarity maps for top-K results
if results and SIMILARITY_MAP:
token_idx = None if SIM_TOKEN_IDX < 0 else int(SIM_TOKEN_IDX)
from pathlib import Path as _Path
output_base = _Path(SIM_OUTPUT) if SIM_OUTPUT else None
for rank, img in enumerate(top_images[: TOPK], start=1):
if output_base:
if output_base.suffix:
out_dir = output_base.parent
out_name = f"{output_base.stem}_rank{rank}{output_base.suffix}"
out_path = str(out_dir / out_name)
else:
out_dir = output_base
out_dir.mkdir(parents=True, exist_ok=True)
out_path = str(out_dir / f"similarity_map_rank{rank}.png")
else:
out_path = None
chosen_idx, max_sim = _generate_similarity_map(
model=model,
processor=processor,
image=img,
query=QUERY,
token_idx=token_idx,
output_path=out_path,
)
if out_path:
print(f"Saved similarity map for rank {rank}, token #{chosen_idx} (max={max_sim:.2f}) to: {out_path}")
else:
print(f"Computed similarity map for rank {rank}, token #{chosen_idx} (max={max_sim:.2f})")
#%%
# Step 6: Optional answer generation
if results and ANSWER:
qwen = QwenVL(device=device_str)
response = qwen.answer(QUERY, top_images[: TOPK], max_new_tokens=MAX_NEW_TOKENS)
print("\nAnswer:")
print(response)


@@ -0,0 +1,134 @@
# pip install pdf2image
# pip install pymilvus
# pip install colpali_engine
# pip install tqdm
# pip install pillow
#%%
import os
from pdf2image import convert_from_path
pdf_path = "pdfs/2004.12832v2.pdf"
os.makedirs("pages", exist_ok=True)  # ensure the output directory exists
images = convert_from_path(pdf_path)
for i, image in enumerate(images):
image.save(f"pages/page_{i + 1}.png", "PNG")
#%%
import numpy as np
import concurrent.futures
import os
from pathlib import Path
# Make local leann packages importable without installing
_repo_root = Path(__file__).resolve().parents[3]
_leann_core_src = _repo_root / "packages" / "leann-core" / "src"
_leann_hnsw_pkg = _repo_root / "packages" / "leann-backend-hnsw"
import sys
if str(_leann_core_src) not in sys.path:
sys.path.append(str(_leann_core_src))
if str(_leann_hnsw_pkg) not in sys.path:
sys.path.append(str(_leann_hnsw_pkg))
from leann_multi_vector import LeannMultiVector
class LeannRetriever(LeannMultiVector):
pass
#%%
from colpali_engine.models import ColPali
from colpali_engine.models.paligemma.colpali.processing_colpali import ColPaliProcessor
from colpali_engine.utils.processing_utils import BaseVisualRetrieverProcessor
from colpali_engine.utils.torch_utils import ListDataset, get_torch_device
from torch.utils.data import DataLoader
import torch
from typing import List, cast
# Auto-select device: CUDA > MPS (mac) > CPU
_device_str = (
"cuda" if torch.cuda.is_available() else (
"mps" if getattr(torch.backends, "mps", None) and torch.backends.mps.is_available() else "cpu"
)
)
device = get_torch_device(_device_str)
# Prefer fp16 on GPU/MPS, bfloat16 on CPU
_dtype = (
torch.float16 if _device_str in ("cuda", "mps") else torch.bfloat16
)
model_name = "vidore/colpali-v1.2"
model = ColPali.from_pretrained(
model_name,
torch_dtype=_dtype,
device_map=device,
).eval()
print(f"Using device={_device_str}, dtype={_dtype}")
queries = [
"How to end-to-end retrieval with ColBert",
"Where is ColBERT performance Table, including text representation results?",
]
processor = cast(ColPaliProcessor, ColPaliProcessor.from_pretrained(model_name))
dataloader = DataLoader(
dataset=ListDataset[str](queries),
batch_size=1,
shuffle=False,
collate_fn=lambda x: processor.process_queries(x),
)
qs: List[torch.Tensor] = []
for batch_query in dataloader:
with torch.no_grad():
batch_query = {k: v.to(model.device) for k, v in batch_query.items()}
embeddings_query = model(**batch_query)
qs.extend(list(torch.unbind(embeddings_query.to("cpu"))))
print(qs[0].shape)
#%%
from tqdm import tqdm
from PIL import Image
import os
import re
page_filenames = [n for n in os.listdir("./pages") if n.lower().endswith(".png")]
page_filenames = sorted(page_filenames, key=lambda n: int(re.search(r"\d+", n).group()))
images = [Image.open(os.path.join("./pages", name)) for name in page_filenames]
dataloader = DataLoader(
    dataset=ListDataset[Image.Image](images),
batch_size=1,
shuffle=False,
collate_fn=lambda x: processor.process_images(x),
)
ds: List[torch.Tensor] = []
for batch_doc in tqdm(dataloader):
with torch.no_grad():
batch_doc = {k: v.to(model.device) for k, v in batch_doc.items()}
embeddings_doc = model(**batch_doc)
ds.extend(list(torch.unbind(embeddings_doc.to("cpu"))))
print(ds[0].shape)
#%%
# Build HNSW index via LeannRetriever primitives and run search
index_path = "./indexes/colpali.leann"
retriever = LeannRetriever(index_path=index_path, dim=int(ds[0].shape[-1]))
retriever.create_collection()
filepaths = [os.path.join("./pages", name) for name in page_filenames]
for i in range(len(filepaths)):
data = {
"colbert_vecs": ds[i].float().numpy(),
"doc_id": i,
"filepath": filepaths[i],
}
retriever.insert(data)
retriever.create_index()
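# For each query, retrieve the single best-matching page (topk=1) and print its file path.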
for query in qs:
query_np = query.float().numpy()
result = retriever.search(query_np, topk=1)
print(filepaths[result[0][1]])