add colqwen stuff and pass ruff

This commit is contained in:
yichuan-w
2025-09-22 22:01:29 +00:00
parent 72455bb269
commit 94d9a203a2
7 changed files with 98815 additions and 99376 deletions

.gitignore vendored
View File

@@ -102,3 +102,6 @@ CLAUDE.local.md
.claude/*.local.*
.claude/local/*
benchmarks/data/
## multi vector
apps/multimodal/vision-based-pdf-multi-vector/multi-vector-colpali-native-weaviate.py

View File

@@ -2,7 +2,6 @@ from __future__ import annotations
import sys
from pathlib import Path
from typing import List, Tuple
import numpy as np
@@ -28,8 +27,8 @@ class LeannMultiVector:
index_path: str,
dim: int = 128,
distance_metric: str = "mips",
M: int = 16,
efConstruction: int = 500,
m: int = 16,
ef_construction: int = 500,
is_compact: bool = False,
is_recompute: bool = False,
embedding_model_name: str = "colvision",
@@ -37,15 +36,15 @@ class LeannMultiVector:
self.index_path = index_path
self.dim = dim
self.embedding_model_name = embedding_model_name
self._pending_items: List[dict] = []
self._pending_items: list[dict] = []
self._backend_kwargs = {
"distance_metric": distance_metric,
"M": M,
"efConstruction": efConstruction,
"M": m,
"efConstruction": ef_construction,
"is_compact": is_compact,
"is_recompute": is_recompute,
}
self._labels_meta: List[dict] = []
self._labels_meta: list[dict] = []
def _meta_dict(self) -> dict:
return {
@@ -85,8 +84,8 @@ class LeannMultiVector:
if not self._pending_items:
return
embeddings: List[np.ndarray] = []
labels_meta: List[dict] = []
embeddings: list[np.ndarray] = []
labels_meta: list[dict] = []
for item in self._pending_items:
doc_id = int(item["doc_id"])
@@ -108,12 +107,15 @@ class LeannMultiVector:
return
embeddings_np = np.vstack(embeddings).astype(np.float32)
# print shape of embeddings_np
print(embeddings_np.shape)
builder = HNSWBuilder(**{**self._backend_kwargs, "dimensions": self.dim})
ids = [str(i) for i in range(embeddings_np.shape[0])]
builder.build(embeddings_np, ids, self.index_path)
import json as _json
with open(self._meta_path(), "w", encoding="utf-8") as f:
_json.dump(self._meta_dict(), f, indent=2)
with open(self._labels_path(), "w", encoding="utf-8") as f:
@@ -127,10 +129,13 @@ class LeannMultiVector:
labels_path = self._labels_path()
if labels_path.exists():
import json as _json
with open(labels_path, encoding="utf-8") as f:
self._labels_meta = _json.load(f)
def search(self, data: np.ndarray, topk: int, first_stage_k: int = 50) -> List[Tuple[float, int]]:
def search(
self, data: np.ndarray, topk: int, first_stage_k: int = 50
) -> list[tuple[float, int]]:
if data.ndim == 1:
data = data.reshape(1, -1)
if data.dtype != np.float32:
@@ -175,5 +180,3 @@ class LeannMultiVector:
scores = sorted(((v, k) for k, v in doc_scores.items()), key=lambda x: x[0], reverse=True)
return scores[:topk] if len(scores) >= topk else scores
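For orientation, here is a minimal usage sketch of the class after this snake_case rename, combining the constructor signature above with the call pattern used by the scripts later in this commit (the index path and query array are illustrative, and an index is assumed to already exist at that path):

import numpy as np

# Illustrative 128-dim multi-vector query, e.g. 20 query-token vectors.
query_multi_vector = np.random.rand(20, 128).astype(np.float32)

retriever = LeannMultiVector(
    index_path="./indexes/example.leann",  # illustrative path
    dim=128,
    m=16,                  # was M
    ef_construction=500,   # was efConstruction
)
# search() returns (score, doc_id) pairs sorted by descending MaxSim-style score;
# first_stage_k presumably controls how many token-level candidates feed the
# per-document aggregation visible at the end of the method above.
results = retriever.search(query_multi_vector, topk=3, first_stage_k=500)
for score, doc_id in results:
    print(f"doc {doc_id}: score {score:.2f}")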

View File

File diff suppressed because one or more lines are too long

View File

@@ -1,611 +0,0 @@
# %% [markdown]
# [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/weaviate/recipes/blob/main/weaviate-features/multi-vector/multi-vector-colipali-rag.ipynb)
# %% [markdown]
# # Multimodal RAG over PDFs using ColQwen2, Qwen2.5, and Weaviate
#
# This notebook demonstrates [Multimodal Retrieval-Augmented Generation (RAG)](https://weaviate.io/blog/multimodal-rag) over PDF documents.
# We perform retrieval against a collection of PDF documents by embedding both the individual document pages and our queries into the same multi-vector space, reducing the problem to approximate nearest-neighbor search over ColBERT-style multi-vector embeddings under the MaxSim similarity measure.
#
# For this purpose, we will use
#
# - **A multimodal [late-interaction model](https://weaviate.io/blog/late-interaction-overview)**, like ColPali and ColQwen2, to generate
# embeddings. This tutorial uses the publicly available model
# [ColQwen2-v1.0](https://huggingface.co/vidore/colqwen2-v1.0) with a permissive Apache 2.0 license.
# - **A Weaviate [vector database](https://weaviate.io/blog/what-is-a-vector-database)**, which has a [multi-vector feature](https://docs.weaviate.io/weaviate/tutorials/multi-vector-embeddings) to effectively index a collection of PDF documents and support textual queries against the contents of the documents, including both text and figures.
# - **A vision language model (VLM)**, specifically [Qwen/Qwen2.5-VL-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct), to support multimodal Retrieval-Augmented Generation (RAG).
#
# Below, you can see the multimodal RAG system overview:
#
# <img src="https://github.com/weaviate/recipes/blob/main/weaviate-features/multi-vector/figures/multimodal-rag-diagram.png?raw=1" width="700px"/>
#
# First, the ingestion pipeline processes the PDF documents as images with the multimodal late-interaction model. The multi-vector embeddings are stored in a vector database.
# Then at query time, the text query is processed by the same multimodal late-interaction model to retrieve the relevant documents.
# The retrieved PDF files are then passed as visual context together with the original user query to the vision language model, which generates a response based on this information.
#
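# %% [markdown]
# To make the MaxSim measure concrete before we dive in, here is a small, self-contained NumPy sketch (the shapes are illustrative, not produced by the models used below): for every query token vector we take its maximum dot product over all page token vectors and sum those maxima to obtain the page score.
# %%
import numpy as np

rng = np.random.default_rng(0)
query_vectors = rng.normal(size=(20, 128)).astype(np.float32)  # e.g. 20 query tokens x 128 dims
page_vectors = rng.normal(size=(755, 128)).astype(np.float32)  # e.g. 755 page patches x 128 dims


def maxsim(query_vecs, page_vecs):
    """ColBERT-style MaxSim: best-matching page vector per query token, summed."""
    token_similarities = query_vecs @ page_vecs.T  # (n_query_tokens, n_page_vectors)
    return float(token_similarities.max(axis=1).sum())


print(maxsim(query_vectors, page_vectors))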
# %% [markdown]
# ## Prerequisites
#
# To run this notebook, you will need a machine capable of running neural networks that use 5-10 GB of memory.
# The demonstration uses two different vision language models that both require several gigabytes of memory.
# See the documentation for each individual model and the general PyTorch docs to figure out how to best run the models on your hardware.
#
# For example, you can run it on:
#
# - Google Colab (using the free-tier T4 GPU)
# - or locally (tested on an M2 Pro Mac).
#
# Furthermore, you will need an instance of Weaviate version >= `1.29.0`.
#
# %% [markdown]
# ## Step 1: Install required libraries
#
# Let's begin by installing and importing the required libraries.
#
# Note that you'll need Python `3.13`.
# %%
%%capture
%pip install colpali_engine weaviate-client qwen_vl_utils
%pip install -q -U "colpali-engine[interpretability]>=0.3.2,<0.4.0"
# %%
import os
import torch
import numpy as np
from google.colab import userdata
from datasets import load_dataset
from transformers.utils.import_utils import is_flash_attn_2_available
from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from colpali_engine.models import ColQwen2, ColQwen2Processor
#from colpali_engine.models import ColPali, ColPaliProcessor # uncomment if you prefer to use ColPali models instead of ColQwen2 models
import weaviate
from weaviate.classes.init import Auth
import weaviate.classes.config as wc
from weaviate.classes.config import Configure
from weaviate.classes.query import MetadataQuery
from qwen_vl_utils import process_vision_info
import base64
from io import BytesIO
import matplotlib.pyplot as plt
from colpali_engine.interpretability import (
get_similarity_maps_from_embeddings,
plot_all_similarity_maps,
plot_similarity_map,
)
# %% [markdown]
# ## Step 2: Load the PDF dataset
#
# Let's start with the data.
# We first load a PDF document dataset of the [top-40 most
# cited AI papers on arXiv](https://arxiv.org/abs/2412.12121) from Hugging Face, covering the period 2023-01-01 to 2024-09-30.
# %%
dataset = load_dataset("weaviate/arXiv-AI-papers-multi-vector", split="train")
# %%
dataset
# %%
dataset[398]
# %% [markdown]
# Let's take a look at a sample document page from the loaded PDF dataset.
# %%
display(dataset[289]["page_image"])
# %% [markdown]
# ![Retrieved page](./figures/retrieved_page.png)
# %% [markdown]
# ## Step 3: Load the ColVision (ColPali or ColQwen2) model
#
# The approach to generate embeddings for this tutorial is outlined in the paper [ColPali: Efficient Document Retrieval with Vision Language Models](https://arxiv.org/abs/2407.01449). The paper demonstrates that it is possible to simplify traditional approaches to preprocessing PDF documents for retrieval:
#
# Traditional PDF processing in RAG systems involves using OCR (Optical Character Recognition) and layout detection software, and separate processing of text, tables, figures, and charts. Additionally, after text extraction, text processing also requires a chunking step. Instead, the ColPali method feeds images (screenshots) of entire PDF pages to a Vision Language Model that produces a ColBERT-style multi-vector embedding.
#
# <img src="https://github.com/weaviate/recipes/blob/main/weaviate-features/multi-vector/figures/colipali_pipeline.jpeg?raw=1" width="700px"/>
#
# There are different ColVision models available, such as ColPali and ColQwen2, which mainly differ in the underlying encoder (Contextualized Late Interaction over Qwen2 vs. over PaliGemma-3B). You can read more about the differences between ColPali and ColQwen2 in our [overview of late-interaction models](https://weaviate.io/blog/late-interaction-overview).
#
# Let's load the [ColQwen2-v1.0](https://huggingface.co/vidore/colqwen2-v1.0) model for this tutorial.
# %%
# Get rid of process forking deadlock warnings.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# %%
if torch.cuda.is_available(): # If GPU available
device = "cuda:0"
elif torch.backends.mps.is_available(): # If Apple Silicon available
device = "mps"
else:
device = "cpu"
if is_flash_attn_2_available():
attn_implementation = "flash_attention_2"
else:
attn_implementation = "eager"
print(f"Using device: {device}")
print(f"Using attention implementation: {attn_implementation}")
# %%
model_name = "vidore/colqwen2-v1.0"
# About a 5 GB download and similar memory usage.
model = ColQwen2.from_pretrained(
model_name,
torch_dtype=torch.bfloat16,
device_map=device,
attn_implementation=attn_implementation,
).eval()
# Load processor
processor = ColQwen2Processor.from_pretrained(model_name)
# %% [markdown]
# This notebook uses the ColQwen2 model because it has a permissive Apache 2.0 license.
# Alternatively, you can also use [ColPali](https://huggingface.co/vidore/colpali-v1.2), which has a Gemma license, or check out other available [ColVision models](https://github.com/illuin-tech/colpali). For a detailed comparison, you can also refer to [ViDoRe: The Visual Document Retrieval Benchmark](https://huggingface.co/spaces/vidore/vidore-leaderboard).
#
# If you want to use ColPali instead of ColQwen2, you can comment out the above code cell and uncomment the code cell below.
# %%
#model_name = "vidore/colpali-v1.2"
# Load model
#colpali_model = ColPali.from_pretrained(
# model_name,
# torch_dtype=torch.bfloat16,
# device_map=device,
# attn_implementation=attn_implementation,
#).eval()
# Load processor
#colpali_processor = ColPaliProcessor.from_pretrained(model_name)
# %% [markdown]
# Before we go further, let's familiarize ourselves with the ColQwen2 model. It can create multi-vector embeddings from both images and text queries. Below you can see examples of each.
#
# %%
# Sample image inputs
images = [
dataset[0]["page_image"],
dataset[1]["page_image"],
]
# Process the inputs
batch_images = processor.process_images(images).to(model.device)
# Forward pass
with torch.no_grad():
query_embedding = model(**batch_images)
print(query_embedding)
print(query_embedding.shape)
# %%
# Sample query inputs
queries = [
"A table with LLM benchmark results.",
"A figure detailing the architecture of a neural network.",
]
# Process the inputs
batch_queries = processor.process_queries(queries).to(model.device)
# Forward pass
with torch.no_grad():
query_embedding = model(**batch_queries)
print(query_embedding)
print(query_embedding.shape)
# %% [markdown]
# Let's write a class to wrap the multimodal late-interaction model and its embedding functionalities for convenience.
#
#
# %%
# A convenience class to wrap the embedding functionality
# of ColVision models like ColPali and ColQwen2
class ColVision:
def __init__(self, model, processor):
"""Initialize with a loaded model and processor."""
self.model = model
self.processor = processor
# A batch size of one appears to be most performant when running on an M4.
# Note: Reducing the image resolution speeds up the vectorizer and produces
# fewer multi-vectors.
def multi_vectorize_image(self, img):
"""Return the multi-vector image of the supplied PIL image."""
image_batch = self.processor.process_images([img]).to(self.model.device)
with torch.no_grad():
image_embedding = self.model(**image_batch)
return image_embedding[0]
def multi_vectorize_text(self, query):
"""Return the multi-vector embedding of the query text string."""
query_batch = self.processor.process_queries([query]).to(self.model.device)
with torch.no_grad():
query_embedding = self.model(**query_batch)
return query_embedding[0]
# Instantiate the model to be used below.
colvision_embedder = ColVision(model, processor)  # uses the ColQwen2 model and processor loaded above
# %% [markdown]
# Let's verify that the embedding of images and queries works as intended.
#
# %%
# Sample image inputs
images = dataset[0]["page_image"]
page_embedding = colvision_embedder.multi_vectorize_image(images)
print(page_embedding.shape) # torch.Size([755, 128])
queries = [
"A table with LLM benchmark results.",
"A figure detailing the architecture of a neural network.",
]
query_embeddings = [colvision_embedder.multi_vectorize_text(q) for q in queries]
print(query_embeddings[0].shape) # torch.Size([20, 128])
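# %% [markdown]
# As a quick sanity check, we can score the two sample queries against the embedded page with the same MaxSim rule that Weaviate will later apply server-side. (This is an illustrative re-implementation in PyTorch, not Weaviate's internal code.) It lets us see which of the two queries is the better match for this page.
# %%
def torch_maxsim(query_embedding, page_embedding):
    """Sum over query tokens of the max dot product against all page vectors."""
    sims = query_embedding.float() @ page_embedding.float().T  # (n_query_tokens, n_page_vectors)
    return sims.max(dim=1).values.sum().item()


for q, q_emb in zip(queries, query_embeddings):
    print(f"MaxSim = {torch_maxsim(q_emb.cpu(), page_embedding.cpu()):.2f} for query: {q!r}")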
# %% [markdown]
# ## Step 4: Connect to a Weaviate vector database instance
#
# Now, you will need to connect to a running Weaviate vector database cluster.
#
# You can choose one of the following options:
#
# 1. **Option 1:** You can create a 14-day free sandbox on the managed service [Weaviate Cloud (WCD)](https://console.weaviate.cloud/)
# 2. **Option 2:** [Embedded Weaviate](https://docs.weaviate.io/deploy/installation-guides/embedded)
# 3. **Option 3:** [Local deployment](https://docs.weaviate.io/deploy/installation-guides/docker-installation)
# 4. [Other options](https://docs.weaviate.io/deploy)
# %%
# Option 1: Weaviate Cloud
WCD_URL = os.environ["WEAVIATE_URL"] # Replace with your Weaviate cluster URL
WCD_AUTH_KEY = os.environ["WEAVIATE_API_KEY"] # Replace with your cluster auth key
# Uncomment if you are working in a Google Colab environment
#WCD_URL = userdata.get("WEAVIATE_URL")
#WCD_AUTH_KEY = userdata.get("WEAVIATE_API_KEY")
# Weaviate Cloud Deployment
client = weaviate.connect_to_weaviate_cloud(
cluster_url=WCD_URL,
auth_credentials=weaviate.auth.AuthApiKey(WCD_AUTH_KEY),
)
# Option 2: Embedded Weaviate instance
# use if you want to explore Weaviate without any additional setup
#client = weaviate.connect_to_embedded()
# Option 3: Locally hosted instance of Weaviate via Docker or Kubernetes
#!docker run --detach -p 8080:8080 -p 50051:50051 cr.weaviate.io/semitechnologies/weaviate:1.29.0
#client = weaviate.connect_to_local()
print(client.is_ready())
# %% [markdown]
# For this tutorial, you will need Weaviate `v1.29.0` or higher.
# Let's make sure we have the required version:
# %%
client.get_meta()['version']
# %% [markdown]
# ## Step 5: Create a collection
#
# Next, we will create a collection that will hold the embeddings of the images of the PDF document pages.
#
# We will not define a built-in vectorizer but use the [Bring Your Own Vectors (BYOV) approach](https://docs.weaviate.io/weaviate/starter-guides/custom-vectors), where we manually embed queries and PDF documents at ingestion and query time.
#
# Additionally, if you are interested in using the [MUVERA encoding algorithm](https://weaviate.io/blog/muvera) for multi-vector embeddings, you can uncomment it in the code below.
# %%
collection_name = "PDFDocuments"
# %%
# Delete the collection if it already exists
# Note: in practice, you shouldn't rerun this cell, as it deletes your data
# in "PDFDocuments", which you would then have to re-import.
#if client.collections.exists(collection_name):
# client.collections.delete(collection_name)
# Create a collection
collection = client.collections.create(
name=collection_name,
properties=[
wc.Property(name="page_id", data_type=wc.DataType.INT),
wc.Property(name="dataset_index", data_type=wc.DataType.INT),
wc.Property(name="paper_title", data_type=wc.DataType.TEXT),
wc.Property(name="paper_arxiv_id", data_type=wc.DataType.TEXT),
wc.Property(name="page_number", data_type=wc.DataType.INT),
],
vector_config=[
Configure.MultiVectors.self_provided(
name="colqwen",
#encoding=Configure.VectorIndex.MultiVector.Encoding.muvera(),
vector_index_config=Configure.VectorIndex.hnsw(
multi_vector=Configure.VectorIndex.MultiVector.multi_vector()
)
)]
)
# %% [markdown]
# ## Step 6: Uploading the vectors to Weaviate
#
# In this step, we're indexing the vectors into our Weaviate Collection in batches.
#
# For each batch, the images are processed and encoded using the ColVision model (here, ColQwen2), turning them into multi-vector embeddings.
# These embeddings are then converted from tensors into lists of vectors, capturing key details from each image and creating a multi-vector representation for each document.
# This setup works well with Weaviate's multi-vector capabilities.
#
# After processing, the vectors and any metadata are uploaded to Weaviate, gradually building up the index.
# You can lower or increase the `batch_size` depending on your available GPU resources.
# %%
# Map of page ids to images to support displaying the image corresponding to a
# particular page id.
page_images = {}
with collection.batch.dynamic() as batch:
for i in range(len(dataset)):
p = dataset[i]
page_images[p["page_id"]] = p["page_image"]
batch.add_object(
properties={
"page_id": p["page_id"],
"paper_title": p["paper_title"],
"paper_arxiv_id": p["paper_arxiv_id"],
"page_number": p["page_number"],
},
vector={"colqwen": colvision_embedder.multi_vectorize_image(p["page_image"]).cpu().float().numpy().tolist()})
if i % 25 == 0:
print(f"Added {i+1}/{len(dataset)} Page objects to Weaviate.")
batch.flush()
# Delete dataset after creating page_images dict to hold the images
del dataset
# %%
len(collection)
# %% [markdown]
# ## Step 7: Multimodal Retrieval Query
#
# As an example of what we are going to build, consider the following actual demo query and resulting PDF page from our collection (nearest neighbor):
#
# - Query: "How does DeepSeek-V2 compare against the LLaMA family of LLMs?"
# - Nearest neighbor: "DeepSeek-V2: A Strong Economical and Efficient Mixture-of-Experts Language Model" (arXiv: 2405.04434), Page: 1.
#
# %%
query = "How does DeepSeek-V2 compare against the LLaMA family of LLMs?"
# %% [markdown]
# By inspecting the first page of the [DeepSeek-V2 paper](https://arxiv.org/abs/2405.04434), we see that it does indeed contain a figure that is relevant for answering our query:
#
# <img src="https://github.com/weaviate/recipes/blob/main/weaviate-features/multi-vector/figures/deepseek_efficiency.jpeg?raw=1" width="700px"/>
# %% [markdown]
# Note: To avoid `OutOfMemoryError` on freely available resources like Google Colab, we will only retrieve a single document. If you have resources with more memory available, you can set the `limit` parameter to a higher value, e.g., `limit=3`, to increase the number of retrieved PDF pages.
# %%
response = collection.query.near_vector(
near_vector=colvision_embedder.multi_vectorize_text(query).cpu().float().numpy(),
target_vector="colqwen",
limit=1,
return_metadata=MetadataQuery(distance=True), # Needed to return MaxSim score
)
print(f"The most relevant documents for the query \"{query}\" by order of relevance:\n")
result_images = []
for i, o in enumerate(response.objects):
p = o.properties
print(
f"{i+1}) MaxSim: {-o.metadata.distance:.2f}, "
+ f"Title: \"{p['paper_title']}\" "
+ f"(arXiv: {p['paper_arxiv_id']}), "
+ f"Page: {int(p['page_number'])}"
)
result_images.append(page_images[p["page_id"]])
# %% [markdown]
# The retrieved page with the highest MaxSim score is indeed the page with the figure we mentioned earlier.
# %%
closest_page_id = response.objects[0].properties['page_id']
image = page_images[closest_page_id]
display(image)
# %% [markdown]
# ![Retrieved page](./figures/retrieved_page.png)
#
# Let's visualize the similarity maps for the retrieved PDF document page to see the semantic similarity between each token in the user query and the image patches. This is an optional step.
# %%
# Preprocess inputs
batch_images = processor.process_images([image]).to(device)
batch_queries = processor.process_queries([query]).to(device)
# Forward passes
with torch.no_grad():
image_embeddings = model.forward(**batch_images)
query_embeddings = model.forward(**batch_queries)
# Get the number of image patches
n_patches = processor.get_n_patches(
image_size=image.size,
spatial_merge_size=model.spatial_merge_size,
)
# Get the tensor mask to filter out the embeddings that are not related to the image
image_mask = processor.get_image_mask(batch_images)
# Generate the similarity maps
batched_similarity_maps = get_similarity_maps_from_embeddings(
image_embeddings=image_embeddings,
query_embeddings=query_embeddings,
n_patches=n_patches,
image_mask=image_mask,
)
# Get the similarity map for our (only) input image
similarity_maps = batched_similarity_maps[0] # (query_length, n_patches_x, n_patches_y)
print(f"Similarity map shape: (query_length, n_patches_x, n_patches_y) = {tuple(similarity_maps.shape)}")
# %%
# Remove the padding tokens and the query augmentation tokens
query_content = processor.decode(batch_queries.input_ids[0])
query_content = query_content.replace(processor.tokenizer.pad_token, "")
query_content = query_content.replace(processor.query_augmentation_token, "").strip()
# Retokenize the cleaned query
query_tokens = processor.tokenizer.tokenize(query_content)
# Use this cell output to choose a token using its index
for idx, val in enumerate(query_tokens):
print(f"{idx}: {val}")
# %% [markdown]
# Let's check the similarity plot for the token "MA" in "LLaMA". (Note that similarity maps are created for each token separately.)
# %%
token_idx = 13
fig, ax = plot_similarity_map(
image=image,
similarity_map=similarity_maps[token_idx],
figsize=(18, 18),
show_colorbar=False,
)
max_sim_score = similarity_maps[token_idx, :, :].max().item()
ax.set_title(f"Token #{token_idx}: `{query_tokens[token_idx]}`. MaxSim score: {max_sim_score:.2f}", fontsize=14)
plt.show()
# %% [markdown]
# ![Similarity map](./figures/similarity_map.png)
# %%
# Delete variables used for visualization
del batched_similarity_maps, similarity_maps, n_patches, query_content, query_tokens, token_idx
# %% [markdown]
# ## Step 8: Extension to Multimodal RAG using Qwen2.5
#
# The above example gives us the most relevant pages to begin looking at to answer our query. Let's extend this multimodal document retrieval pipeline to a multimodal RAG pipeline.
#
# Vision language models (VLMs) are Large Language Models with vision capabilities. They are now powerful enough that we can give the query and relevant pages to such a model and have it produce an answer to our query in plain text.
#
# To accomplish this we are going to feed the top results into the
# state-of-the-art VLM [Qwen/Qwen2.5-VL-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct).
# %%
# Setting up Qwen2.5-VL-3B-Instruct for generating answers from a query string
# plus a collection of (images of) PDF pages.
class QwenVL:
def __init__(self):
# Adjust the settings to your available architecture, see the link
# https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct for examples.
self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
"Qwen/Qwen2.5-VL-3B-Instruct",
torch_dtype=torch.bfloat16,
device_map=device,
attn_implementation=attn_implementation,
)
min_pixels = 256*28*28
max_pixels = 1280*28*28
self.processor = AutoProcessor.from_pretrained(
"Qwen/Qwen2.5-VL-3B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels)
def query_images(self, query, images):
"""Generate a textual response to the query (text) based on the information in the supplied list of PIL images."""
# Preparation for inference.
# Convert the images to base64 strings.
content = []
for img in images:
buffer = BytesIO()
img.save(buffer, format="jpeg")
img_base64 = base64.b64encode(buffer.getvalue()).decode("utf-8")
content.append({"type": "image", "image": f"data:image;base64,{img_base64}"})
content.append({"type": "text", "text": query})
messages = [{"role": "user", "content": content}]
text = self.processor.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = self.processor(
text=[text],
images=image_inputs,
videos=video_inputs,
padding=True,
return_tensors="pt",
)
inputs = inputs.to(device)
# Inference: Generation of the output.
generated_ids = self.model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [
out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
return self.processor.batch_decode(
generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]
# Instantiate the model to be used below.
qwenvl = QwenVL()
# %% [markdown]
# The response from `Qwen2.5-VL-3B-Instruct` based on the retrieved PDF pages:
# %%
qwenvl.query_images(query, result_images)
# %% [markdown]
# As you can see, the multimodal RAG pipeline was able to answer the original query: "How does DeepSeek-V2 compare against the LLaMA family of LLMs?". The ColQwen2 retrieval model retrieved the correct PDF page from the
# "DeepSeek-V2: A Strong Economical and Efficient Mixture-of-Experts Language Model" paper, and the Qwen2.5 VLM then used both the text and the figures on that page to answer the question.
# %% [markdown]
# ## Summary
#
# This notebook demonstrates a multimodal RAG pipeline over PDF documents using ColQwen2 for multi-vector embeddings, a Weaviate vector database for storage and retrieval, and Qwen2.5-VL-3B-Instruct for generating answers.
# %% [markdown]
# ## References
#
# - Faysse, M., Sibille, H., Wu, T., Omrani, B., Viaud, G., Hudelot, C., Colombo, P. (2024). ColPali: Efficient Document Retrieval with Vision Language Models. arXiv. https://doi.org/10.48550/arXiv.2407.01449
# - [ColPali GitHub repository](https://github.com/illuin-tech/colpali)
# - [ColPali Cookbook](https://github.com/tonywu71/colpali-cookbooks)

View File

@@ -1,13 +1,12 @@
## Jupyter-style notebook script
#%%
# %%
# uv pip install matplotlib qwen_vl_utils
import os
import re
import sys
from pathlib import Path
from typing import List, Optional, Tuple, cast, Any
from typing import Any, Optional, cast
import numpy as np
from PIL import Image
from tqdm import tqdm
@@ -27,7 +26,7 @@ _ensure_repo_paths_importable(__file__)
from leann_multi_vector import LeannMultiVector # noqa: E402
#%%
# %%
# Config
os.environ["TOKENIZERS_PARALLELISM"] = "false"
QUERY = "How does DeepSeek-V2 compare against the LLaMA family of LLMs?"
@@ -46,26 +45,26 @@ PAGES_DIR: str = "./pages"
# Index + retrieval settings
INDEX_PATH: str = "./indexes/colvision.leann"
TOPK: int = 1
FIRST_STAGE_K: int = 50
FIRST_STAGE_K: int = 500
REBUILD_INDEX: bool = False
# Artifacts
SAVE_TOP_IMAGE: Optional[str] = "./figures/retrieved_page.png"
SIMILARITY_MAP: bool = True
SIM_TOKEN_IDX: int = -1 # -1 means auto-select the most salient token
SIM_TOKEN_IDX: int = 13 # -1 means auto-select the most salient token
SIM_OUTPUT: str = "./figures/similarity_map.png"
ANSWER: bool = True
MAX_NEW_TOKENS: int = 128
#%%
# %%
# Helpers
def _natural_sort_key(name: str) -> int:
m = re.search(r"\d+", name)
return int(m.group()) if m else 0
def _load_images_from_dir(pages_dir: str) -> Tuple[List[str], List[Image.Image]]:
def _load_images_from_dir(pages_dir: str) -> tuple[list[str], list[Image.Image]]:
filenames = [n for n in os.listdir(pages_dir) if n.lower().endswith((".png", ".jpg", ".jpeg"))]
filenames = sorted(filenames, key=_natural_sort_key)
filepaths = [os.path.join(pages_dir, n) for n in filenames]
@@ -80,7 +79,9 @@ def _maybe_convert_pdf_to_images(pdf_path: Optional[str], pages_dir: str, dpi: i
try:
from pdf2image import convert_from_path
except Exception as e:
raise RuntimeError("pdf2image is required to convert PDF to images. Install via pip install pdf2image") from e
raise RuntimeError(
"pdf2image is required to convert PDF to images. Install via pip install pdf2image"
) from e
images = convert_from_path(pdf_path, dpi=dpi)
for i, image in enumerate(images):
image.save(os.path.join(pages_dir, f"page_{i + 1}.png"), "PNG")
@@ -93,7 +94,11 @@ def _select_device_and_dtype():
device_str = (
"cuda"
if torch.cuda.is_available()
else ("mps" if getattr(torch.backends, "mps", None) and torch.backends.mps.is_available() else "cpu")
else (
"mps"
if getattr(torch.backends, "mps", None) and torch.backends.mps.is_available()
else "cpu"
)
)
device = get_torch_device(device_str)
# Stable dtype selection to avoid NaNs:
@@ -115,17 +120,20 @@ def _select_device_and_dtype():
def _load_colvision(model_choice: str):
import torch
from colpali_engine.models import ColPali
from colpali_engine.models import ColPali, ColQwen2, ColQwen2Processor
from colpali_engine.models.paligemma.colpali.processing_colpali import ColPaliProcessor
from transformers.utils.import_utils import is_flash_attn_2_available
from colpali_engine.models import ColQwen2, ColQwen2Processor
device_str, device, dtype = _select_device_and_dtype()
if model_choice == "colqwen2":
model_name = "vidore/colqwen2-v1.0"
# On CPU/MPS we must avoid flash-attn and stay eager; on CUDA prefer flash-attn if available
attn_implementation = "flash_attention_2" if (device_str == "cuda" and is_flash_attn_2_available()) else "eager"
attn_implementation = (
"flash_attention_2"
if (device_str == "cuda" and is_flash_attn_2_available())
else "eager"
)
model = ColQwen2.from_pretrained(
model_name,
torch_dtype=torch.bfloat16,
@@ -145,7 +153,7 @@ def _load_colvision(model_choice: str):
return model_name, model, processor, device_str, device, dtype
def _embed_images(model, processor, images: List[Image.Image]) -> List[Any]:
def _embed_images(model, processor, images: list[Image.Image]) -> list[Any]:
import torch
from colpali_engine.utils.torch_utils import ListDataset
from torch.utils.data import DataLoader
@@ -160,13 +168,16 @@ def _embed_images(model, processor, images: List[Image.Image]) -> List[Any]:
collate_fn=lambda x: processor.process_images(x),
)
doc_vecs: List[Any] = []
doc_vecs: list[Any] = []
for batch_doc in dataloader:
with torch.no_grad():
batch_doc = {k: v.to(model.device) for k, v in batch_doc.items()}
# autocast on CUDA for bf16/fp16; on CPU/MPS stay in fp32
if model.device.type == "cuda":
with torch.autocast(device_type="cuda", dtype=model.dtype if model.dtype.is_floating_point else torch.bfloat16):
with torch.autocast(
device_type="cuda",
dtype=model.dtype if model.dtype.is_floating_point else torch.bfloat16,
):
embeddings_doc = model(**batch_doc)
else:
embeddings_doc = model(**batch_doc)
@@ -174,7 +185,7 @@ def _embed_images(model, processor, images: List[Image.Image]) -> List[Any]:
return doc_vecs
def _embed_queries(model, processor, queries: List[str]) -> List[Any]:
def _embed_queries(model, processor, queries: list[str]) -> list[Any]:
import torch
from colpali_engine.utils.torch_utils import ListDataset
from torch.utils.data import DataLoader
@@ -188,12 +199,15 @@ def _embed_queries(model, processor, queries: List[str]) -> List[Any]:
collate_fn=lambda x: processor.process_queries(x),
)
q_vecs: List[Any] = []
q_vecs: list[Any] = []
for batch_query in dataloader:
with torch.no_grad():
batch_query = {k: v.to(model.device) for k, v in batch_query.items()}
if model.device.type == "cuda":
with torch.autocast(device_type="cuda", dtype=model.dtype if model.dtype.is_floating_point else torch.bfloat16):
with torch.autocast(
device_type="cuda",
dtype=model.dtype if model.dtype.is_floating_point else torch.bfloat16,
):
embeddings_query = model(**batch_query)
else:
embeddings_query = model(**batch_query)
@@ -201,7 +215,7 @@ def _embed_queries(model, processor, queries: List[str]) -> List[Any]:
return q_vecs
def _build_index(index_path: str, doc_vecs: List[Any], filepaths: List[str]) -> LeannMultiVector:
def _build_index(index_path: str, doc_vecs: list[Any], filepaths: list[str]) -> LeannMultiVector:
dim = int(doc_vecs[0].shape[-1])
retriever = LeannMultiVector(index_path=index_path, dim=dim)
retriever.create_collection()
@@ -233,7 +247,7 @@ def _generate_similarity_map(
query: str,
token_idx: Optional[int] = None,
output_path: Optional[str] = None,
) -> Tuple[int, float]:
) -> tuple[int, float]:
import torch
from colpali_engine.interpretability import (
get_similarity_maps_from_embeddings,
@@ -288,8 +302,8 @@ def _generate_similarity_map(
class QwenVL:
def __init__(self, device: str):
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
from transformers.utils.import_utils import is_flash_attn_2_available
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
attn_implementation = "flash_attention_2" if is_flash_attn_2_available() else "eager"
self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
@@ -305,11 +319,12 @@ class QwenVL:
"Qwen/Qwen2.5-VL-3B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels
)
def answer(self, query: str, images: List[Image.Image], max_new_tokens: int = 128) -> str:
from qwen_vl_utils import process_vision_info
def answer(self, query: str, images: list[Image.Image], max_new_tokens: int = 128) -> str:
import base64
from io import BytesIO
from qwen_vl_utils import process_vision_info
content = []
for img in images:
buffer = BytesIO()
@@ -319,17 +334,25 @@ class QwenVL:
content.append({"type": "text", "text": query})
messages = [{"role": "user", "content": content}]
text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
text = self.processor.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = self.processor(text=[text], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt")
inputs = self.processor(
text=[text], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt"
)
inputs = inputs.to(self.model.device)
generated_ids = self.model.generate(**inputs, max_new_tokens=max_new_tokens)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
generated_ids_trimmed = [
out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
return self.processor.batch_decode(
generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]
#%%
# %%
# Step 1: Prepare data
if USE_HF_DATASET:
@@ -337,32 +360,33 @@ if USE_HF_DATASET:
dataset = load_dataset(DATASET_NAME, split=DATASET_SPLIT)
N = len(dataset) if MAX_DOCS is None else min(MAX_DOCS, len(dataset))
filepaths: List[str] = []
images: List[Image.Image] = []
filepaths: list[str] = []
images: list[Image.Image] = []
for i in tqdm(range(N), desc="Loading dataset"):
p = dataset[i]
# Compose a descriptive identifier for printing later
identifier = (
f"arXiv:{p['paper_arxiv_id']}|title:{p['paper_title']}|page:{int(p['page_number'])}|id:{p['page_id']}"
)
identifier = f"arXiv:{p['paper_arxiv_id']}|title:{p['paper_title']}|page:{int(p['page_number'])}|id:{p['page_id']}"
print(identifier)
filepaths.append(identifier)
images.append(p["page_image"]) # PIL Image
else:
_maybe_convert_pdf_to_images(PDF, PAGES_DIR)
filepaths, images = _load_images_from_dir(PAGES_DIR)
if not images:
raise RuntimeError(f"No images found in {PAGES_DIR}. Provide PDF path in PDF variable or ensure images exist.")
raise RuntimeError(
f"No images found in {PAGES_DIR}. Provide PDF path in PDF variable or ensure images exist."
)
#%%
# %%
# Step 2: Load model and processor
model_name, model, processor, device_str, device, dtype = _load_colvision(MODEL)
print(f"Using model={model_name}, device={device_str}, dtype={dtype}")
#%%
# %%
#%%
# %%
# Step 3: Build or load index
retriever: Optional[LeannMultiVector] = None
if not REBUILD_INDEX:
@@ -377,8 +401,7 @@ if retriever is None:
retriever = _build_index(INDEX_PATH, doc_vecs, filepaths)
#%%
# %%
# Step 4: Embed query and search
q_vec = _embed_queries(model, processor, [QUERY])[0]
results = retriever.search(q_vec.float().numpy(), topk=TOPK, first_stage_k=FIRST_STAGE_K)
@@ -386,7 +409,7 @@ if not results:
print("No results found.")
else:
print(f'Top {len(results)} results for query: "{QUERY}"')
top_images: List[Image.Image] = []
top_images: list[Image.Image] = []
for rank, (score, doc_id) in enumerate(results, start=1):
path = filepaths[doc_id]
# For HF dataset, path is a descriptive identifier, not a real file path
@@ -395,9 +418,10 @@ else:
if SAVE_TOP_IMAGE:
from pathlib import Path as _Path
base = _Path(SAVE_TOP_IMAGE)
base.parent.mkdir(parents=True, exist_ok=True)
for rank, img in enumerate(top_images[: TOPK], start=1):
for rank, img in enumerate(top_images[:TOPK], start=1):
if base.suffix:
out_path = base.parent / f"{base.stem}_rank{rank}{base.suffix}"
else:
@@ -405,14 +429,16 @@ else:
img.save(str(out_path))
print(f"Saved retrieved page (rank {rank}) to: {out_path}")
## TODO: strange result: the second page of DeepSeek-V2 is retrieved rather than the first page
#%%
# %%
# Step 5: Similarity maps for top-K results
if results and SIMILARITY_MAP:
token_idx = None if SIM_TOKEN_IDX < 0 else int(SIM_TOKEN_IDX)
from pathlib import Path as _Path
output_base = _Path(SIM_OUTPUT) if SIM_OUTPUT else None
for rank, img in enumerate(top_images[: TOPK], start=1):
for rank, img in enumerate(top_images[:TOPK], start=1):
if output_base:
if output_base.suffix:
out_dir = output_base.parent
@@ -433,17 +459,19 @@ if results and SIMILARITY_MAP:
output_path=out_path,
)
if out_path:
print(f"Saved similarity map for rank {rank}, token #{chosen_idx} (max={max_sim:.2f}) to: {out_path}")
print(
f"Saved similarity map for rank {rank}, token #{chosen_idx} (max={max_sim:.2f}) to: {out_path}"
)
else:
print(f"Computed similarity map for rank {rank}, token #{chosen_idx} (max={max_sim:.2f})")
print(
f"Computed similarity map for rank {rank}, token #{chosen_idx} (max={max_sim:.2f})"
)
#%%
# %%
# Step 6: Optional answer generation
if results and ANSWER:
qwen = QwenVL(device=device_str)
response = qwen.answer(QUERY, top_images[: TOPK], max_new_tokens=MAX_NEW_TOKENS)
response = qwen.answer(QUERY, top_images[:TOPK], max_new_tokens=MAX_NEW_TOKENS)
print("\nAnswer:")
print(response)

View File

@@ -4,7 +4,7 @@
# pip install tqdm
# pip install pillow
#%%
# %%
from pdf2image import convert_from_path
pdf_path = "pdfs/2004.12832v2.pdf"
@@ -13,9 +13,7 @@ images = convert_from_path(pdf_path)
for i, image in enumerate(images):
image.save(f"pages/page_{i + 1}.png", "PNG")
#%%
import numpy as np
import concurrent.futures
# %%
import os
from pathlib import Path
@@ -24,6 +22,7 @@ _repo_root = Path(__file__).resolve().parents[3]
_leann_core_src = _repo_root / "packages" / "leann-core" / "src"
_leann_hnsw_pkg = _repo_root / "packages" / "leann-backend-hnsw"
import sys
if str(_leann_core_src) not in sys.path:
sys.path.append(str(_leann_core_src))
if str(_leann_hnsw_pkg) not in sys.path:
@@ -34,26 +33,30 @@ from leann_multi_vector import LeannMultiVector
class LeannRetriever(LeannMultiVector):
pass
#%%
# %%
from typing import cast
import torch
from colpali_engine.models import ColPali
from colpali_engine.models.paligemma.colpali.processing_colpali import ColPaliProcessor
from colpali_engine.utils.processing_utils import BaseVisualRetrieverProcessor
from colpali_engine.utils.torch_utils import ListDataset, get_torch_device
from torch.utils.data import DataLoader
import torch
from typing import List, cast
# Auto-select device: CUDA > MPS (mac) > CPU
_device_str = (
"cuda" if torch.cuda.is_available() else (
"mps" if getattr(torch.backends, "mps", None) and torch.backends.mps.is_available() else "cpu"
"cuda"
if torch.cuda.is_available()
else (
"mps"
if getattr(torch.backends, "mps", None) and torch.backends.mps.is_available()
else "cpu"
)
)
device = get_torch_device(_device_str)
# Prefer fp16 on GPU/MPS, bfloat16 on CPU
_dtype = (
torch.float16 if _device_str in ("cuda", "mps") else torch.bfloat16
)
_dtype = torch.float16 if _device_str in ("cuda", "mps") else torch.bfloat16
model_name = "vidore/colpali-v1.2"
model = ColPali.from_pretrained(
@@ -77,21 +80,21 @@ dataloader = DataLoader(
collate_fn=lambda x: processor.process_queries(x),
)
qs: List[torch.Tensor] = []
qs: list[torch.Tensor] = []
for batch_query in dataloader:
with torch.no_grad():
batch_query = {k: v.to(model.device) for k, v in batch_query.items()}
embeddings_query = model(**batch_query)
qs.extend(list(torch.unbind(embeddings_query.to("cpu"))))
print(qs[0].shape)
#%%
# %%
from tqdm import tqdm
from PIL import Image
import os
import re
from PIL import Image
from tqdm import tqdm
page_filenames = sorted(os.listdir("./pages"), key=lambda n: int(re.search(r"\d+", n).group()))
images = [Image.open(os.path.join("./pages", name)) for name in page_filenames]
@@ -102,7 +105,7 @@ dataloader = DataLoader(
collate_fn=lambda x: processor.process_images(x),
)
ds: List[torch.Tensor] = []
ds: list[torch.Tensor] = []
for batch_doc in tqdm(dataloader):
with torch.no_grad():
batch_doc = {k: v.to(model.device) for k, v in batch_doc.items()}
@@ -111,7 +114,7 @@ for batch_doc in tqdm(dataloader):
print(ds[0].shape)
#%%
# %%
# Build HNSW index via LeannRetriever primitives and run search
index_path = "./indexes/colpali.leann"
retriever = LeannRetriever(index_path=index_path, dim=int(ds[0].shape[-1]))
@@ -129,6 +132,3 @@ for query in qs:
query_np = query.float().numpy()
result = retriever.search(query_np, topk=1)
print(filepaths[result[0][1]])

View File

@@ -104,7 +104,11 @@ astchunk = { path = "packages/astchunk-leann", editable = true }
[tool.ruff]
target-version = "py39"
line-length = 100
extend-exclude = ["third_party"]
extend-exclude = [
"third_party",
"apps/multimodal/vision-based-pdf-multi-vector/multi-vector-leann.py",
"apps/multimodal/vision-based-pdf-multi-vector/multi-vector-leann-similarity-map.py"
]
[tool.ruff.lint]