From 576beb13db6ae3509275636ce23241f2572883de Mon Sep 17 00:00:00 2001 From: yichuan520030910320 Date: Tue, 23 Sep 2025 23:21:03 -0700 Subject: [PATCH] add doc about multimodal --- .gitignore | 5 + .../vision-based-pdf-multi-vector/README.md | 113 ++++++++++++++++++ ...py => multi-vector-leann-paper-example.py} | 36 ++---- .../multi-vector-leann-similarity-map.py | 6 +- packages/leann-backend-hnsw/third_party/faiss | 2 +- pyproject.toml | 2 +- 6 files changed, 130 insertions(+), 34 deletions(-) create mode 100644 apps/multimodal/vision-based-pdf-multi-vector/README.md rename apps/multimodal/vision-based-pdf-multi-vector/{multi-vector-leann.py => multi-vector-leann-paper-example.py} (89%) diff --git a/.gitignore b/.gitignore index 575798d..b774e4d 100755 --- a/.gitignore +++ b/.gitignore @@ -105,3 +105,8 @@ benchmarks/data/ ## multi vector apps/multimodal/vision-based-pdf-multi-vector/multi-vector-colpali-native-weaviate.py + +# Ignore all PDFs (keep data exceptions above) and do not track demo PDFs. +# If you need to commit a specific demo PDF, uncomment the negation below locally. +# The following line used to force-add a large demo PDF; it is commented out to satisfy pre-commit: +# !apps/multimodal/vision-based-pdf-multi-vector/pdfs/2004.12832v2.pdf diff --git a/apps/multimodal/vision-based-pdf-multi-vector/README.md b/apps/multimodal/vision-based-pdf-multi-vector/README.md new file mode 100644 index 0000000..652f954 --- /dev/null +++ b/apps/multimodal/vision-based-pdf-multi-vector/README.md @@ -0,0 +1,113 @@ +## Vision-based PDF Multi-Vector Demos (macOS/MPS) + +This folder contains two demos to index PDF pages as images and run multi-vector retrieval with ColPali/ColQwen2, plus optional similarity map visualization and answer generation. + +### What you’ll run +- `multi-vector-leann-paper-example.py`: local PDF → pages → embed → build HNSW index → search. 
+- `multi-vector-leann-similarity-map.py`: HF dataset (default) or local pages → embed → index → retrieve → similarity maps → optional Qwen-VL answer. + +## Prerequisites (macOS) + +### 1) Homebrew poppler (for pdf2image) +```bash +brew install poppler +which pdfinfo && pdfinfo -v +``` + +### 2) Python environment +Use uv (recommended) or pip. Python 3.9+. + +Using uv: +```bash +uv pip install \ + colpali_engine \ + pdf2image \ + pillow \ + matplotlib qwen_vl_utils \ + einops \ + seaborn +``` + +Notes: +- On first run, models download from Hugging Face. Login/config if needed. +- The scripts auto-select device: CUDA > MPS > CPU. Verify MPS: +```bash +python -c "import torch; print('MPS available:', bool(getattr(torch.backends, 'mps', None) and torch.backends.mps.is_available()))" +``` + +## Run the demos + +### A) Local PDF example +Converts a local PDF into page images, embeds them, builds an index, and searches. + +```bash +cd apps/multimodal/vision-based-pdf-multi-vector +# If you don't have the sample PDF locally, download it (ignored by Git) +mkdir -p pdfs +curl -L -o pdfs/2004.12832v2.pdf https://arxiv.org/pdf/2004.12832.pdf +ls pdfs/2004.12832v2.pdf +# Ensure output dir exists +mkdir -p pages +python multi-vector-leann-paper-example.py +``` +Expected: +- Page images in `pages/`. +- Console prints like `Using device=mps, dtype=...` and retrieved file paths for queries. + +To use your own PDF: edit `pdf_path` near the top of the script. + +### B) Similarity map + answer demo +Uses HF dataset `weaviate/arXiv-AI-papers-multi-vector` by default; can switch to local pages. 
+ +```bash +cd apps/multimodal/vision-based-pdf-multi-vector +python multi-vector-leann-similarity-map.py +``` +Artifacts (when enabled): +- Retrieved pages: `./figures/retrieved_page_rank{K}.png` +- Similarity maps: `./figures/similarity_map_rank{K}.png` + +Key knobs in the script (top of file): +- `QUERY`: your question +- `MODEL`: `"colqwen2"` or `"colpali"` +- `USE_HF_DATASET`: set `False` to use local pages +- `PDF`, `PAGES_DIR`: for local mode +- `INDEX_PATH`, `TOPK`, `FIRST_STAGE_K`, `REBUILD_INDEX` +- `SIMILARITY_MAP`, `SIM_TOKEN_IDX`, `SIM_OUTPUT` +- `ANSWER`, `MAX_NEW_TOKENS` (Qwen-VL) + +## Troubleshooting +- pdf2image errors on macOS: ensure poppler is installed (`brew install poppler`) and `pdfinfo` works in your terminal. +- Slow or OOM on MPS: reduce dataset size (e.g., set `MAX_DOCS`) or switch to CPU. +- NaNs on MPS: keep fp32 on MPS (default in similarity-map script); avoid fp16 there. +- First-run model downloads can be large; ensure network access (HF mirrors if needed). + +## Notes +- Index files are under `./indexes/`. Delete or set `REBUILD_INDEX=True` to rebuild. +- For local PDFs, page images go to `./pages/`. + + +### Retrieval and Visualization Example + +Example settings in `multi-vector-leann-similarity-map.py`: +- `QUERY = "How does DeepSeek-V2 compare against the LLaMA family of LLMs?"` +- `SIMILARITY_MAP = True` (to generate heatmaps) +- `TOPK = 1` (save the top retrieved page and its similarity map) + +Run: +```bash +cd apps/multimodal/vision-based-pdf-multi-vector +python multi-vector-leann-similarity-map.py +``` + +Outputs (by default): +- Retrieved page: `./figures/retrieved_page_rank1.png` +- Similarity map: `./figures/similarity_map_rank1.png` + +Sample visualization (example result for the query +`QUERY = "How does Vim model performance and efficiency compared to other models?"`): +![Similarity map example](fig/image.png) + +Notes: +- Set `SIM_TOKEN_IDX` to visualize a specific token index; set `-1` to auto-select the most salient token. 
+- If you change `SIM_OUTPUT` to a file path (e.g., `./figures/my_map.png`), multiple ranks are saved as `my_map_rank{K}.png`. diff --git a/apps/multimodal/vision-based-pdf-multi-vector/multi-vector-leann.py b/apps/multimodal/vision-based-pdf-multi-vector/multi-vector-leann-paper-example.py similarity index 89% rename from apps/multimodal/vision-based-pdf-multi-vector/multi-vector-leann.py rename to apps/multimodal/vision-based-pdf-multi-vector/multi-vector-leann-paper-example.py index f022d98..22102d3 100644 --- a/apps/multimodal/vision-based-pdf-multi-vector/multi-vector-leann.py +++ b/apps/multimodal/vision-based-pdf-multi-vector/multi-vector-leann-paper-example.py @@ -4,39 +4,24 @@ # pip install tqdm # pip install pillow -# %% -from pdf2image import convert_from_path - -pdf_path = "pdfs/2004.12832v2.pdf" -images = convert_from_path(pdf_path) - -for i, image in enumerate(images): - image.save(f"pages/page_{i + 1}.png", "PNG") - -# %% import os +import re +import sys from pathlib import Path +from typing import cast -# Make local leann packages importable without installing +from PIL import Image +from tqdm import tqdm + +# Ensure local leann packages are importable before importing them _repo_root = Path(__file__).resolve().parents[3] _leann_core_src = _repo_root / "packages" / "leann-core" / "src" _leann_hnsw_pkg = _repo_root / "packages" / "leann-backend-hnsw" -import sys - if str(_leann_core_src) not in sys.path: sys.path.append(str(_leann_core_src)) if str(_leann_hnsw_pkg) not in sys.path: sys.path.append(str(_leann_hnsw_pkg)) -from leann_multi_vector import LeannMultiVector - - -class LeannRetriever(LeannMultiVector): - pass - - -# %% -from typing import cast import torch from colpali_engine.models import ColPali @@ -88,13 +73,6 @@ for batch_query in dataloader: qs.extend(list(torch.unbind(embeddings_query.to("cpu")))) print(qs[0].shape) # %% - - -import re - -from PIL import Image -from tqdm import tqdm - page_filenames = sorted(os.listdir("./pages"), 
key=lambda n: int(re.search(r"\d+", n).group())) images = [Image.open(os.path.join("./pages", name)) for name in page_filenames] diff --git a/apps/multimodal/vision-based-pdf-multi-vector/multi-vector-leann-similarity-map.py b/apps/multimodal/vision-based-pdf-multi-vector/multi-vector-leann-similarity-map.py index e1e511d..3bf4dfa 100644 --- a/apps/multimodal/vision-based-pdf-multi-vector/multi-vector-leann-similarity-map.py +++ b/apps/multimodal/vision-based-pdf-multi-vector/multi-vector-leann-similarity-map.py @@ -169,7 +169,7 @@ def _embed_images(model, processor, images: list[Image.Image]) -> list[Any]: ) doc_vecs: list[Any] = [] - for batch_doc in dataloader: + for batch_doc in tqdm(dataloader, desc="Embedding images"): with torch.no_grad(): batch_doc = {k: v.to(model.device) for k, v in batch_doc.items()} # autocast on CUDA for bf16/fp16; on CPU/MPS stay in fp32 @@ -200,7 +200,7 @@ def _embed_queries(model, processor, queries: list[str]) -> list[Any]: ) q_vecs: list[Any] = [] - for batch_query in dataloader: + for batch_query in tqdm(dataloader, desc="Embedding queries"): with torch.no_grad(): batch_query = {k: v.to(model.device) for k, v in batch_query.items()} if model.device.type == "cuda": @@ -362,7 +362,7 @@ if USE_HF_DATASET: N = len(dataset) if MAX_DOCS is None else min(MAX_DOCS, len(dataset)) filepaths: list[str] = [] images: list[Image.Image] = [] - for i in tqdm(range(N), desc="Loading dataset"): + for i in tqdm(range(N), desc="Loading dataset", total=N ): p = dataset[i] # Compose a descriptive identifier for printing later identifier = f"arXiv:{p['paper_arxiv_id']}|title:{p['paper_title']}|page:{int(p['page_number'])}|id:{p['page_id']}" diff --git a/packages/leann-backend-hnsw/third_party/faiss b/packages/leann-backend-hnsw/third_party/faiss index 1d51f0c..ed96ff7 160000 --- a/packages/leann-backend-hnsw/third_party/faiss +++ b/packages/leann-backend-hnsw/third_party/faiss @@ -1 +1 @@ -Subproject commit 1d51f0c07420808a18f85a4db6636fd25e4a1daa 
+Subproject commit ed96ff7dbaea0562b994f8ce7823af41884b1010 diff --git a/pyproject.toml b/pyproject.toml index 2d29cad..37d9ac6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -106,7 +106,7 @@ target-version = "py39" line-length = 100 extend-exclude = [ "third_party", - "apps/multimodal/vision-based-pdf-multi-vector/multi-vector-leann.py", + "apps/multimodal/vision-based-pdf-multi-vector/multi-vector-leann-paper-example.py", "apps/multimodal/vision-based-pdf-multi-vector/multi-vector-leann-similarity-map.py" ]