Compare commits: fix/drop-p... → feature/cu...

4 Commits

| Author | SHA1 | Date |
|---|---|---|
| | 9996c29618 | |
| | 12951ad4d5 | |
| | a878d2459b | |
| | 6c39a3427f | |
.github/workflows/build-reusable.yml (vendored, 85 changed lines)
@@ -35,8 +35,8 @@ jobs:
     strategy:
       matrix:
         include:
-          # Note: Python 3.9 dropped - uses PEP 604 union syntax (str | None)
-          # which requires Python 3.10+
+          - os: ubuntu-22.04
+            python: '3.9'
           - os: ubuntu-22.04
             python: '3.10'
           - os: ubuntu-22.04
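The note removed on the left explains the original 3.9 drop: PEP 604 unions (`str | None`) are evaluated when the enclosing `def` executes, so they raise `TypeError` on Python 3.9 unless evaluation is deferred. A minimal standalone illustration (not code from this repo):

```python
from typing import Optional

# Python 3.10+ only: the PEP 604 union below is evaluated at definition
# time, so importing this module on 3.9 raises
#   TypeError: unsupported operand type(s) for |: 'type' and 'NoneType'
def greet(name: str | None = None) -> str:
    return f"hello, {name or 'world'}"

# 3.9-compatible spellings: typing.Optional / typing.Union, or defer all
# annotation evaluation module-wide with `from __future__ import annotations`.
def greet_39(name: Optional[str] = None) -> str:
    return f"hello, {name or 'world'}"

# Builtin generics such as list[str] are fine on 3.9 (PEP 585), which is
# why the head branch can keep list[str] annotations while re-adding 3.9.
```

The right-hand side can re-add `python: '3.9'` because the `Union`-typed signatures are narrowed to `list[str]` later in this diff.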
@@ -46,6 +46,8 @@ jobs:
           - os: ubuntu-22.04
             python: '3.13'
           # ARM64 Linux builds
+          - os: ubuntu-24.04-arm
+            python: '3.9'
           - os: ubuntu-24.04-arm
             python: '3.10'
           - os: ubuntu-24.04-arm
@@ -54,6 +56,8 @@ jobs:
             python: '3.12'
           - os: ubuntu-24.04-arm
             python: '3.13'
+          - os: macos-14
+            python: '3.9'
           - os: macos-14
             python: '3.10'
           - os: macos-14
@@ -62,6 +66,8 @@ jobs:
             python: '3.12'
           - os: macos-14
             python: '3.13'
+          - os: macos-15
+            python: '3.9'
           - os: macos-15
             python: '3.10'
           - os: macos-15
@@ -70,24 +76,16 @@ jobs:
             python: '3.12'
           - os: macos-15
             python: '3.13'
-          # Intel Mac builds (x86_64) - replaces deprecated macos-13
-          # Note: Python 3.13 excluded - PyTorch has no wheels for macOS x86_64 + Python 3.13
-          # (PyTorch <=2.4.1 lacks cp313, PyTorch >=2.5.0 dropped Intel Mac support)
-          - os: macos-15-intel
+          - os: macos-13
+            python: '3.9'
+          - os: macos-13
             python: '3.10'
-          - os: macos-15-intel
+          - os: macos-13
             python: '3.11'
-          - os: macos-15-intel
+          - os: macos-13
             python: '3.12'
-          # macOS 26 (beta) - arm64
-          - os: macos-26
-            python: '3.10'
-          - os: macos-26
-            python: '3.11'
-          - os: macos-26
-            python: '3.12'
-          - os: macos-26
-            python: '3.13'
+          # Note: macos-13 + Python 3.13 excluded due to PyTorch compatibility
+          # (PyTorch 2.5+ supports Python 3.13 but not Intel Mac x86_64)
     runs-on: ${{ matrix.os }}

     steps:
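The comments exchanged in this hunk assert specific PyTorch wheel gaps (no cp313 wheels through 2.4.1, no Intel-Mac wheels from 2.5.0). A hedged way to check such claims against PyPI's public JSON API; the version numbers below come from the diff's comments, not from re-verification:

```python
import json
import urllib.request

def torch_wheel_names(version: str) -> list[str]:
    """Return the wheel filenames PyPI lists for a given torch release."""
    url = f"https://pypi.org/pypi/torch/{version}/json"
    with urllib.request.urlopen(url) as resp:
        data = json.load(resp)
    return [f["filename"] for f in data["urls"] if f["filename"].endswith(".whl")]

# The comments claim: torch <= 2.4.1 publishes no cp313 wheels at all, and
# torch >= 2.5.0 publishes no macosx x86_64 (Intel Mac) wheels.
for version in ("2.4.1", "2.5.0"):
    names = torch_wheel_names(version)
    has_cp313 = any("cp313" in n for n in names)
    has_intel_mac = any("macosx" in n and "x86_64" in n for n in names)
    print(f"torch {version}: cp313={has_cp313}, intel_mac={has_intel_mac}")
```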
@@ -206,16 +204,13 @@ jobs:
           # Use system clang for better compatibility
           export CC=clang
           export CXX=clang++
-          # Set deployment target based on runner
-          # macos-15-intel runs macOS 15, so target 15.0 (system libraries require it)
-          if [[ "${{ matrix.os }}" == "macos-15-intel" ]]; then
-            export MACOSX_DEPLOYMENT_TARGET=15.0
-          elif [[ "${{ matrix.os }}" == macos-14* ]]; then
+          # Homebrew libraries on each macOS version require matching minimum version
+          if [[ "${{ matrix.os }}" == "macos-13" ]]; then
+            export MACOSX_DEPLOYMENT_TARGET=13.0
+          elif [[ "${{ matrix.os }}" == "macos-14" ]]; then
             export MACOSX_DEPLOYMENT_TARGET=14.0
-          elif [[ "${{ matrix.os }}" == macos-15* ]]; then
+          elif [[ "${{ matrix.os }}" == "macos-15" ]]; then
             export MACOSX_DEPLOYMENT_TARGET=15.0
-          elif [[ "${{ matrix.os }}" == macos-26* ]]; then
-            export MACOSX_DEPLOYMENT_TARGET=26.0
           fi
           uv build --wheel --python ${{ matrix.python }} --find-links ${GITHUB_WORKSPACE}/packages/leann-core/dist
         else
@@ -229,16 +224,14 @@ jobs:
           # Use system clang for better compatibility
           export CC=clang
           export CXX=clang++
-          # Set deployment target based on runner
-          # macos-15-intel runs macOS 15, so target 15.0 (system libraries require it)
-          if [[ "${{ matrix.os }}" == "macos-15-intel" ]]; then
-            export MACOSX_DEPLOYMENT_TARGET=15.0
-          elif [[ "${{ matrix.os }}" == macos-14* ]]; then
+          # DiskANN requires macOS 13.3+ for sgesdd_ LAPACK function
+          # But Homebrew libraries on each macOS version require matching minimum version
+          if [[ "${{ matrix.os }}" == "macos-13" ]]; then
+            export MACOSX_DEPLOYMENT_TARGET=13.3
+          elif [[ "${{ matrix.os }}" == "macos-14" ]]; then
             export MACOSX_DEPLOYMENT_TARGET=14.0
-          elif [[ "${{ matrix.os }}" == macos-15* ]]; then
+          elif [[ "${{ matrix.os }}" == "macos-15" ]]; then
             export MACOSX_DEPLOYMENT_TARGET=15.0
-          elif [[ "${{ matrix.os }}" == macos-26* ]]; then
-            export MACOSX_DEPLOYMENT_TARGET=26.0
           fi
           uv build --wheel --python ${{ matrix.python }} --find-links ${GITHUB_WORKSPACE}/packages/leann-core/dist
         else
@@ -276,19 +269,16 @@ jobs:
         if: runner.os == 'macOS'
         run: |
           # Determine deployment target based on runner OS
-          # macos-15-intel runs macOS 15, so target 15.0 (system libraries require it)
-          if [[ "${{ matrix.os }}" == "macos-15-intel" ]]; then
-            HNSW_TARGET="15.0"
-            DISKANN_TARGET="15.0"
-          elif [[ "${{ matrix.os }}" == macos-14* ]]; then
+          # Must match the Homebrew libraries for each macOS version
+          if [[ "${{ matrix.os }}" == "macos-13" ]]; then
+            HNSW_TARGET="13.0"
+            DISKANN_TARGET="13.3"
+          elif [[ "${{ matrix.os }}" == "macos-14" ]]; then
             HNSW_TARGET="14.0"
             DISKANN_TARGET="14.0"
-          elif [[ "${{ matrix.os }}" == macos-15* ]]; then
+          elif [[ "${{ matrix.os }}" == "macos-15" ]]; then
             HNSW_TARGET="15.0"
             DISKANN_TARGET="15.0"
-          elif [[ "${{ matrix.os }}" == macos-26* ]]; then
-            HNSW_TARGET="26.0"
-            DISKANN_TARGET="26.0"
           fi

           # Repair HNSW wheel
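Across the deployment-target hunks, the right-hand side applies one consistent runner-to-target mapping. A hedged Python restatement with hypothetical names, just to collect it in one place (not code from the workflow):

```python
# Hypothetical restatement of the runner -> deployment-target mapping
# applied on the new side of these hunks.
TARGETS = {
    # runner:    (HNSW target, DiskANN target)
    "macos-13": ("13.0", "13.3"),  # DiskANN needs 13.3+ for sgesdd_ (LAPACK)
    "macos-14": ("14.0", "14.0"),
    "macos-15": ("15.0", "15.0"),
}

def deployment_targets(runner: str) -> tuple[str, str]:
    """Return the (HNSW, DiskANN) MACOSX_DEPLOYMENT_TARGET values for a runner."""
    return TARGETS[runner]

assert deployment_targets("macos-13") == ("13.0", "13.3")
```

The same change also swaps the old glob patterns (`macos-14*`, `macos-15*`) for exact string matches, since the `macos-15-intel` and `macos-26` runners that the globs had to accommodate are gone.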
@@ -344,15 +334,12 @@ jobs:
           PY_TAG=$($UV_PY -c "import sys; print(f'cp{sys.version_info[0]}{sys.version_info[1]}')")

           if [[ "$RUNNER_OS" == "macOS" ]]; then
-            # macos-15-intel runs macOS 15, so target 15.0 (system libraries require it)
-            if [[ "${{ matrix.os }}" == "macos-15-intel" ]]; then
-              export MACOSX_DEPLOYMENT_TARGET=15.0
-            elif [[ "${{ matrix.os }}" == macos-14* ]]; then
+            if [[ "${{ matrix.os }}" == "macos-13" ]]; then
+              export MACOSX_DEPLOYMENT_TARGET=13.3
+            elif [[ "${{ matrix.os }}" == "macos-14" ]]; then
               export MACOSX_DEPLOYMENT_TARGET=14.0
-            elif [[ "${{ matrix.os }}" == macos-15* ]]; then
+            elif [[ "${{ matrix.os }}" == "macos-15" ]]; then
               export MACOSX_DEPLOYMENT_TARGET=15.0
-            elif [[ "${{ matrix.os }}" == macos-26* ]]; then
-              export MACOSX_DEPLOYMENT_TARGET=26.0
             fi
           fi

README.md (50 changed lines)
@@ -36,7 +36,7 @@ LEANN is an innovative vector database that democratizes personal AI. Transform

 LEANN achieves this through *graph-based selective recomputation* with *high-degree preserving pruning*, computing embeddings on-demand instead of storing them all. [Illustration Fig →](#️-architecture--how-it-works) | [Paper →](https://arxiv.org/abs/2506.08276)

-**Ready to RAG Everything?** Transform your laptop into a personal AI assistant that can semantic search your **[file system](#-personal-data-manager-process-any-documents-pdf-txt-md)**, **[emails](#-your-personal-email-secretary-rag-on-apple-mail)**, **[browser history](#-time-machine-for-the-web-rag-your-entire-browser-history)**, **[chat history](#-wechat-detective-unlock-your-golden-memories)** ([WeChat](#-wechat-detective-unlock-your-golden-memories), [iMessage](#-imessage-history-your-personal-conversation-archive)), **[agent memory](#-chatgpt-chat-history-your-personal-ai-conversation-archive)** ([ChatGPT](#-chatgpt-chat-history-your-personal-ai-conversation-archive), [Claude](#-claude-chat-history-your-personal-ai-conversation-archive)), **[live data](#mcp-integration-rag-on-live-data-from-any-platform)** ([Slack](#slack-messages-search-your-team-conversations), [Twitter](#-twitter-bookmarks-your-personal-tweet-library)), **[codebase](#-claude-code-integration-transform-your-development-workflow)**\* , or external knowledge bases (i.e., 60M documents) - all on your laptop, with zero cloud costs and complete privacy.
+**Ready to RAG Everything?** Transform your laptop into a personal AI assistant that can semantic search your **[file system](#-personal-data-manager-process-any-documents-pdf-txt-md)**, **[emails](#-your-personal-email-secretary-rag-on-apple-mail)**, **[browser history](#-time-machine-for-the-web-rag-your-entire-browser-history)**, **[chat history](#-wechat-detective-unlock-your-golden-memories)** ([WeChat](#-wechat-detective-unlock-your-golden-memories), [iMessage](#-imessage-history-your-personal-conversation-archive)), **[agent memory](#-chatgpt-chat-history-your-personal-ai-conversation-archive)** ([ChatGPT](#-chatgpt-chat-history-your-personal-ai-conversation-archive), [Claude](#-claude-chat-history-your-personal-ai-conversation-archive)), **[live data](#mcp-integration-rag-on-live-data-from-any-platform)** ([Slack](#mcp-integration-rag-on-live-data-from-any-platform), [Twitter](#mcp-integration-rag-on-live-data-from-any-platform)), **[codebase](#-claude-code-integration-transform-your-development-workflow)**\* , or external knowledge bases (i.e., 60M documents) - all on your laptop, with zero cloud costs and complete privacy.


 \* Claude Code only supports basic `grep`-style keyword search. **LEANN** is a drop-in **semantic search MCP service fully compatible with Claude Code**, unlocking intelligent retrieval without changing your workflow. 🔥 Check out [the easy setup →](packages/leann-mcp/README.md)
@@ -392,54 +392,6 @@ python -m apps.code_rag --repo-dir "./my_codebase" --query "How does authenticat

 </details>

-### 🎨 ColQwen: Multimodal PDF Retrieval with Vision-Language Models
-
-Search through PDFs using both text and visual understanding with ColQwen2/ColPali models. Perfect for research papers, technical documents, and any PDFs with complex layouts, figures, or diagrams.
-
-> **🍎 Mac Users**: ColQwen is optimized for Apple Silicon with MPS acceleration for faster inference!
-
-```bash
-# Build index from PDFs
-python -m apps.colqwen_rag build --pdfs ./my_papers/ --index research_papers
-
-# Search with text queries
-python -m apps.colqwen_rag search research_papers "How does attention mechanism work?"
-
-# Interactive Q&A
-python -m apps.colqwen_rag ask research_papers --interactive
-```
-
-<details>
-<summary><strong>📋 Click to expand: ColQwen Setup & Usage</strong></summary>
-
-#### Prerequisites
-```bash
-# Install dependencies
-uv pip install colpali_engine pdf2image pillow matplotlib qwen_vl_utils einops seaborn
-brew install poppler  # macOS only, for PDF processing
-```
-
-#### Build Index
-```bash
-python -m apps.colqwen_rag build \
-    --pdfs ./pdf_directory/ \
-    --index my_index \
-    --model colqwen2  # or colpali
-```
-
-#### Search
-```bash
-python -m apps.colqwen_rag search my_index "your question here" --top-k 5
-```
-
-#### Models
-- **ColQwen2** (`colqwen2`): Latest vision-language model with improved performance
-- **ColPali** (`colpali`): Proven multimodal retriever
-
-For detailed usage, see the [ColQwen Guide](docs/COLQWEN_GUIDE.md).
-
-</details>
-
 ### 📧 Your Personal Email Secretary: RAG on Apple Mail!

 > **Note:** The examples below currently support macOS only. Windows support coming soon.
apps/base_rag_example.py

@@ -6,7 +6,7 @@ Provides common parameters and functionality for all RAG examples.
 import argparse
 from abc import ABC, abstractmethod
 from pathlib import Path
-from typing import Any, Union
+from typing import Any

 import dotenv
 from leann.api import LeannBuilder, LeannChat
@@ -257,8 +257,8 @@ class BaseRAGExample(ABC):
         pass

     @abstractmethod
-    async def load_data(self, args) -> list[Union[str, dict[str, Any]]]:
-        """Load data from the source. Returns list of text chunks (strings or dicts with 'text' key)."""
+    async def load_data(self, args) -> list[str]:
+        """Load data from the source. Returns list of text chunks."""
         pass

     def get_llm_config(self, args) -> dict[str, Any]:
@@ -282,8 +282,8 @@ class BaseRAGExample(ABC):

         return config

-    async def build_index(self, args, texts: list[Union[str, dict[str, Any]]]) -> str:
-        """Build LEANN index from texts (accepts strings or dicts with 'text' key)."""
+    async def build_index(self, args, texts: list[str]) -> str:
+        """Build LEANN index from texts."""
         index_path = str(Path(args.index_dir) / f"{self.default_index_name}.leann")

         print(f"\n[Building Index] Creating {self.name} index...")
@@ -314,14 +314,8 @@ class BaseRAGExample(ABC):
         batch_size = 1000
         for i in range(0, len(texts), batch_size):
             batch = texts[i : i + batch_size]
-            for item in batch:
-                # Handle both dict format (from create_text_chunks) and plain strings
-                if isinstance(item, dict):
-                    text = item.get("text", "")
-                    metadata = item.get("metadata")
-                    builder.add_text(text, metadata)
-                else:
-                    builder.add_text(item)
+            for text in batch:
+                builder.add_text(text)
             print(f"Added {min(i + batch_size, len(texts))}/{len(texts)} texts...")

         print("Building index structure...")
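For contrast, the contract the removed `Union[str, dict[str, Any]]` annotation encoded: a chunk is either a plain string or a dict carrying its own metadata. A minimal sketch of both shapes (chunk values hypothetical; `add_text` used as in the old loop; backend name assumed):

```python
from leann.api import LeannBuilder

builder = LeannBuilder(backend_name="hnsw")  # backend name assumed

# The two chunk shapes the removed Union[str, dict] code path accepted:
chunks = [
    "plain text chunk",                               # str
    {"text": "chunk body", "metadata": {"page": 3}},  # dict with 'text' key
]

for item in chunks:
    if isinstance(item, dict):
        # Dict chunks carry per-chunk metadata alongside the text.
        builder.add_text(item.get("text", ""), item.get("metadata"))
    else:
        builder.add_text(item)
```

The head branch drops the dict shape, so callers that need per-chunk metadata must attach it themselves, as the deleted `image_rag` below did via its own `build_index` override.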
apps/colqwen_rag.py (deleted, @@ -1,364 +0,0 @@):

```python
#!/usr/bin/env python3
"""
ColQwen RAG - Easy-to-use multimodal PDF retrieval with ColQwen2/ColPali

Usage:
    python -m apps.colqwen_rag build --pdfs ./my_pdfs/ --index my_index
    python -m apps.colqwen_rag search my_index "How does attention work?"
    python -m apps.colqwen_rag ask my_index --interactive
"""

import argparse
import os
import sys
from pathlib import Path
from typing import Optional, cast

# Add LEANN packages to path
_repo_root = Path(__file__).resolve().parents[1]
_leann_core_src = _repo_root / "packages" / "leann-core" / "src"
_leann_hnsw_pkg = _repo_root / "packages" / "leann-backend-hnsw"
if str(_leann_core_src) not in sys.path:
    sys.path.append(str(_leann_core_src))
if str(_leann_hnsw_pkg) not in sys.path:
    sys.path.append(str(_leann_hnsw_pkg))

import torch  # noqa: E402
from colpali_engine import ColPali, ColPaliProcessor, ColQwen2, ColQwen2Processor  # noqa: E402
from colpali_engine.utils.torch_utils import ListDataset  # noqa: E402
from pdf2image import convert_from_path  # noqa: E402
from PIL import Image  # noqa: E402
from torch.utils.data import DataLoader  # noqa: E402
from tqdm import tqdm  # noqa: E402

# Import the existing multi-vector implementation
sys.path.append(str(_repo_root / "apps" / "multimodal" / "vision-based-pdf-multi-vector"))
from leann_multi_vector import LeannMultiVector  # noqa: E402


class ColQwenRAG:
    """Easy-to-use ColQwen RAG system for multimodal PDF retrieval."""

    def __init__(self, model_type: str = "colpali"):
        """
        Initialize ColQwen RAG system.

        Args:
            model_type: "colqwen2" or "colpali"
        """
        self.model_type = model_type
        self.device = self._get_device()
        # Use float32 on MPS to avoid memory issues, float16 on CUDA, bfloat16 on CPU
        if self.device.type == "mps":
            self.dtype = torch.float32
        elif self.device.type == "cuda":
            self.dtype = torch.float16
        else:
            self.dtype = torch.bfloat16

        print(f"🚀 Initializing {model_type.upper()} on {self.device} with {self.dtype}")

        # Load model and processor with MPS-optimized settings
        try:
            if model_type == "colqwen2":
                self.model_name = "vidore/colqwen2-v1.0"
                if self.device.type == "mps":
                    # For MPS, load on CPU first then move to avoid memory allocation issues
                    self.model = ColQwen2.from_pretrained(
                        self.model_name,
                        torch_dtype=self.dtype,
                        device_map="cpu",
                        low_cpu_mem_usage=True,
                    ).eval()
                    self.model = self.model.to(self.device)
                else:
                    self.model = ColQwen2.from_pretrained(
                        self.model_name,
                        torch_dtype=self.dtype,
                        device_map=self.device,
                        low_cpu_mem_usage=True,
                    ).eval()
                self.processor = ColQwen2Processor.from_pretrained(self.model_name)
            else:  # colpali
                self.model_name = "vidore/colpali-v1.2"
                if self.device.type == "mps":
                    # For MPS, load on CPU first then move to avoid memory allocation issues
                    self.model = ColPali.from_pretrained(
                        self.model_name,
                        torch_dtype=self.dtype,
                        device_map="cpu",
                        low_cpu_mem_usage=True,
                    ).eval()
                    self.model = self.model.to(self.device)
                else:
                    self.model = ColPali.from_pretrained(
                        self.model_name,
                        torch_dtype=self.dtype,
                        device_map=self.device,
                        low_cpu_mem_usage=True,
                    ).eval()
                self.processor = ColPaliProcessor.from_pretrained(self.model_name)
        except Exception as e:
            if "memory" in str(e).lower() or "offload" in str(e).lower():
                print(f"⚠️ Memory constraint on {self.device}, using CPU with optimizations...")
                self.device = torch.device("cpu")
                self.dtype = torch.float32

                if model_type == "colqwen2":
                    self.model = ColQwen2.from_pretrained(
                        self.model_name,
                        torch_dtype=self.dtype,
                        device_map="cpu",
                        low_cpu_mem_usage=True,
                    ).eval()
                else:
                    self.model = ColPali.from_pretrained(
                        self.model_name,
                        torch_dtype=self.dtype,
                        device_map="cpu",
                        low_cpu_mem_usage=True,
                    ).eval()
            else:
                raise

    def _get_device(self):
        """Auto-select best available device."""
        if torch.cuda.is_available():
            return torch.device("cuda")
        elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
            return torch.device("mps")
        else:
            return torch.device("cpu")

    def build_index(self, pdf_paths: list[str], index_name: str, pages_dir: Optional[str] = None):
        """
        Build multimodal index from PDF files.

        Args:
            pdf_paths: List of PDF file paths
            index_name: Name for the index
            pages_dir: Directory to save page images (optional)
        """
        print(f"Building index '{index_name}' from {len(pdf_paths)} PDFs...")

        # Convert PDFs to images
        all_images = []
        all_metadata = []

        if pages_dir:
            os.makedirs(pages_dir, exist_ok=True)

        for pdf_path in tqdm(pdf_paths, desc="Converting PDFs"):
            try:
                images = convert_from_path(pdf_path, dpi=150)
                pdf_name = Path(pdf_path).stem

                for i, image in enumerate(images):
                    # Save image if pages_dir specified
                    if pages_dir:
                        image_path = Path(pages_dir) / f"{pdf_name}_page_{i + 1}.png"
                        image.save(image_path)

                    all_images.append(image)
                    all_metadata.append(
                        {
                            "pdf_path": pdf_path,
                            "pdf_name": pdf_name,
                            "page_number": i + 1,
                            "image_path": str(image_path) if pages_dir else None,
                        }
                    )

            except Exception as e:
                print(f"❌ Error processing {pdf_path}: {e}")
                continue

        print(f"📄 Converted {len(all_images)} pages from {len(pdf_paths)} PDFs")
        print(f"All metadata: {all_metadata}")

        # Generate embeddings
        print("🧠 Generating embeddings...")
        embeddings = self._embed_images(all_images)

        # Build LEANN index
        print("🔍 Building LEANN index...")
        leann_mv = LeannMultiVector(
            index_path=index_name,
            dim=embeddings.shape[-1],
            embedding_model_name=self.model_type,
        )

        # Create collection and insert data
        leann_mv.create_collection()
        for i, (embedding, metadata) in enumerate(zip(embeddings, all_metadata)):
            data = {
                "doc_id": i,
                "filepath": metadata.get("image_path", ""),
                "colbert_vecs": embedding.numpy(),  # Convert tensor to numpy
            }
            leann_mv.insert(data)

        # Build the index
        leann_mv.create_index()
        print(f"✅ Index '{index_name}' built successfully!")

        return leann_mv

    def search(self, index_name: str, query: str, top_k: int = 5):
        """
        Search the index with a text query.

        Args:
            index_name: Name of the index to search
            query: Text query
            top_k: Number of results to return
        """
        print(f"🔍 Searching '{index_name}' for: '{query}'")

        # Load index
        leann_mv = LeannMultiVector(
            index_path=index_name,
            dim=128,  # Will be updated when loading
            embedding_model_name=self.model_type,
        )

        # Generate query embedding
        query_embedding = self._embed_query(query)

        # Search (returns list of (score, doc_id) tuples)
        search_results = leann_mv.search(query_embedding.numpy(), topk=top_k)

        # Display results
        print(f"\n📋 Top {len(search_results)} results:")
        for i, (score, doc_id) in enumerate(search_results, 1):
            # Get metadata for this doc_id (we need to load the metadata)
            print(f"{i}. Score: {score:.3f} | Doc ID: {doc_id}")

        return search_results

    def ask(self, index_name: str, interactive: bool = False):
        """
        Interactive Q&A with the indexed documents.

        Args:
            index_name: Name of the index to query
            interactive: Whether to run in interactive mode
        """
        print(f"💬 ColQwen Chat with '{index_name}'")

        if interactive:
            print("Type 'quit' to exit, 'help' for commands")
            while True:
                try:
                    query = input("\n🤔 Your question: ").strip()
                    if query.lower() in ["quit", "exit", "q"]:
                        break
                    elif query.lower() == "help":
                        print("Commands: quit/exit/q (exit), help (this message)")
                        continue
                    elif not query:
                        continue

                    self.search(index_name, query, top_k=3)

                    # TODO: Add answer generation with Qwen-VL
                    print("\n💡 For detailed answers, we can integrate Qwen-VL here!")

                except KeyboardInterrupt:
                    print("\n👋 Goodbye!")
                    break
        else:
            query = input("🤔 Your question: ").strip()
            if query:
                self.search(index_name, query)

    def _embed_images(self, images: list[Image.Image]) -> torch.Tensor:
        """Generate embeddings for a list of images."""
        dataset = ListDataset(images)
        dataloader = DataLoader(dataset, batch_size=1, shuffle=False, collate_fn=lambda x: x)

        embeddings = []
        with torch.no_grad():
            for batch in tqdm(dataloader, desc="Embedding images"):
                batch_images = cast(list, batch)
                batch_inputs = self.processor.process_images(batch_images).to(self.device)
                batch_embeddings = self.model(**batch_inputs)
                embeddings.append(batch_embeddings.cpu())

        return torch.cat(embeddings, dim=0)

    def _embed_query(self, query: str) -> torch.Tensor:
        """Generate embedding for a text query."""
        with torch.no_grad():
            query_inputs = self.processor.process_queries([query]).to(self.device)
            query_embedding = self.model(**query_inputs)
            return query_embedding.cpu()


def main():
    parser = argparse.ArgumentParser(description="ColQwen RAG - Easy multimodal PDF retrieval")
    subparsers = parser.add_subparsers(dest="command", help="Available commands")

    # Build command
    build_parser = subparsers.add_parser("build", help="Build index from PDFs")
    build_parser.add_argument("--pdfs", required=True, help="Directory containing PDF files")
    build_parser.add_argument("--index", required=True, help="Index name")
    build_parser.add_argument(
        "--model", choices=["colqwen2", "colpali"], default="colqwen2", help="Model to use"
    )
    build_parser.add_argument("--pages-dir", help="Directory to save page images")

    # Search command
    search_parser = subparsers.add_parser("search", help="Search the index")
    search_parser.add_argument("index", help="Index name")
    search_parser.add_argument("query", help="Search query")
    search_parser.add_argument("--top-k", type=int, default=5, help="Number of results")
    search_parser.add_argument(
        "--model", choices=["colqwen2", "colpali"], default="colqwen2", help="Model to use"
    )

    # Ask command
    ask_parser = subparsers.add_parser("ask", help="Interactive Q&A")
    ask_parser.add_argument("index", help="Index name")
    ask_parser.add_argument("--interactive", action="store_true", help="Interactive mode")
    ask_parser.add_argument(
        "--model", choices=["colqwen2", "colpali"], default="colqwen2", help="Model to use"
    )

    args = parser.parse_args()

    if not args.command:
        parser.print_help()
        return

    # Initialize ColQwen RAG
    if args.command == "build":
        colqwen = ColQwenRAG(args.model)

        # Get PDF files
        pdf_dir = Path(args.pdfs)
        if pdf_dir.is_file() and pdf_dir.suffix.lower() == ".pdf":
            pdf_paths = [str(pdf_dir)]
        elif pdf_dir.is_dir():
            pdf_paths = [str(p) for p in pdf_dir.glob("*.pdf")]
        else:
            print(f"❌ Invalid PDF path: {args.pdfs}")
            return

        if not pdf_paths:
            print(f"❌ No PDF files found in {args.pdfs}")
            return

        colqwen.build_index(pdf_paths, args.index, args.pages_dir)

    elif args.command == "search":
        colqwen = ColQwenRAG(args.model)
        colqwen.search(args.index, args.query, args.top_k)

    elif args.command == "ask":
        colqwen = ColQwenRAG(args.model)
        colqwen.ask(args.index, args.interactive)


if __name__ == "__main__":
    main()
```
@@ -5,7 +5,6 @@ Supports PDF, TXT, MD, and other document formats.

 import sys
 from pathlib import Path
-from typing import Any, Union

 # Add parent directory to path for imports
 sys.path.insert(0, str(Path(__file__).parent))
@@ -52,7 +51,7 @@ class DocumentRAG(BaseRAGExample):
             help="Enable AST-aware chunking for code files in the data directory",
         )

-    async def load_data(self, args) -> list[Union[str, dict[str, Any]]]:
+    async def load_data(self, args) -> list[str]:
         """Load documents and convert to text chunks."""
        print(f"Loading documents from: {args.data_dir}")
        if args.file_types:
apps/image_rag.py (deleted, @@ -1,218 +0,0 @@):

```python
#!/usr/bin/env python3
"""
CLIP Image RAG Application

This application enables RAG (Retrieval-Augmented Generation) on images using CLIP embeddings.
You can index a directory of images and search them using text queries.

Usage:
    python -m apps.image_rag --image-dir ./my_images/ --query "a sunset over mountains"
    python -m apps.image_rag --image-dir ./my_images/ --interactive
"""

import argparse
import pickle
import tempfile
from pathlib import Path

import numpy as np
from PIL import Image
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

from apps.base_rag_example import BaseRAGExample


class ImageRAG(BaseRAGExample):
    """
    RAG application for images using CLIP embeddings.

    This class provides a complete RAG pipeline for image data, including
    CLIP embedding generation, indexing, and text-based image search.
    """

    def __init__(self):
        super().__init__(
            name="Image RAG",
            description="RAG application for images using CLIP embeddings",
            default_index_name="image_index",
        )
        # Override default embedding model to use CLIP
        self.embedding_model_default = "clip-ViT-L-14"
        self.embedding_mode_default = "sentence-transformers"
        self._image_data: list[dict] = []

    def _add_specific_arguments(self, parser: argparse.ArgumentParser):
        """Add image-specific arguments."""
        image_group = parser.add_argument_group("Image Parameters")
        image_group.add_argument(
            "--image-dir",
            type=str,
            required=True,
            help="Directory containing images to index",
        )
        image_group.add_argument(
            "--image-extensions",
            type=str,
            nargs="+",
            default=[".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp"],
            help="Image file extensions to process (default: .jpg .jpeg .png .gif .bmp .webp)",
        )
        image_group.add_argument(
            "--batch-size",
            type=int,
            default=32,
            help="Batch size for CLIP embedding generation (default: 32)",
        )

    async def load_data(self, args) -> list[str]:
        """Load images, generate CLIP embeddings, and return text descriptions."""
        self._image_data = self._load_images_and_embeddings(args)
        return [entry["text"] for entry in self._image_data]

    def _load_images_and_embeddings(self, args) -> list[dict]:
        """Helper to process images and produce embeddings/metadata."""
        image_dir = Path(args.image_dir)
        if not image_dir.exists():
            raise ValueError(f"Image directory does not exist: {image_dir}")

        print(f"📸 Loading images from {image_dir}...")

        # Find all image files
        image_files = []
        for ext in args.image_extensions:
            image_files.extend(image_dir.rglob(f"*{ext}"))
            image_files.extend(image_dir.rglob(f"*{ext.upper()}"))

        if not image_files:
            raise ValueError(
                f"No images found in {image_dir} with extensions {args.image_extensions}"
            )

        print(f"✅ Found {len(image_files)} images")

        # Limit if max_items is set
        if args.max_items > 0:
            image_files = image_files[: args.max_items]
            print(f"📊 Processing {len(image_files)} images (limited by --max-items)")

        # Load CLIP model
        print("🔍 Loading CLIP model...")
        model = SentenceTransformer(self.embedding_model_default)

        # Process images and generate embeddings
        print("🖼️ Processing images and generating embeddings...")
        image_data = []
        batch_images = []
        batch_paths = []

        for image_path in tqdm(image_files, desc="Processing images"):
            try:
                image = Image.open(image_path).convert("RGB")
                batch_images.append(image)
                batch_paths.append(image_path)

                # Process in batches
                if len(batch_images) >= args.batch_size:
                    embeddings = model.encode(
                        batch_images,
                        convert_to_numpy=True,
                        normalize_embeddings=True,
                        batch_size=args.batch_size,
                        show_progress_bar=False,
                    )

                    for img_path, embedding in zip(batch_paths, embeddings):
                        image_data.append(
                            {
                                "text": f"Image: {img_path.name}\nPath: {img_path}",
                                "metadata": {
                                    "image_path": str(img_path),
                                    "image_name": img_path.name,
                                    "image_dir": str(image_dir),
                                },
                                "embedding": embedding.astype(np.float32),
                            }
                        )

                    batch_images = []
                    batch_paths = []

            except Exception as e:
                print(f"⚠️ Failed to process {image_path}: {e}")
                continue

        # Process remaining images
        if batch_images:
            embeddings = model.encode(
                batch_images,
                convert_to_numpy=True,
                normalize_embeddings=True,
                batch_size=len(batch_images),
                show_progress_bar=False,
            )

            for img_path, embedding in zip(batch_paths, embeddings):
                image_data.append(
                    {
                        "text": f"Image: {img_path.name}\nPath: {img_path}",
                        "metadata": {
                            "image_path": str(img_path),
                            "image_name": img_path.name,
                            "image_dir": str(image_dir),
                        },
                        "embedding": embedding.astype(np.float32),
                    }
                )

        print(f"✅ Processed {len(image_data)} images")
        return image_data

    async def build_index(self, args, texts: list[str]) -> str:
        """Build index using pre-computed CLIP embeddings."""
        from leann.api import LeannBuilder

        if not self._image_data or len(self._image_data) != len(texts):
            raise RuntimeError("No image data found. Make sure load_data() ran successfully.")

        print("🔨 Building LEANN index with CLIP embeddings...")
        builder = LeannBuilder(
            backend_name=args.backend_name,
            embedding_model=self.embedding_model_default,
            embedding_mode=self.embedding_mode_default,
            is_recompute=False,
            distance_metric="cosine",
            graph_degree=args.graph_degree,
            build_complexity=args.build_complexity,
            is_compact=not args.no_compact,
        )

        for text, data in zip(texts, self._image_data):
            builder.add_text(text=text, metadata=data["metadata"])

        ids = [str(i) for i in range(len(self._image_data))]
        embeddings = np.array([data["embedding"] for data in self._image_data], dtype=np.float32)

        with tempfile.NamedTemporaryFile(mode="wb", suffix=".pkl", delete=False) as f:
            pickle.dump((ids, embeddings), f)
            pkl_path = f.name

        try:
            index_path = str(Path(args.index_dir) / f"{self.default_index_name}.leann")
            builder.build_index_from_embeddings(index_path, pkl_path)
            print(f"✅ Index built successfully at {index_path}")
            return index_path
        finally:
            Path(pkl_path).unlink()


def main():
    """Main entry point for the image RAG application."""
    import asyncio

    app = ImageRAG()
    asyncio.run(app.run())


if __name__ == "__main__":
    main()
```
docs/COLQWEN_GUIDE.md (deleted, @@ -1,200 +0,0 @@):

````markdown
# ColQwen Integration Guide

Easy-to-use multimodal PDF retrieval with ColQwen2/ColPali models.

## Quick Start

> **🍎 Mac Users**: ColQwen is optimized for Apple Silicon with MPS acceleration for faster inference!

### 1. Install Dependencies
```bash
uv pip install colpali_engine pdf2image pillow matplotlib qwen_vl_utils einops seaborn
brew install poppler  # macOS only, for PDF processing
```

### 2. Basic Usage
```bash
# Build index from PDFs
python -m apps.colqwen_rag build --pdfs ./my_papers/ --index research_papers

# Search with text queries
python -m apps.colqwen_rag search research_papers "How does attention mechanism work?"

# Interactive Q&A
python -m apps.colqwen_rag ask research_papers --interactive
```

## Commands

### Build Index
```bash
python -m apps.colqwen_rag build \
    --pdfs ./pdf_directory/ \
    --index my_index \
    --model colqwen2 \
    --pages-dir ./page_images/  # Optional: save page images
```

**Options:**
- `--pdfs`: Directory containing PDF files (or single PDF path)
- `--index`: Name for the index (required)
- `--model`: `colqwen2` (default) or `colpali`
- `--pages-dir`: Directory to save page images (optional)

### Search Index
```bash
python -m apps.colqwen_rag search my_index "your question here" --top-k 5
```

**Options:**
- `--top-k`: Number of results to return (default: 5)
- `--model`: Model used for search (should match build model)

### Interactive Q&A
```bash
python -m apps.colqwen_rag ask my_index --interactive
```

**Commands in interactive mode:**
- Type your questions naturally
- `help`: Show available commands
- `quit`/`exit`/`q`: Exit interactive mode

## 🧪 Test & Reproduce Results

Run the reproduction test for issue #119:
```bash
python test_colqwen_reproduction.py
```

This will:
1. ✅ Check dependencies
2. 📥 Download sample PDF (Attention Is All You Need paper)
3. 🏗️ Build test index
4. 🔍 Run sample queries
5. 📊 Show how to generate similarity maps

## 🎨 Advanced: Similarity Maps

For visual similarity analysis, use the existing advanced script:
```bash
cd apps/multimodal/vision-based-pdf-multi-vector/
python multi-vector-leann-similarity-map.py
```

Edit the script to customize:
- `QUERY`: Your question
- `MODEL`: "colqwen2" or "colpali"
- `USE_HF_DATASET`: Use HuggingFace dataset or local PDFs
- `SIMILARITY_MAP`: Generate heatmaps
- `ANSWER`: Enable Qwen-VL answer generation

## 🔧 How It Works

### ColQwen2 vs ColPali
- **ColQwen2** (`vidore/colqwen2-v1.0`): Latest vision-language model
- **ColPali** (`vidore/colpali-v1.2`): Proven multimodal retriever

### Architecture
1. **PDF → Images**: Convert PDF pages to images (150 DPI)
2. **Vision Encoding**: Process images with ColQwen2/ColPali
3. **Multi-Vector Index**: Build LEANN HNSW index with multiple embeddings per page
4. **Query Processing**: Encode text queries with same model
5. **Similarity Search**: Find most relevant pages/regions
6. **Visual Maps**: Generate attention heatmaps (optional)

### Device Support
- **CUDA**: Best performance with GPU acceleration
- **MPS**: Apple Silicon Mac support
- **CPU**: Fallback for any system (slower)

Auto-detection: CUDA > MPS > CPU

## 📊 Performance Tips

### For Best Performance:
```bash
# Use ColQwen2 for latest features
--model colqwen2

# Save page images for reuse
--pages-dir ./cached_pages/

# Adjust batch size based on GPU memory
# (automatically handled)
```

### For Large Document Sets:
- Process PDFs in batches
- Use SSD storage for index files
- Consider using CUDA if available

## 🔗 Related Resources

- **Fast-PLAID**: https://github.com/lightonai/fast-plaid
- **Pylate**: https://github.com/lightonai/pylate
- **ColBERT**: https://github.com/stanford-futuredata/ColBERT
- **ColPali Paper**: Vision-Language Models for Document Retrieval
- **Issue #119**: https://github.com/yichuan-w/LEANN/issues/119

## 🐛 Troubleshooting

### PDF Conversion Issues (macOS)
```bash
# Install poppler
brew install poppler
which pdfinfo && pdfinfo -v
```

### Memory Issues
- Reduce batch size (automatically handled)
- Use CPU instead of GPU: `export CUDA_VISIBLE_DEVICES=""`
- Process fewer PDFs at once

### Model Download Issues
- Ensure internet connection for first run
- Models are cached after first download
- Use HuggingFace mirrors if needed

### Import Errors
```bash
# Ensure all dependencies installed
uv pip install colpali_engine pdf2image pillow matplotlib qwen_vl_utils einops seaborn

# Check PyTorch installation
python -c "import torch; print(torch.__version__)"
```

## 💡 Examples

### Research Paper Analysis
```bash
# Index your research papers
python -m apps.colqwen_rag build --pdfs ~/Papers/AI/ --index ai_papers

# Ask research questions
python -m apps.colqwen_rag search ai_papers "What are the limitations of transformer models?"
python -m apps.colqwen_rag search ai_papers "How does BERT compare to GPT?"
```

### Document Q&A
```bash
# Index business documents
python -m apps.colqwen_rag build --pdfs ~/Documents/Reports/ --index reports

# Interactive analysis
python -m apps.colqwen_rag ask reports --interactive
```

### Visual Analysis
```bash
# Generate similarity maps for specific queries
cd apps/multimodal/vision-based-pdf-multi-vector/
# Edit multi-vector-leann-similarity-map.py with your query
python multi-vector-leann-similarity-map.py
# Check ./figures/ for generated heatmaps
```

---

**🎯 This integration makes ColQwen as easy to use as other LEANN features while maintaining the full power of multimodal document understanding!**
````
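Step 5 of the deleted guide's architecture list ("Similarity Search") is ColBERT-style late interaction over the per-page `colbert_vecs` stored above: each query-token vector takes its best match over a page's patch vectors, and those maxima are summed. A hedged, generic sketch of that MaxSim score (not LEANN's multi-vector API):

```python
import torch

def maxsim_score(query_vecs: torch.Tensor, page_vecs: torch.Tensor) -> float:
    """ColBERT-style late interaction: for each query token, take the max
    dot-product similarity against any page patch vector, then sum.

    query_vecs: (num_query_tokens, dim); page_vecs: (num_patches, dim)
    """
    sim = query_vecs @ page_vecs.T            # (tokens, patches)
    return sim.max(dim=1).values.sum().item()

# Toy usage with random embeddings (dim=128, matching the deleted search code):
query = torch.randn(8, 128)
pages = [torch.randn(600, 128) for _ in range(3)]
best_page = max(range(len(pages)), key=lambda i: maxsim_score(query, pages[i]))
```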
pyproject.toml (leann-core)

@@ -7,7 +7,7 @@ name = "leann-core"
 version = "0.3.5"
 description = "Core API and plugin system for LEANN"
 readme = "README.md"
-requires-python = ">=3.10"
+requires-python = ">=3.9"
 license = { text = "MIT" }

 # All required dependencies included
pyproject.toml (leann)

@@ -7,7 +7,7 @@ name = "leann"
 version = "0.3.5"
 description = "LEANN - The smallest vector index in the world. RAG Everything with LEANN!"
 readme = "README.md"
-requires-python = ">=3.10"
+requires-python = ">=3.9"
 license = { text = "MIT" }
 authors = [
     { name = "LEANN Team" }
@@ -18,10 +18,10 @@ classifiers = [
     "Intended Audience :: Developers",
     "License :: OSI Approved :: MIT License",
     "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.9",
     "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",
     "Programming Language :: Python :: 3.12",
-    "Programming Language :: Python :: 3.13",
 ]

 # Default installation: core + hnsw + diskann
pyproject.toml (leann-workspace)

@@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "leann-workspace"
 version = "0.1.0"
-requires-python = ">=3.10"
+requires-python = ">=3.9"

 dependencies = [
     "leann-core",