Compare commits
6 Commits
fix/securi
...
feature/co
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
360fdf575c | ||
|
|
0175bc9c20 | ||
|
|
af47dfdde7 | ||
|
|
f13bd02fbd | ||
|
|
86287d8832 | ||
|
|
13beb98164 |
48
README.md
48
README.md
@@ -379,6 +379,54 @@ python -m apps.code_rag --repo-dir "./my_codebase" --query "How does authenticat
|
||||
|
||||
</details>
|
||||
|
||||
### 🎨 ColQwen: Multimodal PDF Retrieval with Vision-Language Models
|
||||
|
||||
Search through PDFs using both text and visual understanding with ColQwen2/ColPali models. Perfect for research papers, technical documents, and any PDFs with complex layouts, figures, or diagrams.
|
||||
|
||||
> **🍎 Mac Users**: ColQwen is optimized for Apple Silicon with MPS acceleration for faster inference!
|
||||
|
||||
```bash
|
||||
# Build index from PDFs
|
||||
python -m apps.colqwen_rag build --pdfs ./my_papers/ --index research_papers
|
||||
|
||||
# Search with text queries
|
||||
python -m apps.colqwen_rag search research_papers "How does attention mechanism work?"
|
||||
|
||||
# Interactive Q&A
|
||||
python -m apps.colqwen_rag ask research_papers --interactive
|
||||
```
|
||||
|
||||
<details>
|
||||
<summary><strong>📋 Click to expand: ColQwen Setup & Usage</strong></summary>
|
||||
|
||||
#### Prerequisites
|
||||
```bash
|
||||
# Install dependencies
|
||||
uv pip install colpali_engine pdf2image pillow matplotlib qwen_vl_utils einops seaborn
|
||||
brew install poppler # macOS only, for PDF processing
|
||||
```
|
||||
|
||||
#### Build Index
|
||||
```bash
|
||||
python -m apps.colqwen_rag build \
|
||||
--pdfs ./pdf_directory/ \
|
||||
--index my_index \
|
||||
--model colqwen2 # or colpali
|
||||
```
|
||||
|
||||
#### Search
|
||||
```bash
|
||||
python -m apps.colqwen_rag search my_index "your question here" --top-k 5
|
||||
```
|
||||
|
||||
#### Models
|
||||
- **ColQwen2** (`colqwen2`): Latest vision-language model with improved performance
|
||||
- **ColPali** (`colpali`): Proven multimodal retriever
|
||||
|
||||
For detailed usage, see the [ColQwen Guide](docs/COLQWEN_GUIDE.md).
|
||||
|
||||
</details>
|
||||
|
||||
### 📧 Your Personal Email Secretary: RAG on Apple Mail!
|
||||
|
||||
> **Note:** The examples below currently support macOS only. Windows support coming soon.
|
||||
|
||||
218
apps/image_rag.py
Normal file
218
apps/image_rag.py
Normal file
@@ -0,0 +1,218 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
CLIP Image RAG Application
|
||||
|
||||
This application enables RAG (Retrieval-Augmented Generation) on images using CLIP embeddings.
|
||||
You can index a directory of images and search them using text queries.
|
||||
|
||||
Usage:
|
||||
python -m apps.image_rag --image-dir ./my_images/ --query "a sunset over mountains"
|
||||
python -m apps.image_rag --image-dir ./my_images/ --interactive
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import pickle
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
from PIL import Image
|
||||
from sentence_transformers import SentenceTransformer
|
||||
from tqdm import tqdm
|
||||
|
||||
from apps.base_rag_example import BaseRAGExample
|
||||
|
||||
|
||||
class ImageRAG(BaseRAGExample):
    """
    RAG application for images using CLIP embeddings.

    This class provides a complete RAG pipeline for image data, including
    CLIP embedding generation, indexing, and text-based image search.

    ``load_data()`` embeds every image it can open and caches the results in
    ``self._image_data``; ``build_index()`` then consumes that cache, so the
    two must be called in that order on the same instance.
    """

    def __init__(self):
        super().__init__(
            name="Image RAG",
            description="RAG application for images using CLIP embeddings",
            default_index_name="image_index",
        )
        # Override default embedding model to use CLIP
        self.embedding_model_default = "clip-ViT-L-14"
        self.embedding_mode_default = "sentence-transformers"
        # One dict per successfully embedded image: {"text", "metadata", "embedding"}.
        self._image_data: list[dict] = []

    def _add_specific_arguments(self, parser: argparse.ArgumentParser):
        """Add image-specific arguments."""
        image_group = parser.add_argument_group("Image Parameters")
        image_group.add_argument(
            "--image-dir",
            type=str,
            required=True,
            help="Directory containing images to index",
        )
        image_group.add_argument(
            "--image-extensions",
            type=str,
            nargs="+",
            default=[".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp"],
            help="Image file extensions to process (default: .jpg .jpeg .png .gif .bmp .webp)",
        )
        image_group.add_argument(
            "--batch-size",
            type=int,
            default=32,
            help="Batch size for CLIP embedding generation (default: 32)",
        )

    async def load_data(self, args) -> list[str]:
        """Load images, generate CLIP embeddings, and return text descriptions."""
        self._image_data = self._load_images_and_embeddings(args)
        return [entry["text"] for entry in self._image_data]

    @staticmethod
    def _find_image_files(image_dir: Path, extensions: list[str]) -> list[Path]:
        """Return the sorted files under *image_dir* whose name ends with one of *extensions*.

        Matching is case-insensitive and each file is visited exactly once, so
        case-insensitive filesystems do not yield duplicates and mixed-case
        suffixes such as ``.Jpg`` are not missed.
        """
        suffixes = tuple(ext.lower() for ext in extensions)
        return sorted(
            path
            for path in image_dir.rglob("*")
            if path.is_file() and path.name.lower().endswith(suffixes)
        )

    @staticmethod
    def _embed_batch(model, images: list, paths: list[Path], image_dir: Path) -> list[dict]:
        """Encode one batch of images and pair each embedding with its text and metadata."""
        embeddings = model.encode(
            images,
            convert_to_numpy=True,
            normalize_embeddings=True,
            batch_size=len(images),
            show_progress_bar=False,
        )
        return [
            {
                "text": f"Image: {img_path.name}\nPath: {img_path}",
                "metadata": {
                    "image_path": str(img_path),
                    "image_name": img_path.name,
                    "image_dir": str(image_dir),
                },
                "embedding": embedding.astype(np.float32),
            }
            for img_path, embedding in zip(paths, embeddings)
        ]

    def _load_images_and_embeddings(self, args) -> list[dict]:
        """Helper to process images and produce embeddings/metadata.

        Reads ``args.image_dir``, ``args.image_extensions``, ``args.batch_size``
        and ``args.max_items`` (the latter is not registered in this class;
        presumably it comes from the base parser).

        Returns:
            One dict per embedded image with ``text``, ``metadata`` and
            ``embedding`` (float32) keys.

        Raises:
            ValueError: If the directory is missing or not a directory, the
                batch size is not positive, or no matching images are found.

        Images that cannot be opened are reported and skipped. Errors raised
        by the embedding model itself are not per-image problems and are
        allowed to propagate.
        """
        image_dir = Path(args.image_dir)
        if not image_dir.exists():
            raise ValueError(f"Image directory does not exist: {image_dir}")
        if not image_dir.is_dir():
            raise ValueError(f"Image path is not a directory: {image_dir}")
        if args.batch_size <= 0:
            raise ValueError(f"--batch-size must be a positive integer, got {args.batch_size}")

        print(f"📸 Loading images from {image_dir}...")

        # Find all image files
        image_files = self._find_image_files(image_dir, args.image_extensions)
        if not image_files:
            raise ValueError(
                f"No images found in {image_dir} with extensions {args.image_extensions}"
            )

        print(f"✅ Found {len(image_files)} images")

        # Limit if max_items is set
        if args.max_items > 0:
            image_files = image_files[: args.max_items]
            print(f"📊 Processing {len(image_files)} images (limited by --max-items)")

        # Load CLIP model
        print("🔍 Loading CLIP model...")
        model = SentenceTransformer(self.embedding_model_default)

        # Process images and generate embeddings
        print("🖼️ Processing images and generating embeddings...")
        image_data: list[dict] = []
        batch_images: list = []
        batch_paths: list[Path] = []

        for image_path in tqdm(image_files, desc="Processing images"):
            # Only the load is guarded: a bad file must not poison the batch.
            try:
                # convert() returns an independent in-memory copy, so the
                # underlying file handle can be closed right away.
                with Image.open(image_path) as raw_image:
                    image = raw_image.convert("RGB")
            except Exception as e:  # PIL raises several unrelated exception types
                print(f"⚠️ Failed to process {image_path}: {e}")
                continue

            batch_images.append(image)
            batch_paths.append(image_path)

            # Process in batches
            if len(batch_images) >= args.batch_size:
                image_data.extend(self._embed_batch(model, batch_images, batch_paths, image_dir))
                batch_images = []
                batch_paths = []

        # Process remaining images
        if batch_images:
            image_data.extend(self._embed_batch(model, batch_images, batch_paths, image_dir))

        print(f"✅ Processed {len(image_data)} images")
        return image_data

    async def build_index(self, args, texts: list[str]) -> str:
        """Build index using pre-computed CLIP embeddings.

        The embeddings cached by ``load_data()`` are written to a temporary
        pickle file as an ``(ids, embeddings)`` tuple and handed to
        ``LeannBuilder.build_index_from_embeddings``.

        Returns:
            The path of the built index.

        Raises:
            RuntimeError: If no cached image data exists or its length does
                not match *texts*.
        """
        from leann.api import LeannBuilder

        if not self._image_data or len(self._image_data) != len(texts):
            raise RuntimeError("No image data found. Make sure load_data() ran successfully.")

        print("🔨 Building LEANN index with CLIP embeddings...")
        builder = LeannBuilder(
            backend_name=args.backend_name,
            embedding_model=self.embedding_model_default,
            embedding_mode=self.embedding_mode_default,
            is_recompute=False,
            distance_metric="cosine",
            graph_degree=args.graph_degree,
            build_complexity=args.build_complexity,
            is_compact=not args.no_compact,
        )

        for text, data in zip(texts, self._image_data):
            builder.add_text(text=text, metadata=data["metadata"])

        ids = [str(i) for i in range(len(self._image_data))]
        embeddings = np.array([data["embedding"] for data in self._image_data], dtype=np.float32)

        # delete=False because the builder re-opens the file by path; the
        # finally block below owns the cleanup, including when the dump fails.
        pkl_file = tempfile.NamedTemporaryFile(mode="wb", suffix=".pkl", delete=False)
        pkl_path = pkl_file.name
        try:
            with pkl_file:
                pickle.dump((ids, embeddings), pkl_file)

            index_path = str(Path(args.index_dir) / f"{self.default_index_name}.leann")
            builder.build_index_from_embeddings(index_path, pkl_path)
            print(f"✅ Index built successfully at {index_path}")
            return index_path
        finally:
            Path(pkl_path).unlink(missing_ok=True)
|
||||
|
||||
|
||||
def main():
    """Create the ``ImageRAG`` application and drive its async ``run()`` to completion."""
    from asyncio import run as run_until_complete

    rag_app = ImageRAG()
    run_until_complete(rag_app.run())


if __name__ == "__main__":
    main()
|
||||
@@ -7,7 +7,6 @@ for indexing in LEANN. It supports various Slack MCP server implementations and
|
||||
flexible message processing options.
|
||||
"""
|
||||
|
||||
import ast
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
@@ -147,16 +146,16 @@ class SlackMCPReader:
|
||||
match = re.search(r"'error':\s*(\{[^}]+\})", str(e))
|
||||
if match:
|
||||
try:
|
||||
error_dict = ast.literal_eval(match.group(1))
|
||||
except (ValueError, SyntaxError):
|
||||
error_dict = eval(match.group(1))
|
||||
except (ValueError, SyntaxError, NameError):
|
||||
pass
|
||||
else:
|
||||
# Try alternative format
|
||||
match = re.search(r"Failed to fetch messages:\s*(\{[^}]+\})", str(e))
|
||||
if match:
|
||||
try:
|
||||
error_dict = ast.literal_eval(match.group(1))
|
||||
except (ValueError, SyntaxError):
|
||||
error_dict = eval(match.group(1))
|
||||
except (ValueError, SyntaxError, NameError):
|
||||
pass
|
||||
|
||||
if self._is_cache_sync_error(error_dict):
|
||||
|
||||
@@ -60,20 +60,6 @@ python -m apps.colqwen_rag ask my_index --interactive
|
||||
- `help`: Show available commands
|
||||
- `quit`/`exit`/`q`: Exit interactive mode
|
||||
|
||||
## 🧪 Test & Reproduce Results
|
||||
|
||||
Run the reproduction test for issue #119:
|
||||
```bash
|
||||
python test_colqwen_reproduction.py
|
||||
```
|
||||
|
||||
This will:
|
||||
1. ✅ Check dependencies
|
||||
2. 📥 Download sample PDF (Attention Is All You Need paper)
|
||||
3. 🏗️ Build test index
|
||||
4. 🔍 Run sample queries
|
||||
5. 📊 Show how to generate similarity maps
|
||||
|
||||
## 🎨 Advanced: Similarity Maps
|
||||
|
||||
For visual similarity analysis, use the existing advanced script:
|
||||
Submodule packages/leann-backend-hnsw/third_party/faiss updated: 5952745237...e2d243c40d
@@ -1,162 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Test script to reproduce ColQwen results from issue #119
|
||||
https://github.com/yichuan-w/LEANN/issues/119
|
||||
|
||||
This script demonstrates the ColQwen workflow:
|
||||
1. Download sample PDF
|
||||
2. Convert to images
|
||||
3. Build multimodal index
|
||||
4. Run test queries
|
||||
5. Generate similarity maps
|
||||
"""
|
||||
|
||||
import importlib.util
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def main():
    """Run the ColQwen reproduction workflow for issue #119.

    Steps, each reported on stdout:
      1. Verify the script is run from the repository root.
      2. Check that torch, pdf2image and colpali_engine are importable.
      3. Download the sample PDF into ``test_pdfs/`` unless it already exists.
      4. Build an index and run three sample searches via ``apps.colqwen_rag``.
      5. Print pointers for interactive mode and similarity maps.

    The function returns ``None`` in every case; it stops early (after
    printing the reason) when a prerequisite or the index build fails.
    """
    import shlex
    import subprocess
    import sys
    import urllib.request

    print("🧪 ColQwen Reproduction Test - Issue #119")
    print("=" * 50)

    # Check if we're in the right directory
    repo_root = Path.cwd()
    if not (repo_root / "apps" / "colqwen_rag.py").exists():
        print("❌ Please run this script from the LEANN repository root")
        print(" cd /path/to/LEANN && python test_colqwen_reproduction.py")
        return

    print("✅ Repository structure looks good")

    # Step 1: Check dependencies
    print("\n📦 Checking dependencies...")
    try:
        import torch

        # find_spec avoids importing the heavy packages just to probe for them.
        for module_name in ("pdf2image", "colpali_engine"):
            if importlib.util.find_spec(module_name) is None:
                raise ImportError(f"{module_name} not found")

        print("✅ Core dependencies available")
        print(f" - PyTorch: {torch.__version__}")
        print(f" - CUDA available: {torch.cuda.is_available()}")
        print(
            f" - MPS available: {hasattr(torch.backends, 'mps') and torch.backends.mps.is_available()}"
        )
    except ImportError as e:
        print(f"❌ Missing dependency: {e}")
        print("\n📥 Install missing dependencies:")
        print(
            " uv pip install colpali_engine pdf2image pillow matplotlib qwen_vl_utils einops seaborn"
        )
        return

    # Step 2: Download sample PDF
    print("\n📄 Setting up sample PDF...")
    pdf_dir = repo_root / "test_pdfs"
    pdf_dir.mkdir(exist_ok=True)
    sample_pdf = pdf_dir / "attention_paper.pdf"

    if not sample_pdf.exists():
        print("📥 Downloading sample paper (Attention Is All You Need)...")
        # Download to a side file and rename on success, so an interrupted
        # transfer is never mistaken for a valid PDF on the next run.
        partial_pdf = sample_pdf.with_name(sample_pdf.name + ".part")
        try:
            urllib.request.urlretrieve("https://arxiv.org/pdf/1706.03762.pdf", partial_pdf)
            partial_pdf.replace(sample_pdf)
            print(f"✅ Downloaded: {sample_pdf}")
        except OSError as e:  # URLError/HTTPError/ContentTooShortError are OSError subclasses
            partial_pdf.unlink(missing_ok=True)
            print(f"❌ Download failed: {e}")
            print(" Please manually download a PDF to test_pdfs/attention_paper.pdf")
            return
    else:
        print(f"✅ Using existing PDF: {sample_pdf}")

    # Step 3: Test ColQwen RAG
    print("\n🚀 Testing ColQwen RAG...")

    # Run the CLI with the current interpreter and an argument list (no
    # shell), so paths with spaces and quotes inside queries are passed intact.
    cli_prefix = [sys.executable, "-m", "apps.colqwen_rag"]

    # Build index
    print("\n1️⃣ Building multimodal index...")
    build_cmd = [
        *cli_prefix,
        "build",
        "--pdfs",
        str(pdf_dir),
        "--index",
        "test_attention",
        "--model",
        "colqwen2",
        "--pages-dir",
        "test_pages",
    ]
    print(f" Command: {shlex.join(build_cmd)}")

    if subprocess.run(build_cmd, check=False).returncode != 0:
        print("❌ Index building failed")
        return
    print("✅ Index built successfully!")

    # Test search
    print("\n2️⃣ Testing search...")
    test_queries = [
        "How does attention mechanism work?",
        "What is the transformer architecture?",
        "How do you compute self-attention?",
    ]

    failed_searches = 0
    for query in test_queries:
        print(f"\n🔍 Query: '{query}'")
        search_cmd = [*cli_prefix, "search", "test_attention", query, "--top-k", "3"]
        print(f" Command: {shlex.join(search_cmd)}")

        if subprocess.run(search_cmd, check=False).returncode == 0:
            print("✅ Search completed")
        else:
            failed_searches += 1
            print("❌ Search failed")

    # Test interactive mode (briefly)
    print("\n3️⃣ Testing interactive mode...")
    print(" You can test interactive mode with:")
    print(" python -m apps.colqwen_rag ask test_attention --interactive")

    # Step 4: Test similarity maps (using existing script)
    print("\n4️⃣ Testing similarity maps...")
    similarity_script = (
        repo_root
        / "apps"
        / "multimodal"
        / "vision-based-pdf-multi-vector"
        / "multi-vector-leann-similarity-map.py"
    )

    if similarity_script.exists():
        print(" You can generate similarity maps with:")
        print(f" cd {similarity_script.parent}")
        print(" python multi-vector-leann-similarity-map.py")
        print(" (Edit the script to use your local PDF)")

    print("\n🎉 ColQwen reproduction test completed!")
    print("\n📋 Summary:")
    print(" ✅ Dependencies checked")
    print(" ✅ Sample PDF prepared")
    print(" ✅ Index building tested")
    # Report the real search outcome instead of an unconditional check mark.
    if failed_searches:
        print(f" ❌ Search functionality tested ({failed_searches}/{len(test_queries)} queries failed)")
    else:
        print(" ✅ Search functionality tested")
    print(" ✅ Interactive mode available")
    print(" ✅ Similarity maps available")

    print("\n🔗 Related repositories to check:")
    print(" - https://github.com/lightonai/fast-plaid")
    print(" - https://github.com/lightonai/pylate")
    print(" - https://github.com/stanford-futuredata/ColBERT")

    print("\n📝 Next steps:")
    print(" 1. Test with your own PDFs")
    print(" 2. Experiment with different queries")
    print(" 3. Generate similarity maps for visual analysis")
    print(" 4. Compare ColQwen2 vs ColPali performance")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user