Compare commits
1 commit
feature/co... → fix/securi...

| Author | SHA1 | Date |
|---|---|---|
|  | 697d247698 |  |
@@ -60,6 +60,20 @@ python -m apps.colqwen_rag ask my_index --interactive
- `help`: Show available commands
- `quit`/`exit`/`q`: Exit interactive mode

## 🧪 Test & Reproduce Results

Run the reproduction test for issue #119:

```bash
python test_colqwen_reproduction.py
```

This will:

1. ✅ Check dependencies
2. 📥 Download sample PDF (Attention Is All You Need paper)
3. 🏗️ Build test index
4. 🔍 Run sample queries
5. 📊 Show how to generate similarity maps

## 🎨 Advanced: Similarity Maps

For visual similarity analysis, use the existing advanced script:
README.md
@@ -379,54 +379,6 @@ python -m apps.code_rag --repo-dir "./my_codebase" --query "How does authenticat
</details>

### 🎨 ColQwen: Multimodal PDF Retrieval with Vision-Language Models

Search through PDFs using both text and visual understanding with ColQwen2/ColPali models. Perfect for research papers, technical documents, and any PDFs with complex layouts, figures, or diagrams.

> **🍎 Mac Users**: ColQwen is optimized for Apple Silicon with MPS acceleration for faster inference!

```bash
# Build index from PDFs
python -m apps.colqwen_rag build --pdfs ./my_papers/ --index research_papers

# Search with text queries
python -m apps.colqwen_rag search research_papers "How does attention mechanism work?"

# Interactive Q&A
python -m apps.colqwen_rag ask research_papers --interactive
```
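Under the hood, ColQwen2/ColPali-style retrieval is late interaction: the query is embedded as many token vectors, each page as many patch vectors, and a page's score sums each query token's best patch match (MaxSim). A minimal numpy sketch of that scoring idea, not LEANN's actual implementation (all shapes and values below are illustrative):

```python
import numpy as np

def maxsim_score(query_emb: np.ndarray, page_emb: np.ndarray) -> float:
    """Late-interaction (MaxSim) relevance of one page to one query.

    query_emb: (num_query_tokens, dim) L2-normalized query token embeddings
    page_emb:  (num_patches, dim)      L2-normalized page patch embeddings
    """
    sim = query_emb @ page_emb.T         # (tokens, patches) cosine similarities
    return float(sim.max(axis=1).sum())  # best patch per query token, summed

# Toy example: rank 3 hypothetical pages (196 patches each) for one query.
rng = np.random.default_rng(0)
query = rng.normal(size=(12, 128))
query /= np.linalg.norm(query, axis=1, keepdims=True)
pages = []
for _ in range(3):
    p = rng.normal(size=(196, 128))
    pages.append(p / np.linalg.norm(p, axis=1, keepdims=True))

scores = [maxsim_score(query, p) for p in pages]
print(sorted(range(len(pages)), key=lambda i: -scores[i]))  # pages by score
```

Only the scoring step is shown here; the real pipeline also converts PDFs to page images and stores the multi-vector index.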
<details>
<summary><strong>📋 Click to expand: ColQwen Setup & Usage</strong></summary>

#### Prerequisites
```bash
# Install dependencies
uv pip install colpali_engine pdf2image pillow matplotlib qwen_vl_utils einops seaborn
brew install poppler  # macOS only, for PDF processing
```

#### Build Index
```bash
python -m apps.colqwen_rag build \
    --pdfs ./pdf_directory/ \
    --index my_index \
    --model colqwen2  # or colpali
```

#### Search
```bash
python -m apps.colqwen_rag search my_index "your question here" --top-k 5
```

#### Models
- **ColQwen2** (`colqwen2`): Latest vision-language model with improved performance
- **ColPali** (`colpali`): Proven multimodal retriever

For detailed usage, see the [ColQwen Guide](docs/COLQWEN_GUIDE.md).

</details>

### 📧 Your Personal Email Secretary: RAG on Apple Mail!

> **Note:** The examples below currently support macOS only. Windows support coming soon.
apps/image_rag.py (deleted file)
@@ -1,218 +0,0 @@
#!/usr/bin/env python3
"""
CLIP Image RAG Application

This application enables RAG (Retrieval-Augmented Generation) on images using CLIP embeddings.
You can index a directory of images and search them using text queries.

Usage:
    python -m apps.image_rag --image-dir ./my_images/ --query "a sunset over mountains"
    python -m apps.image_rag --image-dir ./my_images/ --interactive
"""

import argparse
import pickle
import tempfile
from pathlib import Path

import numpy as np
from PIL import Image
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

from apps.base_rag_example import BaseRAGExample


class ImageRAG(BaseRAGExample):
    """
    RAG application for images using CLIP embeddings.

    This class provides a complete RAG pipeline for image data, including
    CLIP embedding generation, indexing, and text-based image search.
    """

    def __init__(self):
        super().__init__(
            name="Image RAG",
            description="RAG application for images using CLIP embeddings",
            default_index_name="image_index",
        )
        # Override default embedding model to use CLIP
        self.embedding_model_default = "clip-ViT-L-14"
        self.embedding_mode_default = "sentence-transformers"
        self._image_data: list[dict] = []

    def _add_specific_arguments(self, parser: argparse.ArgumentParser):
        """Add image-specific arguments."""
        image_group = parser.add_argument_group("Image Parameters")
        image_group.add_argument(
            "--image-dir",
            type=str,
            required=True,
            help="Directory containing images to index",
        )
        image_group.add_argument(
            "--image-extensions",
            type=str,
            nargs="+",
            default=[".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp"],
            help="Image file extensions to process (default: .jpg .jpeg .png .gif .bmp .webp)",
        )
        image_group.add_argument(
            "--batch-size",
            type=int,
            default=32,
            help="Batch size for CLIP embedding generation (default: 32)",
        )

    async def load_data(self, args) -> list[str]:
        """Load images, generate CLIP embeddings, and return text descriptions."""
        self._image_data = self._load_images_and_embeddings(args)
        return [entry["text"] for entry in self._image_data]

    def _load_images_and_embeddings(self, args) -> list[dict]:
        """Helper to process images and produce embeddings/metadata."""
        image_dir = Path(args.image_dir)
        if not image_dir.exists():
            raise ValueError(f"Image directory does not exist: {image_dir}")

        print(f"📸 Loading images from {image_dir}...")

        # Find all image files
        image_files = []
        for ext in args.image_extensions:
            image_files.extend(image_dir.rglob(f"*{ext}"))
            image_files.extend(image_dir.rglob(f"*{ext.upper()}"))

        if not image_files:
            raise ValueError(
                f"No images found in {image_dir} with extensions {args.image_extensions}"
            )

        print(f"✅ Found {len(image_files)} images")

        # Limit if max_items is set
        if args.max_items > 0:
            image_files = image_files[: args.max_items]
            print(f"📊 Processing {len(image_files)} images (limited by --max-items)")

        # Load CLIP model
        print("🔍 Loading CLIP model...")
        model = SentenceTransformer(self.embedding_model_default)

        # Process images and generate embeddings
        print("🖼️ Processing images and generating embeddings...")
        image_data = []
        batch_images = []
        batch_paths = []

        for image_path in tqdm(image_files, desc="Processing images"):
            try:
                image = Image.open(image_path).convert("RGB")
                batch_images.append(image)
                batch_paths.append(image_path)

                # Process in batches
                if len(batch_images) >= args.batch_size:
                    embeddings = model.encode(
                        batch_images,
                        convert_to_numpy=True,
                        normalize_embeddings=True,
                        batch_size=args.batch_size,
                        show_progress_bar=False,
                    )

                    for img_path, embedding in zip(batch_paths, embeddings):
                        image_data.append(
                            {
                                "text": f"Image: {img_path.name}\nPath: {img_path}",
                                "metadata": {
                                    "image_path": str(img_path),
                                    "image_name": img_path.name,
                                    "image_dir": str(image_dir),
                                },
                                "embedding": embedding.astype(np.float32),
                            }
                        )

                    batch_images = []
                    batch_paths = []

            except Exception as e:
                print(f"⚠️ Failed to process {image_path}: {e}")
                continue

        # Process remaining images
        if batch_images:
            embeddings = model.encode(
                batch_images,
                convert_to_numpy=True,
                normalize_embeddings=True,
                batch_size=len(batch_images),
                show_progress_bar=False,
            )

            for img_path, embedding in zip(batch_paths, embeddings):
                image_data.append(
                    {
                        "text": f"Image: {img_path.name}\nPath: {img_path}",
                        "metadata": {
                            "image_path": str(img_path),
                            "image_name": img_path.name,
                            "image_dir": str(image_dir),
                        },
                        "embedding": embedding.astype(np.float32),
                    }
                )

        print(f"✅ Processed {len(image_data)} images")
        return image_data

    async def build_index(self, args, texts: list[str]) -> str:
        """Build index using pre-computed CLIP embeddings."""
        from leann.api import LeannBuilder

        if not self._image_data or len(self._image_data) != len(texts):
            raise RuntimeError("No image data found. Make sure load_data() ran successfully.")

        print("🔨 Building LEANN index with CLIP embeddings...")
        builder = LeannBuilder(
            backend_name=args.backend_name,
            embedding_model=self.embedding_model_default,
            embedding_mode=self.embedding_mode_default,
            is_recompute=False,
            distance_metric="cosine",
            graph_degree=args.graph_degree,
            build_complexity=args.build_complexity,
            is_compact=not args.no_compact,
        )

        for text, data in zip(texts, self._image_data):
            builder.add_text(text=text, metadata=data["metadata"])

        ids = [str(i) for i in range(len(self._image_data))]
        embeddings = np.array([data["embedding"] for data in self._image_data], dtype=np.float32)

        with tempfile.NamedTemporaryFile(mode="wb", suffix=".pkl", delete=False) as f:
            pickle.dump((ids, embeddings), f)
            pkl_path = f.name

        try:
            index_path = str(Path(args.index_dir) / f"{self.default_index_name}.leann")
            builder.build_index_from_embeddings(index_path, pkl_path)
            print(f"✅ Index built successfully at {index_path}")
            return index_path
        finally:
            Path(pkl_path).unlink()


def main():
    """Main entry point for the image RAG application."""
    import asyncio

    app = ImageRAG()
    asyncio.run(app.run())


if __name__ == "__main__":
    main()
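For context on the deletion above: image_rag.py relied on CLIP's shared text-image embedding space, encoding images and text queries with the same model and ranking by cosine similarity. A self-contained sketch of that retrieval idea, assuming the same `clip-ViT-L-14` checkpoint used in the file (the image paths are hypothetical):

```python
# Minimal sketch of CLIP text -> image search with sentence-transformers,
# the mechanism the deleted app used. Image paths are hypothetical.
import numpy as np
from PIL import Image
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("clip-ViT-L-14")

image_paths = ["cat.jpg", "beach.jpg"]  # hypothetical files
images = [Image.open(p).convert("RGB") for p in image_paths]

img_emb = model.encode(images, convert_to_numpy=True, normalize_embeddings=True)
txt_emb = model.encode(
    ["a sunset over mountains"], convert_to_numpy=True, normalize_embeddings=True
)

scores = txt_emb @ img_emb.T  # cosine similarity (embeddings are normalized)
best = int(np.argmax(scores))
print(f"Best match: {image_paths[best]} (score {scores[0, best]:.3f})")
```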
@@ -7,6 +7,7 @@ for indexing in LEANN. It supports various Slack MCP server implementations and
 flexible message processing options.
 """

+import ast
 import asyncio
 import json
 import logging

@@ -146,16 +147,16 @@ class SlackMCPReader:
         match = re.search(r"'error':\s*(\{[^}]+\})", str(e))
         if match:
             try:
-                error_dict = eval(match.group(1))
-            except (ValueError, SyntaxError, NameError):
+                error_dict = ast.literal_eval(match.group(1))
+            except (ValueError, SyntaxError):
                 pass
         else:
             # Try alternative format
             match = re.search(r"Failed to fetch messages:\s*(\{[^}]+\})", str(e))
             if match:
                 try:
-                    error_dict = eval(match.group(1))
-                except (ValueError, SyntaxError, NameError):
+                    error_dict = ast.literal_eval(match.group(1))
+                except (ValueError, SyntaxError):
                     pass

         if self._is_cache_sync_error(error_dict):
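The hunk above is the security fix: `ast.literal_eval` parses only Python literals, so a dict-shaped error string pulled out of exception text can no longer execute code the way `eval` could (this is also why `NameError`, which only `eval` would raise on bare names, drops out of the except tuple). A minimal sketch of the difference, using a hypothetical error string:

```python
# ast.literal_eval only parses literals; eval executes arbitrary expressions.
import ast

error_text = "{'ok': False, 'error': 'channel_not_found'}"  # hypothetical payload

error_dict = ast.literal_eval(error_text)  # safe: returns a plain dict
print(error_dict["error"])                 # -> channel_not_found

# Anything that is not a pure literal is rejected instead of executed:
try:
    ast.literal_eval("__import__('os').system('rm -rf /')")
except (ValueError, SyntaxError):
    print("rejected non-literal input")
```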
Submodule packages/leann-backend-hnsw/third_party/faiss updated: e2d243c40d...5952745237
test_colqwen_reproduction.py (new file, 162 lines)
@@ -0,0 +1,162 @@
#!/usr/bin/env python3
"""
Test script to reproduce ColQwen results from issue #119
https://github.com/yichuan-w/LEANN/issues/119

This script demonstrates the ColQwen workflow:
1. Download sample PDF
2. Convert to images
3. Build multimodal index
4. Run test queries
5. Generate similarity maps
"""

import importlib.util
import os
from pathlib import Path


def main():
    print("🧪 ColQwen Reproduction Test - Issue #119")
    print("=" * 50)

    # Check if we're in the right directory
    repo_root = Path.cwd()
    if not (repo_root / "apps" / "colqwen_rag.py").exists():
        print("❌ Please run this script from the LEANN repository root")
        print(" cd /path/to/LEANN && python test_colqwen_reproduction.py")
        return

    print("✅ Repository structure looks good")

    # Step 1: Check dependencies
    print("\n📦 Checking dependencies...")
    try:
        import torch

        # Check if pdf2image is available
        if importlib.util.find_spec("pdf2image") is None:
            raise ImportError("pdf2image not found")
        # Check if colpali_engine is available
        if importlib.util.find_spec("colpali_engine") is None:
            raise ImportError("colpali_engine not found")

        print("✅ Core dependencies available")
        print(f" - PyTorch: {torch.__version__}")
        print(f" - CUDA available: {torch.cuda.is_available()}")
        print(
            f" - MPS available: {hasattr(torch.backends, 'mps') and torch.backends.mps.is_available()}"
        )
    except ImportError as e:
        print(f"❌ Missing dependency: {e}")
        print("\n📥 Install missing dependencies:")
        print(
            " uv pip install colpali_engine pdf2image pillow matplotlib qwen_vl_utils einops seaborn"
        )
        return

    # Step 2: Download sample PDF
    print("\n📄 Setting up sample PDF...")
    pdf_dir = repo_root / "test_pdfs"
    pdf_dir.mkdir(exist_ok=True)
    sample_pdf = pdf_dir / "attention_paper.pdf"

    if not sample_pdf.exists():
        print("📥 Downloading sample paper (Attention Is All You Need)...")
        import urllib.request

        try:
            urllib.request.urlretrieve("https://arxiv.org/pdf/1706.03762.pdf", sample_pdf)
            print(f"✅ Downloaded: {sample_pdf}")
        except Exception as e:
            print(f"❌ Download failed: {e}")
            print(" Please manually download a PDF to test_pdfs/attention_paper.pdf")
            return
    else:
        print(f"✅ Using existing PDF: {sample_pdf}")

    # Step 3: Test ColQwen RAG
    print("\n🚀 Testing ColQwen RAG...")

    # Build index
    print("\n1️⃣ Building multimodal index...")
    build_cmd = f"python -m apps.colqwen_rag build --pdfs {pdf_dir} --index test_attention --model colqwen2 --pages-dir test_pages"
    print(f" Command: {build_cmd}")

    try:
        result = os.system(build_cmd)
        if result == 0:
            print("✅ Index built successfully!")
        else:
            print("❌ Index building failed")
            return
    except Exception as e:
        print(f"❌ Error building index: {e}")
        return

    # Test search
    print("\n2️⃣ Testing search...")
    test_queries = [
        "How does attention mechanism work?",
        "What is the transformer architecture?",
        "How do you compute self-attention?",
    ]

    for query in test_queries:
        print(f"\n🔍 Query: '{query}'")
        search_cmd = f'python -m apps.colqwen_rag search test_attention "{query}" --top-k 3'
        print(f" Command: {search_cmd}")

        try:
            result = os.system(search_cmd)
            if result == 0:
                print("✅ Search completed")
            else:
                print("❌ Search failed")
        except Exception as e:
            print(f"❌ Search error: {e}")

    # Test interactive mode (briefly)
    print("\n3️⃣ Testing interactive mode...")
    print(" You can test interactive mode with:")
    print(" python -m apps.colqwen_rag ask test_attention --interactive")

    # Step 4: Test similarity maps (using existing script)
    print("\n4️⃣ Testing similarity maps...")
    similarity_script = (
        repo_root
        / "apps"
        / "multimodal"
        / "vision-based-pdf-multi-vector"
        / "multi-vector-leann-similarity-map.py"
    )

    if similarity_script.exists():
        print(" You can generate similarity maps with:")
        print(f" cd {similarity_script.parent}")
        print(" python multi-vector-leann-similarity-map.py")
        print(" (Edit the script to use your local PDF)")

    print("\n🎉 ColQwen reproduction test completed!")
    print("\n📋 Summary:")
    print(" ✅ Dependencies checked")
    print(" ✅ Sample PDF prepared")
    print(" ✅ Index building tested")
    print(" ✅ Search functionality tested")
    print(" ✅ Interactive mode available")
    print(" ✅ Similarity maps available")

    print("\n🔗 Related repositories to check:")
    print(" - https://github.com/lightonai/fast-plaid")
    print(" - https://github.com/lightonai/pylate")
    print(" - https://github.com/stanford-futuredata/ColBERT")

    print("\n📝 Next steps:")
    print(" 1. Test with your own PDFs")
    print(" 2. Experiment with different queries")
    print(" 3. Generate similarity maps for visual analysis")
    print(" 4. Compare ColQwen2 vs ColPali performance")


if __name__ == "__main__":
    main()