Compare commits


1 Commit

| Author | SHA1 | Message | Date |
| --- | --- | --- | --- |
| aakash | 697d247698 | fix security vulnerability: replace eval() | 2025-11-13 11:12:31 -08:00 |
6 changed files with 182 additions and 271 deletions

View File

@@ -60,6 +60,20 @@ python -m apps.colqwen_rag ask my_index --interactive
- `help`: Show available commands
- `quit`/`exit`/`q`: Exit interactive mode
## 🧪 Test & Reproduce Results
Run the reproduction test for issue #119:
```bash
python test_colqwen_reproduction.py
```
This will:
1. ✅ Check dependencies
2. 📥 Download sample PDF (Attention Is All You Need paper)
3. 🏗️ Build test index
4. 🔍 Run sample queries
5. 📊 Show how to generate similarity maps
## 🎨 Advanced: Similarity Maps
For visual similarity analysis, use the existing advanced script:

View File

@@ -379,54 +379,6 @@ python -m apps.code_rag --repo-dir "./my_codebase" --query "How does authenticat
</details>
### 🎨 ColQwen: Multimodal PDF Retrieval with Vision-Language Models
Search through PDFs using both text and visual understanding with ColQwen2/ColPali models. Perfect for research papers, technical documents, and any PDFs with complex layouts, figures, or diagrams.
> **🍎 Mac Users**: ColQwen is optimized for Apple Silicon with MPS acceleration for faster inference!
```bash
# Build index from PDFs
python -m apps.colqwen_rag build --pdfs ./my_papers/ --index research_papers
# Search with text queries
python -m apps.colqwen_rag search research_papers "How does attention mechanism work?"
# Interactive Q&A
python -m apps.colqwen_rag ask research_papers --interactive
```
<details>
<summary><strong>📋 Click to expand: ColQwen Setup & Usage</strong></summary>
#### Prerequisites
```bash
# Install dependencies
uv pip install colpali_engine pdf2image pillow matplotlib qwen_vl_utils einops seaborn
brew install poppler # macOS only, for PDF processing
```
#### Build Index
```bash
python -m apps.colqwen_rag build \
--pdfs ./pdf_directory/ \
--index my_index \
--model colqwen2 # or colpali
```
#### Search
```bash
python -m apps.colqwen_rag search my_index "your question here" --top-k 5
```
#### Models
- **ColQwen2** (`colqwen2`): Latest vision-language model with improved performance
- **ColPali** (`colpali`): Proven multimodal retriever
For detailed usage, see the [ColQwen Guide](docs/COLQWEN_GUIDE.md).
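For programmatic use outside the CLI, a minimal loading sketch follows. This assumes `colpali_engine`'s `ColQwen2`/`ColQwen2Processor` classes and the `vidore/colqwen2-v1.0` checkpoint; verify both against your installed version.
```python
# Hedged sketch: drive ColQwen2 directly via colpali_engine
# (class and checkpoint names assumed, not taken from this repo).
import torch
from colpali_engine.models import ColQwen2, ColQwen2Processor

device = "mps" if torch.backends.mps.is_available() else "cpu"
model = ColQwen2.from_pretrained(
    "vidore/colqwen2-v1.0",  # assumed checkpoint name
    torch_dtype=torch.bfloat16,
    device_map=device,
).eval()
processor = ColQwen2Processor.from_pretrained("vidore/colqwen2-v1.0")

# Embed a text query; page images go through processor.process_images(...)
# the same way, yielding one multi-vector embedding per page.
batch = processor.process_queries(["How does attention work?"]).to(model.device)
with torch.no_grad():
    query_embeddings = model(**batch)  # one vector per query token
```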
</details>
### 📧 Your Personal Email Secretary: RAG on Apple Mail!
> **Note:** The examples below currently support macOS only. Windows support coming soon.

View File

@@ -1,218 +0,0 @@
#!/usr/bin/env python3
"""
CLIP Image RAG Application
This application enables RAG (Retrieval-Augmented Generation) on images using CLIP embeddings.
You can index a directory of images and search them using text queries.
Usage:
python -m apps.image_rag --image-dir ./my_images/ --query "a sunset over mountains"
python -m apps.image_rag --image-dir ./my_images/ --interactive
"""
import argparse
import pickle
import tempfile
from pathlib import Path
import numpy as np
from PIL import Image
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
from apps.base_rag_example import BaseRAGExample
class ImageRAG(BaseRAGExample):
"""
RAG application for images using CLIP embeddings.
This class provides a complete RAG pipeline for image data, including
CLIP embedding generation, indexing, and text-based image search.
"""
def __init__(self):
super().__init__(
name="Image RAG",
description="RAG application for images using CLIP embeddings",
default_index_name="image_index",
)
# Override default embedding model to use CLIP
self.embedding_model_default = "clip-ViT-L-14"
self.embedding_mode_default = "sentence-transformers"
self._image_data: list[dict] = []
def _add_specific_arguments(self, parser: argparse.ArgumentParser):
"""Add image-specific arguments."""
image_group = parser.add_argument_group("Image Parameters")
image_group.add_argument(
"--image-dir",
type=str,
required=True,
help="Directory containing images to index",
)
image_group.add_argument(
"--image-extensions",
type=str,
nargs="+",
default=[".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp"],
help="Image file extensions to process (default: .jpg .jpeg .png .gif .bmp .webp)",
)
image_group.add_argument(
"--batch-size",
type=int,
default=32,
help="Batch size for CLIP embedding generation (default: 32)",
)
async def load_data(self, args) -> list[str]:
"""Load images, generate CLIP embeddings, and return text descriptions."""
self._image_data = self._load_images_and_embeddings(args)
return [entry["text"] for entry in self._image_data]
def _load_images_and_embeddings(self, args) -> list[dict]:
"""Helper to process images and produce embeddings/metadata."""
image_dir = Path(args.image_dir)
if not image_dir.exists():
raise ValueError(f"Image directory does not exist: {image_dir}")
print(f"📸 Loading images from {image_dir}...")
# Find all image files
image_files = []
for ext in args.image_extensions:
image_files.extend(image_dir.rglob(f"*{ext}"))
image_files.extend(image_dir.rglob(f"*{ext.upper()}"))
if not image_files:
raise ValueError(
f"No images found in {image_dir} with extensions {args.image_extensions}"
)
print(f"✅ Found {len(image_files)} images")
# Limit if max_items is set
if args.max_items > 0:
image_files = image_files[: args.max_items]
print(f"📊 Processing {len(image_files)} images (limited by --max-items)")
# Load CLIP model
print("🔍 Loading CLIP model...")
model = SentenceTransformer(self.embedding_model_default)
# Process images and generate embeddings
print("🖼️ Processing images and generating embeddings...")
image_data = []
batch_images = []
batch_paths = []
for image_path in tqdm(image_files, desc="Processing images"):
try:
image = Image.open(image_path).convert("RGB")
batch_images.append(image)
batch_paths.append(image_path)
# Process in batches
if len(batch_images) >= args.batch_size:
embeddings = model.encode(
batch_images,
convert_to_numpy=True,
normalize_embeddings=True,
batch_size=args.batch_size,
show_progress_bar=False,
)
for img_path, embedding in zip(batch_paths, embeddings):
image_data.append(
{
"text": f"Image: {img_path.name}\nPath: {img_path}",
"metadata": {
"image_path": str(img_path),
"image_name": img_path.name,
"image_dir": str(image_dir),
},
"embedding": embedding.astype(np.float32),
}
)
batch_images = []
batch_paths = []
except Exception as e:
print(f"⚠️ Failed to process {image_path}: {e}")
continue
# Process remaining images
if batch_images:
embeddings = model.encode(
batch_images,
convert_to_numpy=True,
normalize_embeddings=True,
batch_size=len(batch_images),
show_progress_bar=False,
)
for img_path, embedding in zip(batch_paths, embeddings):
image_data.append(
{
"text": f"Image: {img_path.name}\nPath: {img_path}",
"metadata": {
"image_path": str(img_path),
"image_name": img_path.name,
"image_dir": str(image_dir),
},
"embedding": embedding.astype(np.float32),
}
)
print(f"✅ Processed {len(image_data)} images")
return image_data
async def build_index(self, args, texts: list[str]) -> str:
"""Build index using pre-computed CLIP embeddings."""
from leann.api import LeannBuilder
if not self._image_data or len(self._image_data) != len(texts):
raise RuntimeError("No image data found. Make sure load_data() ran successfully.")
print("🔨 Building LEANN index with CLIP embeddings...")
builder = LeannBuilder(
backend_name=args.backend_name,
embedding_model=self.embedding_model_default,
embedding_mode=self.embedding_mode_default,
is_recompute=False,
distance_metric="cosine",
graph_degree=args.graph_degree,
build_complexity=args.build_complexity,
is_compact=not args.no_compact,
)
for text, data in zip(texts, self._image_data):
builder.add_text(text=text, metadata=data["metadata"])
ids = [str(i) for i in range(len(self._image_data))]
embeddings = np.array([data["embedding"] for data in self._image_data], dtype=np.float32)
with tempfile.NamedTemporaryFile(mode="wb", suffix=".pkl", delete=False) as f:
pickle.dump((ids, embeddings), f)
pkl_path = f.name
try:
index_path = str(Path(args.index_dir) / f"{self.default_index_name}.leann")
builder.build_index_from_embeddings(index_path, pkl_path)
print(f"✅ Index built successfully at {index_path}")
return index_path
finally:
Path(pkl_path).unlink()
def main():
"""Main entry point for the image RAG application."""
import asyncio
app = ImageRAG()
asyncio.run(app.run())
if __name__ == "__main__":
main()

View File

@@ -7,6 +7,7 @@ for indexing in LEANN. It supports various Slack MCP server implementations and
flexible message processing options.
"""
+import ast
import asyncio
import json
import logging
@@ -146,16 +147,16 @@ class SlackMCPReader:
match = re.search(r"'error':\s*(\{[^}]+\})", str(e)) match = re.search(r"'error':\s*(\{[^}]+\})", str(e))
if match: if match:
try: try:
error_dict = eval(match.group(1)) error_dict = ast.literal_eval(match.group(1))
except (ValueError, SyntaxError, NameError): except (ValueError, SyntaxError):
pass pass
else: else:
# Try alternative format # Try alternative format
match = re.search(r"Failed to fetch messages:\s*(\{[^}]+\})", str(e)) match = re.search(r"Failed to fetch messages:\s*(\{[^}]+\})", str(e))
if match: if match:
try: try:
error_dict = eval(match.group(1)) error_dict = ast.literal_eval(match.group(1))
except (ValueError, SyntaxError, NameError): except (ValueError, SyntaxError):
pass pass
if self._is_cache_sync_error(error_dict): if self._is_cache_sync_error(error_dict):
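The swap above is the heart of the fix: `eval()` executes whatever expression the error string contains, while `ast.literal_eval()` only parses Python literals and raises on anything else, which is also why `NameError` is dropped from the except clause. A minimal sketch of the difference, using hypothetical payloads rather than real Slack error strings:
```python
import ast

# A well-formed error payload (hypothetical example) parses as a plain literal.
payload = "{'ok': False, 'error': 'channel_not_found'}"
print(ast.literal_eval(payload))  # -> {'ok': False, 'error': 'channel_not_found'}

# A hostile payload that eval() would happily execute is rejected instead.
hostile = "{'error': __import__('os').system('echo pwned')}"
try:
    ast.literal_eval(hostile)
except (ValueError, SyntaxError) as exc:
    print(f"rejected: {type(exc).__name__}: {exc}")
```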

View File

@@ -0,0 +1,162 @@
#!/usr/bin/env python3
"""
Test script to reproduce ColQwen results from issue #119
https://github.com/yichuan-w/LEANN/issues/119
This script demonstrates the ColQwen workflow:
1. Download sample PDF
2. Convert to images
3. Build multimodal index
4. Run test queries
5. Generate similarity maps
"""
import importlib.util
import os
from pathlib import Path
def main():
print("🧪 ColQwen Reproduction Test - Issue #119")
print("=" * 50)
# Check if we're in the right directory
repo_root = Path.cwd()
if not (repo_root / "apps" / "colqwen_rag.py").exists():
print("❌ Please run this script from the LEANN repository root")
print(" cd /path/to/LEANN && python test_colqwen_reproduction.py")
return
print("✅ Repository structure looks good")
# Step 1: Check dependencies
print("\n📦 Checking dependencies...")
try:
import torch
# Check if pdf2image is available
if importlib.util.find_spec("pdf2image") is None:
raise ImportError("pdf2image not found")
# Check if colpali_engine is available
if importlib.util.find_spec("colpali_engine") is None:
raise ImportError("colpali_engine not found")
print("✅ Core dependencies available")
print(f" - PyTorch: {torch.__version__}")
print(f" - CUDA available: {torch.cuda.is_available()}")
print(
f" - MPS available: {hasattr(torch.backends, 'mps') and torch.backends.mps.is_available()}"
)
except ImportError as e:
print(f"❌ Missing dependency: {e}")
print("\n📥 Install missing dependencies:")
print(
" uv pip install colpali_engine pdf2image pillow matplotlib qwen_vl_utils einops seaborn"
)
return
# Step 2: Download sample PDF
print("\n📄 Setting up sample PDF...")
pdf_dir = repo_root / "test_pdfs"
pdf_dir.mkdir(exist_ok=True)
sample_pdf = pdf_dir / "attention_paper.pdf"
if not sample_pdf.exists():
print("📥 Downloading sample paper (Attention Is All You Need)...")
import urllib.request
try:
urllib.request.urlretrieve("https://arxiv.org/pdf/1706.03762.pdf", sample_pdf)
print(f"✅ Downloaded: {sample_pdf}")
except Exception as e:
print(f"❌ Download failed: {e}")
print(" Please manually download a PDF to test_pdfs/attention_paper.pdf")
return
else:
print(f"✅ Using existing PDF: {sample_pdf}")
# Step 3: Test ColQwen RAG
print("\n🚀 Testing ColQwen RAG...")
# Build index
print("\n1⃣ Building multimodal index...")
build_cmd = f"python -m apps.colqwen_rag build --pdfs {pdf_dir} --index test_attention --model colqwen2 --pages-dir test_pages"
print(f" Command: {build_cmd}")
try:
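        # os.system returns the shell's exit status; 0 means the command succeeded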
result = os.system(build_cmd)
if result == 0:
print("✅ Index built successfully!")
else:
print("❌ Index building failed")
return
except Exception as e:
print(f"❌ Error building index: {e}")
return
# Test search
print("\n2⃣ Testing search...")
test_queries = [
"How does attention mechanism work?",
"What is the transformer architecture?",
"How do you compute self-attention?",
]
for query in test_queries:
print(f"\n🔍 Query: '{query}'")
search_cmd = f'python -m apps.colqwen_rag search test_attention "{query}" --top-k 3'
print(f" Command: {search_cmd}")
try:
result = os.system(search_cmd)
if result == 0:
print("✅ Search completed")
else:
print("❌ Search failed")
except Exception as e:
print(f"❌ Search error: {e}")
# Test interactive mode (briefly)
print("\n3⃣ Testing interactive mode...")
print(" You can test interactive mode with:")
print(" python -m apps.colqwen_rag ask test_attention --interactive")
# Step 4: Test similarity maps (using existing script)
print("\n4⃣ Testing similarity maps...")
similarity_script = (
repo_root
/ "apps"
/ "multimodal"
/ "vision-based-pdf-multi-vector"
/ "multi-vector-leann-similarity-map.py"
)
if similarity_script.exists():
print(" You can generate similarity maps with:")
print(f" cd {similarity_script.parent}")
print(" python multi-vector-leann-similarity-map.py")
print(" (Edit the script to use your local PDF)")
print("\n🎉 ColQwen reproduction test completed!")
print("\n📋 Summary:")
print(" ✅ Dependencies checked")
print(" ✅ Sample PDF prepared")
print(" ✅ Index building tested")
print(" ✅ Search functionality tested")
print(" ✅ Interactive mode available")
print(" ✅ Similarity maps available")
print("\n🔗 Related repositories to check:")
print(" - https://github.com/lightonai/fast-plaid")
print(" - https://github.com/lightonai/pylate")
print(" - https://github.com/stanford-futuredata/ColBERT")
print("\n📝 Next steps:")
print(" 1. Test with your own PDFs")
print(" 2. Experiment with different queries")
print(" 3. Generate similarity maps for visual analysis")
print(" 4. Compare ColQwen2 vs ColPali performance")
if __name__ == "__main__":
main()