Compare commits

..

5 Commits

Author SHA1 Message Date
aakash
360fdf575c feat: Add ColQwen multimodal PDF retrieval integration 2025-12-19 13:54:38 -08:00
aakash
0175bc9c20 docs: Add ColQwen guide to docs directory
Add COLQWEN_GUIDE.md to docs/ directory for proper documentation structure.
This file is referenced in the README and needs to be tracked in git.
2025-12-07 09:57:14 -08:00
aakash
af47dfdde7 fix: Update ColQwen guide link to docs/ directory 2025-12-06 03:33:02 -08:00
aakash
f13bd02fbd docs: Add ColQwen multimodal PDF retrieval to README
Add brief introduction and usage guide for ColQwen integration,
similar to other RAG application sections in the README.

- Quick start examples for building, searching, and interactive Q&A
- Setup instructions with prerequisites
- Model options (ColQwen2 vs ColPali)
- Link to detailed ColQwen guide
2025-12-06 03:28:08 -08:00
aakash
86287d8832 Revert unnecessary faiss submodule update
Reset faiss submodule to match main branch to avoid unnecessary changes
2025-12-03 18:32:04 -08:00
5 changed files with 58 additions and 195 deletions

View File

@@ -379,6 +379,54 @@ python -m apps.code_rag --repo-dir "./my_codebase" --query "How does authenticat
</details>
### 🎨 ColQwen: Multimodal PDF Retrieval with Vision-Language Models
Search through PDFs using both text and visual understanding with ColQwen2/ColPali models. Perfect for research papers, technical documents, and any PDFs with complex layouts, figures, or diagrams.
> **🍎 Mac Users**: ColQwen is optimized for Apple Silicon with MPS acceleration for faster inference!
```bash
# Build index from PDFs
python -m apps.colqwen_rag build --pdfs ./my_papers/ --index research_papers
# Search with text queries
python -m apps.colqwen_rag search research_papers "How does attention mechanism work?"
# Interactive Q&A
python -m apps.colqwen_rag ask research_papers --interactive
```
<details>
<summary><strong>📋 Click to expand: ColQwen Setup & Usage</strong></summary>
#### Prerequisites
```bash
# Install dependencies
uv pip install colpali_engine pdf2image pillow matplotlib qwen_vl_utils einops seaborn
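brew install poppler # poppler is required by pdf2image for PDF rendering; this command is for macOS (on Debian/Ubuntu: sudo apt-get install poppler-utils)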
```
#### Build Index
```bash
python -m apps.colqwen_rag build \
--pdfs ./pdf_directory/ \
--index my_index \
--model colqwen2 # or colpali
```
#### Search
```bash
python -m apps.colqwen_rag search my_index "your question here" --top-k 5
```
#### Models
- **ColQwen2** (`colqwen2`): Latest vision-language model with improved performance
- **ColPali** (`colpali`): Proven multimodal retriever
For detailed usage, see the [ColQwen Guide](docs/COLQWEN_GUIDE.md).
</details>
### 📧 Your Personal Email Secretary: RAG on Apple Mail!
> **Note:** The examples below currently support macOS only. Windows support coming soon.

View File

@@ -60,20 +60,6 @@ python -m apps.colqwen_rag ask my_index --interactive
- `help`: Show available commands
- `quit`/`exit`/`q`: Exit interactive mode
## 🧪 Test & Reproduce Results
Run the reproduction test for issue #119:
```bash
python test_colqwen_reproduction.py
```
This will:
1. ✅ Check dependencies
2. 📥 Download sample PDF (Attention Is All You Need paper)
3. 🏗️ Build test index
4. 🔍 Run sample queries
5. 📊 Show how to generate similarity maps
## 🎨 Advanced: Similarity Maps
For visual similarity analysis, use the existing advanced script:

View File

@@ -1162,11 +1162,6 @@ Examples:
print(f"Warning: Could not process {file_path}: {e}")
# Load other file types with default reader
# Exclude PDFs from code_extensions if they were already processed separately
other_file_extensions = code_extensions
if should_process_pdfs and ".pdf" in code_extensions:
other_file_extensions = [ext for ext in code_extensions if ext != ".pdf"]
try:
# Create a custom file filter function using our PathSpec
def file_filter(
@@ -1182,19 +1177,15 @@ Examples:
except (ValueError, OSError):
return True # Include files that can't be processed
# Only load other file types if there are extensions to process
if other_file_extensions:
other_docs = SimpleDirectoryReader(
docs_dir,
recursive=True,
encoding="utf-8",
required_exts=other_file_extensions,
file_extractor={}, # Use default extractors
exclude_hidden=not include_hidden,
filename_as_id=True,
).load_data(show_progress=True)
else:
other_docs = []
other_docs = SimpleDirectoryReader(
docs_dir,
recursive=True,
encoding="utf-8",
required_exts=code_extensions,
file_extractor={}, # Use default extractors
exclude_hidden=not include_hidden,
filename_as_id=True,
).load_data(show_progress=True)
# Filter documents after loading based on gitignore rules
filtered_docs = []

View File

@@ -1,162 +0,0 @@
#!/usr/bin/env python3
"""
Test script to reproduce ColQwen results from issue #119
https://github.com/yichuan-w/LEANN/issues/119
This script demonstrates the ColQwen workflow:
1. Download sample PDF
2. Convert to images
3. Build multimodal index
4. Run test queries
5. Generate similarity maps
"""
import importlib.util
import os
from pathlib import Path
def _run_colqwen_cli(cli_args):
    """Run ``<current python> -m apps.colqwen_rag`` with *cli_args*.

    The command is passed to the OS as an argument list (no shell), so paths
    and queries containing spaces or shell metacharacters are forwarded
    verbatim. The current interpreter is used so the sub-command runs in the
    same environment whose dependencies were just checked.

    Args:
        cli_args: Sequence of string arguments placed after the module name.

    Returns:
        The process exit code (0 on success).

    Raises:
        OSError: If the interpreter process could not be started.
    """
    import shlex
    import subprocess
    import sys

    cmd = [sys.executable, "-m", "apps.colqwen_rag", *cli_args]
    # Echo a copy-pasteable form of the exact command being executed.
    print(f" Command: {' '.join(shlex.quote(part) for part in cmd)}")
    return subprocess.run(cmd, check=False).returncode


def main():
    """Run the ColQwen reproduction workflow for issue #119.

    Steps, in order: verify the working directory is the repository root,
    check that the required packages are importable, make sure a sample PDF
    exists under ``test_pdfs/`` (downloading it if needed), build a test
    index, run a few sample searches, and print follow-up instructions.

    Progress is reported on stdout. The function returns ``None`` in every
    case; it returns early when the repository check, the dependency check,
    the download, or the index build fails.
    """
    print("🧪 ColQwen Reproduction Test - Issue #119")
    print("=" * 50)

    # Check if we're in the right directory
    repo_root = Path.cwd()
    if not (repo_root / "apps" / "colqwen_rag.py").exists():
        print("❌ Please run this script from the LEANN repository root")
        print(" cd /path/to/LEANN && python test_colqwen_reproduction.py")
        return
    print("✅ Repository structure looks good")

    # Step 1: Check dependencies
    print("\n📦 Checking dependencies...")
    try:
        import torch

        # find_spec checks availability without paying the import cost.
        if importlib.util.find_spec("pdf2image") is None:
            raise ImportError("pdf2image not found")
        if importlib.util.find_spec("colpali_engine") is None:
            raise ImportError("colpali_engine not found")

        print("✅ Core dependencies available")
        print(f" - PyTorch: {torch.__version__}")
        print(f" - CUDA available: {torch.cuda.is_available()}")
        print(
            f" - MPS available: {hasattr(torch.backends, 'mps') and torch.backends.mps.is_available()}"
        )
    except ImportError as e:
        print(f"❌ Missing dependency: {e}")
        print("\n📥 Install missing dependencies:")
        print(
            " uv pip install colpali_engine pdf2image pillow matplotlib qwen_vl_utils einops seaborn"
        )
        return

    # Step 2: Download sample PDF
    print("\n📄 Setting up sample PDF...")
    pdf_dir = repo_root / "test_pdfs"
    pdf_dir.mkdir(exist_ok=True)
    sample_pdf = pdf_dir / "attention_paper.pdf"
    if not sample_pdf.exists():
        print("📥 Downloading sample paper (Attention Is All You Need)...")
        import urllib.request

        try:
            urllib.request.urlretrieve("https://arxiv.org/pdf/1706.03762.pdf", sample_pdf)
            print(f"✅ Downloaded: {sample_pdf}")
        except Exception as e:
            # Remove any partially written file; otherwise the next run would
            # take the "existing PDF" branch and index a truncated document.
            if sample_pdf.exists():
                sample_pdf.unlink()
            print(f"❌ Download failed: {e}")
            print(" Please manually download a PDF to test_pdfs/attention_paper.pdf")
            return
    else:
        print(f"✅ Using existing PDF: {sample_pdf}")

    # Step 3: Test ColQwen RAG
    print("\n🚀 Testing ColQwen RAG...")

    # Build index
    print("\n1⃣ Building multimodal index...")
    try:
        build_status = _run_colqwen_cli(
            [
                "build",
                "--pdfs",
                str(pdf_dir),
                "--index",
                "test_attention",
                "--model",
                "colqwen2",
                "--pages-dir",
                "test_pages",
            ]
        )
    except OSError as e:
        print(f"❌ Error building index: {e}")
        return
    if build_status != 0:
        print("❌ Index building failed")
        return
    print("✅ Index built successfully!")

    # Test search
    print("\n2⃣ Testing search...")
    test_queries = [
        "How does attention mechanism work?",
        "What is the transformer architecture?",
        "How do you compute self-attention?",
    ]
    for query in test_queries:
        print(f"\n🔍 Query: '{query}'")
        try:
            search_status = _run_colqwen_cli(
                ["search", "test_attention", query, "--top-k", "3"]
            )
        except OSError as e:
            print(f"❌ Search error: {e}")
            continue
        # A failed search is reported but does not abort the remaining queries.
        if search_status == 0:
            print("✅ Search completed")
        else:
            print("❌ Search failed")

    # Test interactive mode (briefly): only the command is shown, since an
    # interactive session cannot be driven from this script.
    print("\n3⃣ Testing interactive mode...")
    print(" You can test interactive mode with:")
    print(" python -m apps.colqwen_rag ask test_attention --interactive")

    # Step 4: Test similarity maps (using existing script)
    print("\n4⃣ Testing similarity maps...")
    similarity_script = (
        repo_root
        / "apps"
        / "multimodal"
        / "vision-based-pdf-multi-vector"
        / "multi-vector-leann-similarity-map.py"
    )
    if similarity_script.exists():
        print(" You can generate similarity maps with:")
        print(f" cd {similarity_script.parent}")
        print(" python multi-vector-leann-similarity-map.py")
        print(" (Edit the script to use your local PDF)")

    print("\n🎉 ColQwen reproduction test completed!")
    print("\n📋 Summary:")
    print(" ✅ Dependencies checked")
    print(" ✅ Sample PDF prepared")
    print(" ✅ Index building tested")
    print(" ✅ Search functionality tested")
    print(" ✅ Interactive mode available")
    print(" ✅ Similarity maps available")
    print("\n🔗 Related repositories to check:")
    print(" - https://github.com/lightonai/fast-plaid")
    print(" - https://github.com/lightonai/pylate")
    print(" - https://github.com/stanford-futuredata/ColBERT")
    print("\n📝 Next steps:")
    print(" 1. Test with your own PDFs")
    print(" 2. Experiment with different queries")
    print(" 3. Generate similarity maps for visual analysis")
    print(" 4. Compare ColQwen2 vs ColPali performance")


if __name__ == "__main__":
    main()