diff --git a/apps/multimodal/vision-based-pdf-multi-vector/leann_multi_vector.py b/apps/multimodal/vision-based-pdf-multi-vector/leann_multi_vector.py index 2ea933f..075bb3d 100644 --- a/apps/multimodal/vision-based-pdf-multi-vector/leann_multi_vector.py +++ b/apps/multimodal/vision-based-pdf-multi-vector/leann_multi_vector.py @@ -152,20 +152,65 @@ def _select_device_and_dtype(): def _load_colvision(model_choice: str): import torch - from colpali_engine.models import ColPali, ColQwen2, ColQwen2Processor + from colpali_engine.models import ( + ColPali, + ColQwen2, + ColQwen2_5, + ColQwen2_5_Processor, + ColQwen2Processor, + ) from colpali_engine.models.paligemma.colpali.processing_colpali import ColPaliProcessor from transformers.utils.import_utils import is_flash_attn_2_available device_str, device, dtype = _select_device_and_dtype() + # Determine model name and type + # IMPORTANT: Check colqwen2.5 BEFORE colqwen2 to avoid false matches + model_choice_lower = model_choice.lower() if model_choice == "colqwen2": model_name = "vidore/colqwen2-v1.0" - # On CPU/MPS we must avoid flash-attn and stay eager; on CUDA prefer flash-attn if available - attn_implementation = ( - "flash_attention_2" - if (device_str == "cuda" and is_flash_attn_2_available()) - else "eager" - ) + model_type = "colqwen2" + elif model_choice == "colqwen2.5" or model_choice == "colqwen25": + model_name = "vidore/colqwen2.5-v0.2" + model_type = "colqwen2.5" + elif model_choice == "colpali": + model_name = "vidore/colpali-v1.2" + model_type = "colpali" + elif ( + "colqwen2.5" in model_choice_lower + or "colqwen25" in model_choice_lower + or "colqwen2_5" in model_choice_lower + ): + # Handle HuggingFace model names like "vidore/colqwen2.5-v0.2" + model_name = model_choice + model_type = "colqwen2.5" + elif "colqwen2" in model_choice_lower and "colqwen2-v1.0" in model_choice_lower: + # Handle HuggingFace model names like "vidore/colqwen2-v1.0" (but not colqwen2.5) + model_name = model_choice + model_type = "colqwen2" + elif "colpali" in model_choice_lower: + # Handle HuggingFace model names like "vidore/colpali-v1.2" + model_name = model_choice + model_type = "colpali" + else: + # Default to colpali for backward compatibility + model_name = "vidore/colpali-v1.2" + model_type = "colpali" + + # Load model based on type + attn_implementation = ( + "flash_attention_2" if (device_str == "cuda" and is_flash_attn_2_available()) else "eager" + ) + + if model_type == "colqwen2.5": + model = ColQwen2_5.from_pretrained( + model_name, + torch_dtype=torch.bfloat16, + device_map=device, + attn_implementation=attn_implementation, + ).eval() + processor = ColQwen2_5_Processor.from_pretrained(model_name) + elif model_type == "colqwen2": model = ColQwen2.from_pretrained( model_name, torch_dtype=torch.bfloat16, @@ -173,8 +218,7 @@ def _load_colvision(model_choice: str): attn_implementation=attn_implementation, ).eval() processor = ColQwen2Processor.from_pretrained(model_name) - else: - model_name = "vidore/colpali-v1.2" + else: # colpali model = ColPali.from_pretrained( model_name, torch_dtype=torch.bfloat16, diff --git a/apps/multimodal/vision-based-pdf-multi-vector/vidore_v1_benchmark.py b/apps/multimodal/vision-based-pdf-multi-vector/vidore_v1_benchmark.py index e68a689..79472df 100644 --- a/apps/multimodal/vision-based-pdf-multi-vector/vidore_v1_benchmark.py +++ b/apps/multimodal/vision-based-pdf-multi-vector/vidore_v1_benchmark.py @@ -90,6 +90,51 @@ VIDORE_V1_TASKS = { }, } +# Task name aliases (short names -> full names) +TASK_ALIASES = { + "arxivqa": "VidoreArxivQARetrieval", + "docvqa": "VidoreDocVQARetrieval", + "infovqa": "VidoreInfoVQARetrieval", + "tabfquad": "VidoreTabfquadRetrieval", + "tatdqa": "VidoreTatdqaRetrieval", + "shiftproject": "VidoreShiftProjectRetrieval", + "syntheticdocqa_ai": "VidoreSyntheticDocQAAIRetrieval", + "syntheticdocqa_energy": "VidoreSyntheticDocQAEnergyRetrieval", + "syntheticdocqa_government": "VidoreSyntheticDocQAGovernmentReportsRetrieval", + "syntheticdocqa_healthcare": "VidoreSyntheticDocQAHealthcareIndustryRetrieval", +} + + +def normalize_task_name(task_name: str) -> str: + """Normalize task name (handle aliases).""" + task_name_lower = task_name.lower() + if task_name in VIDORE_V1_TASKS: + return task_name + if task_name_lower in TASK_ALIASES: + return TASK_ALIASES[task_name_lower] + # Try partial match + for alias, full_name in TASK_ALIASES.items(): + if alias in task_name_lower or task_name_lower in alias: + return full_name + return task_name + + +def get_safe_model_name(model_name: str) -> str: + """Get a safe model name for use in file paths.""" + import hashlib + import os + + # If it's a path, use basename or hash + if os.path.exists(model_name) and os.path.isdir(model_name): + # Use basename if it's reasonable, otherwise use hash + basename = os.path.basename(model_name.rstrip("/")) + if basename and len(basename) < 100 and not basename.startswith("."): + return basename + # Use hash for very long or problematic paths + return hashlib.md5(model_name.encode()).hexdigest()[:16] + # For HuggingFace model names, replace / with _ + return model_name.replace("/", "_").replace(":", "_") + def load_vidore_v1_data( dataset_path: str, @@ -181,6 +226,9 @@ def evaluate_task( print(f"Evaluating task: {task_name}") print(f"{'=' * 80}") + # Normalize task name (handle aliases) + task_name = normalize_task_name(task_name) + # Get task config if task_name not in VIDORE_V1_TASKS: raise ValueError(f"Unknown task: {task_name}. Available: {list(VIDORE_V1_TASKS.keys())}") @@ -223,11 +271,13 @@ def evaluate_task( ) # Build or load index + # Use safe model name for index path (different models need different indexes) + safe_model_name = get_safe_model_name(model_name) index_path_full = index_path if not use_fast_plaid else fast_plaid_index_path if index_path_full is None: - index_path_full = f"./indexes/{task_name}_{model_name}" + index_path_full = f"./indexes/{task_name}_{safe_model_name}" if use_fast_plaid: - index_path_full = f"./indexes/{task_name}_{model_name}_fastplaid" + index_path_full = f"./indexes/{task_name}_{safe_model_name}_fastplaid" index_or_retriever, corpus_ids_ordered = evaluator.build_index_from_corpus( corpus=corpus, @@ -281,8 +331,7 @@ def main(): "--model", type=str, default="colqwen2", - choices=["colqwen2", "colpali"], - help="Model to use", + help="Model to use: 'colqwen2', 'colpali', or path to a model directory (supports LoRA adapters)", ) parser.add_argument( "--task", @@ -350,11 +399,11 @@ def main(): # Determine tasks to evaluate if args.task: - tasks_to_eval = [args.task] + tasks_to_eval = [normalize_task_name(args.task)] elif args.tasks.lower() == "all": tasks_to_eval = list(VIDORE_V1_TASKS.keys()) else: - tasks_to_eval = [t.strip() for t in args.tasks.split(",")] + tasks_to_eval = [normalize_task_name(t.strip()) for t in args.tasks.split(",")] print(f"Tasks to evaluate: {tasks_to_eval}")