From e268392d5b0c75dd3793ed24c4c2866b86ccd94e Mon Sep 17 00:00:00 2001 From: Aakash Suresh Date: Mon, 1 Dec 2025 13:48:44 -0800 Subject: [PATCH] Fix: Prevent duplicate PDF processing when using --file-types .pdf (#179) Fixes #175 Problem: When --file-types .pdf is specified, PDFs were being processed twice: 1. Separately with PyMuPDF/pdfplumber extractors 2. Again in the 'other file types' section via SimpleDirectoryReader This caused duplicate processing and potential conflicts. Solution: - Exclude .pdf from other_file_extensions when PDFs are already processed separately - Only load other file types if there are extensions to process - Prevents duplicate PDF processing Changes: - Added logic to filter out .pdf from code_extensions when loading other file types if PDFs were processed separately - Updated SimpleDirectoryReader to use filtered extensions - Added check to skip loading if no other extensions to process --- packages/leann-core/src/leann/cli.py | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/packages/leann-core/src/leann/cli.py b/packages/leann-core/src/leann/cli.py index 122ae6b..79b5e1d 100644 --- a/packages/leann-core/src/leann/cli.py +++ b/packages/leann-core/src/leann/cli.py @@ -1180,6 +1180,11 @@ Examples: print(f"Warning: Could not process {file_path}: {e}") # Load other file types with default reader + # Exclude PDFs from code_extensions if they were already processed separately + other_file_extensions = code_extensions + if should_process_pdfs and ".pdf" in code_extensions: + other_file_extensions = [ext for ext in code_extensions if ext != ".pdf"] + try: # Create a custom file filter function using our PathSpec def file_filter( @@ -1195,15 +1200,19 @@ Examples: except (ValueError, OSError): return True # Include files that can't be processed - other_docs = SimpleDirectoryReader( - docs_dir, - recursive=True, - encoding="utf-8", - required_exts=code_extensions, - file_extractor={}, # Use default extractors - exclude_hidden=not include_hidden, - filename_as_id=True, - ).load_data(show_progress=True) + # Only load other file types if there are extensions to process + if other_file_extensions: + other_docs = SimpleDirectoryReader( + docs_dir, + recursive=True, + encoding="utf-8", + required_exts=other_file_extensions, + file_extractor={}, # Use default extractors + exclude_hidden=not include_hidden, + filename_as_id=True, + ).load_data(show_progress=True) + else: + other_docs = [] # Filter documents after loading based on gitignore rules filtered_docs = []