Fix: Prevent duplicate PDF processing when using --file-types .pdf

Fixes #175

Problem: When --file-types .pdf is specified, PDFs were being processed twice:
1. Separately, with the PyMuPDF/pdfplumber extractors
2. Again in the 'other file types' section, via SimpleDirectoryReader

This caused duplicate processing and potential conflicts.

Solution:
- Exclude .pdf from other_file_extensions when PDFs are already processed separately
- Only load other file types if there are extensions to process
- Prevents duplicate PDF processing

Changes:
- Added logic to filter out .pdf from code_extensions when loading other file types if PDFs were processed separately
- Updated the SimpleDirectoryReader call to use the filtered extensions (required_exts=other_file_extensions)
- Added a check to skip loading if there are no other extensions to process
2025-11-25 20:23:01 -05:00
parent 13beb98164
commit 2afcdf7b77
1 changed file with 18 additions and 9 deletions
--- a/packages/leann-core/src/leann/cli.py
+++ b/packages/leann-core/src/leann/cli.py
@@ -1162,6 +1162,11 @@ Examples:
                            print(f"Warning: Could not process {file_path}: {e}")
            # Load other file types with default reader
+            # Exclude PDFs from code_extensions if they were already processed separately
+            other_file_extensions = code_extensions
+            if should_process_pdfs and ".pdf" in code_extensions:
+                other_file_extensions = [ext for ext in code_extensions if ext != ".pdf"]
            try:
                # Create a custom file filter function using our PathSpec
                def file_filter(
@@ -1177,15 +1182,19 @@ Examples:
                    except (ValueError, OSError):
                        return True  # Include files that can't be processed
-                other_docs = SimpleDirectoryReader(
-                    docs_dir,
-                    recursive=True,
-                    encoding="utf-8",
-                    required_exts=code_extensions,
-                    file_extractor={},  # Use default extractors
-                    exclude_hidden=not include_hidden,
-                    filename_as_id=True,
-                ).load_data(show_progress=True)
+                # Only load other file types if there are extensions to process
+                if other_file_extensions:
+                    other_docs = SimpleDirectoryReader(
+                        docs_dir,
+                        recursive=True,
+                        encoding="utf-8",
+                        required_exts=other_file_extensions,
+                        file_extractor={},  # Use default extractors
+                        exclude_hidden=not include_hidden,
+                        filename_as_id=True,
+                    ).load_data(show_progress=True)
+                else:
+                    other_docs = []
                # Filter documents after loading based on gitignore rules
                filtered_docs = []