perf: switch to traditional pdf reader

This commit is contained in:
Andy Lee
2025-07-13 17:04:06 -07:00
parent c69afb56e4
commit 53c58fa755

View File

@@ -3,10 +3,7 @@ faulthandler.enable()
import argparse import argparse
from llama_index.core import SimpleDirectoryReader, Settings from llama_index.core import SimpleDirectoryReader, Settings
from llama_index.core.readers.base import BaseReader from llama_index.core.node_parser import SentenceSplitter
from llama_index.node_parser.docling import DoclingNodeParser
from llama_index.readers.docling import DoclingReader
from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
import asyncio import asyncio
import dotenv import dotenv
from leann.api import LeannBuilder, LeannSearcher, LeannChat from leann.api import LeannBuilder, LeannSearcher, LeannChat
@@ -15,25 +12,18 @@ from pathlib import Path
dotenv.load_dotenv() dotenv.load_dotenv()
reader = DoclingReader(export_type=DoclingReader.ExportType.JSON) node_parser = SentenceSplitter(
file_extractor: dict[str, BaseReader] = { chunk_size=256,
".docx": reader, chunk_overlap=20,
".pptx": reader, separator=" ",
".pdf": reader, paragraph_separator="\n\n"
".xlsx": reader,
".txt": reader,
".md": reader,
}
node_parser = DoclingNodeParser(
chunker=HybridChunker(tokenizer="facebook/contriever", max_tokens=128)
) )
print("Loading documents...") print("Loading documents...")
documents = SimpleDirectoryReader( documents = SimpleDirectoryReader(
"examples/data", "examples/data",
recursive=True, recursive=True,
file_extractor=file_extractor,
encoding="utf-8", encoding="utf-8",
required_exts=[".pdf", ".docx", ".pptx", ".xlsx", ".txt", ".md"] required_exts=[".pdf", ".txt", ".md"]
).load_data(show_progress=True) ).load_data(show_progress=True)
print("Documents loaded.") print("Documents loaded.")
all_texts = [] all_texts = []