From 53c58fa7559fbf29171cd739b87b0ca4b2dd2d5c Mon Sep 17 00:00:00 2001 From: Andy Lee Date: Sun, 13 Jul 2025 17:04:06 -0700 Subject: [PATCH] perf: switch to traditional pdf reader --- examples/main_cli_example.py | 26 ++++++++------------------ 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/examples/main_cli_example.py b/examples/main_cli_example.py index fc87cfc..77b0bec 100644 --- a/examples/main_cli_example.py +++ b/examples/main_cli_example.py @@ -3,10 +3,7 @@ faulthandler.enable() import argparse from llama_index.core import SimpleDirectoryReader, Settings -from llama_index.core.readers.base import BaseReader -from llama_index.node_parser.docling import DoclingNodeParser -from llama_index.readers.docling import DoclingReader -from docling_core.transforms.chunker.hybrid_chunker import HybridChunker +from llama_index.core.node_parser import SentenceSplitter import asyncio import dotenv from leann.api import LeannBuilder, LeannSearcher, LeannChat @@ -15,25 +12,18 @@ from pathlib import Path dotenv.load_dotenv() -reader = DoclingReader(export_type=DoclingReader.ExportType.JSON) -file_extractor: dict[str, BaseReader] = { - ".docx": reader, - ".pptx": reader, - ".pdf": reader, - ".xlsx": reader, - ".txt": reader, - ".md": reader, -} -node_parser = DoclingNodeParser( - chunker=HybridChunker(tokenizer="facebook/contriever", max_tokens=128) +node_parser = SentenceSplitter( + chunk_size=256, + chunk_overlap=20, + separator=" ", + paragraph_separator="\n\n" ) print("Loading documents...") documents = SimpleDirectoryReader( "examples/data", - recursive=True, - file_extractor=file_extractor, + recursive=True, encoding="utf-8", - required_exts=[".pdf", ".docx", ".pptx", ".xlsx", ".txt", ".md"] + required_exts=[".pdf", ".txt", ".md"] ).load_data(show_progress=True) print("Documents loaded.") all_texts = []