perf: switch to traditional pdf reader
This commit is contained in:
@@ -3,10 +3,7 @@ faulthandler.enable()
|
|||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
from llama_index.core import SimpleDirectoryReader, Settings
|
from llama_index.core import SimpleDirectoryReader, Settings
|
||||||
from llama_index.core.readers.base import BaseReader
|
from llama_index.core.node_parser import SentenceSplitter
|
||||||
from llama_index.node_parser.docling import DoclingNodeParser
|
|
||||||
from llama_index.readers.docling import DoclingReader
|
|
||||||
from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
|
|
||||||
import asyncio
|
import asyncio
|
||||||
import dotenv
|
import dotenv
|
||||||
from leann.api import LeannBuilder, LeannSearcher, LeannChat
|
from leann.api import LeannBuilder, LeannSearcher, LeannChat
|
||||||
@@ -15,25 +12,18 @@ from pathlib import Path
|
|||||||
|
|
||||||
dotenv.load_dotenv()
|
dotenv.load_dotenv()
|
||||||
|
|
||||||
reader = DoclingReader(export_type=DoclingReader.ExportType.JSON)
|
node_parser = SentenceSplitter(
|
||||||
file_extractor: dict[str, BaseReader] = {
|
chunk_size=256,
|
||||||
".docx": reader,
|
chunk_overlap=20,
|
||||||
".pptx": reader,
|
separator=" ",
|
||||||
".pdf": reader,
|
paragraph_separator="\n\n"
|
||||||
".xlsx": reader,
|
|
||||||
".txt": reader,
|
|
||||||
".md": reader,
|
|
||||||
}
|
|
||||||
node_parser = DoclingNodeParser(
|
|
||||||
chunker=HybridChunker(tokenizer="facebook/contriever", max_tokens=128)
|
|
||||||
)
|
)
|
||||||
print("Loading documents...")
|
print("Loading documents...")
|
||||||
documents = SimpleDirectoryReader(
|
documents = SimpleDirectoryReader(
|
||||||
"examples/data",
|
"examples/data",
|
||||||
recursive=True,
|
recursive=True,
|
||||||
file_extractor=file_extractor,
|
|
||||||
encoding="utf-8",
|
encoding="utf-8",
|
||||||
required_exts=[".pdf", ".docx", ".pptx", ".xlsx", ".txt", ".md"]
|
required_exts=[".pdf", ".txt", ".md"]
|
||||||
).load_data(show_progress=True)
|
).load_data(show_progress=True)
|
||||||
print("Documents loaded.")
|
print("Documents loaded.")
|
||||||
all_texts = []
|
all_texts = []
|
||||||
|
|||||||
Reference in New Issue
Block a user