Merge remote-tracking branch 'origin/main' into datastore-reproduce
@@ -8,7 +8,6 @@ from llama_index.node_parser.docling import DoclingNodeParser
 from llama_index.readers.docling import DoclingReader
 from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
 import asyncio
 import os
 import dotenv
 from leann.api import LeannBuilder, LeannSearcher, LeannChat
 import shutil
@@ -22,9 +21,11 @@ file_extractor: dict[str, BaseReader] = {
     ".pptx": reader,
     ".pdf": reader,
     ".xlsx": reader,
+    ".txt": reader,
+    ".md": reader,
 }
 node_parser = DoclingNodeParser(
-    chunker=HybridChunker(tokenizer="Qwen/Qwen3-Embedding-4B", max_tokens=64)
+    chunker=HybridChunker(tokenizer="facebook/contriever", max_tokens=128)
 )
 print("Loading documents...")
 documents = SimpleDirectoryReader(
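The chunker change above swaps the tokenizer to facebook/contriever and raises max_tokens to 128, matching the embedding model that LeannBuilder uses later in the script, so chunk sizes are measured in the same tokens the embedder actually consumes. A minimal standalone sketch of the new configuration, assuming docling-core is installed:

    from docling_core.transforms.chunker.hybrid_chunker import HybridChunker

    # Size chunks with the same tokenizer as the index's embedding model,
    # capping each chunk at 128 contriever tokens (previously 64 Qwen tokens).
    chunker = HybridChunker(tokenizer="facebook/contriever", max_tokens=128)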
@@ -32,7 +33,7 @@ documents = SimpleDirectoryReader(
     recursive=True,
     file_extractor=file_extractor,
     encoding="utf-8",
-    required_exts=[".pdf", ".docx", ".pptx", ".xlsx"]
+    required_exts=[".pdf", ".docx", ".pptx", ".xlsx", ".txt", ".md"]
 ).load_data(show_progress=True)
 print("Documents loaded.")
 all_texts = []
@@ -41,7 +42,7 @@ for doc in documents:
     for node in nodes:
         all_texts.append(node.get_content())

-INDEX_DIR = Path("./test_pdf_index")
+INDEX_DIR = Path("./test_pdf_index_pangu_test")
 INDEX_PATH = str(INDEX_DIR / "pdf_documents.leann")

 if not INDEX_DIR.exists():
@@ -49,14 +50,15 @@ if not INDEX_DIR.exists():

     print(f"\n[PHASE 1] Building Leann index...")

-    # CSR compact mode with recompute
+    # Use HNSW backend for better macOS compatibility
     builder = LeannBuilder(
         backend_name="hnsw",
         embedding_model="facebook/contriever",
         graph_degree=32,
         complexity=64,
         is_compact=True,
-        is_recompute=True
+        is_recompute=True,
+        num_threads=1  # Force single-threaded mode
     )

     print(f"Loaded {len(all_texts)} text chunks from documents.")
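The builder this hunk reconfigures targets the HNSW backend with compact (CSR) storage, search-time recomputation, and a single worker thread. A sketch of how the rest of the script presumably drives it; add_text and build_index are assumed LEANN API names not shown in this hunk, and the placeholder values stand in for the script's variables:

    from leann.api import LeannBuilder

    INDEX_PATH = "./test_pdf_index_pangu_test/pdf_documents.leann"  # as in the script
    all_texts = ["example chunk of extracted document text"]        # stands in for the parsed nodes

    builder = LeannBuilder(
        backend_name="hnsw",                    # HNSW graph index (better macOS compatibility)
        embedding_model="facebook/contriever",  # matches the chunker's tokenizer
        graph_degree=32,                        # max edges per graph node
        complexity=64,                          # build-time candidate-queue size
        is_compact=True,                        # CSR compact storage
        is_recompute=True,                      # recompute embeddings rather than storing them all
        num_threads=1,                          # force single-threaded mode
    )
    for text in all_texts:
        builder.add_text(text)       # assumed API: register each chunk
    builder.build_index(INDEX_PATH)  # assumed API: write the .leann index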
@@ -80,14 +82,17 @@ async def main(args):
     chat = LeannChat(index_path=INDEX_PATH, llm_config=llm_config)

     query = "Based on the paper, what are the main techniques LEANN explores to reduce the storage overhead and DLPM explores to achieve the fairness and efficiency trade-off?"
     query = "What is the main idea of RL, and give me 5 examples of classic RL algorithms?"
+    query = "什么是盘古大模型以及盘古开发过程中遇到了什么阴暗面,任务令一般在什么城市颁发"

     print(f"You: {query}")
-    chat_response = chat.ask(query, top_k=3, recompute_beighbor_embeddings=True)
+    chat_response = chat.ask(query, top_k=20, recompute_beighbor_embeddings=True, complexity=32, beam_width=1)
     print(f"Leann: {chat_response}")

 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Run Leann Chat with various LLM backends.")
-    parser.add_argument("--llm", type=str, default="hf", choices=["simulated", "ollama", "hf"], help="The LLM backend to use.")
-    parser.add_argument("--model", type=str, default='meta-llama/Llama-3.2-3B-Instruct', help="The model name to use (e.g., 'llama3:8b' for ollama, 'deepseek-ai/deepseek-llm-7b-chat' for hf).")
+    parser.add_argument("--llm", type=str, default="hf", choices=["simulated", "ollama", "hf", "openai"], help="The LLM backend to use.")
+    parser.add_argument("--model", type=str, default='meta-llama/Llama-3.2-3B-Instruct', help="The model name to use (e.g., 'llama3:8b' for ollama, 'deepseek-ai/deepseek-llm-7b-chat' for hf, 'gpt-4o' for openai).")
     parser.add_argument("--host", type=str, default="http://localhost:11434", help="The host for the Ollama API.")
     args = parser.parse_args()
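The new default query is in Chinese, roughly: "What is the Pangu large model, what dark sides were encountered during Pangu's development, and in which city are task orders usually issued?" The retrieval settings widen the candidate pool (top_k raised from 3 to 20) while bounding search effort with complexity=32 and beam_width=1. A minimal sketch of the resulting call path; the llm_config shape for the new openai backend is an assumption (the diff only adds the flag choices), and the recompute_beighbor_embeddings spelling is kept verbatim from the script's API call:

    from leann.api import LeannChat

    INDEX_PATH = "./test_pdf_index_pangu_test/pdf_documents.leann"  # as in the script

    # Hypothetical config shape for the new backend; only the --llm/--model
    # choices ("openai", "gpt-4o") are confirmed by the diff above.
    llm_config = {"type": "openai", "model": "gpt-4o"}

    chat = LeannChat(index_path=INDEX_PATH, llm_config=llm_config)
    response = chat.ask(
        "什么是盘古大模型",                   # placeholder user query
        top_k=20,                            # return 20 candidates (was 3)
        recompute_beighbor_embeddings=True,  # spelling as in the script
        complexity=32,                       # search-time queue size
        beam_width=1,                        # expand one path per hop
    )
    print(response)

From the command line this corresponds to invoking the script with something like --llm openai --model gpt-4o.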