merge main
This commit is contained in:
@@ -1,3 +1,6 @@
|
||||
import faulthandler
|
||||
faulthandler.enable()
|
||||
|
||||
from llama_index.core import SimpleDirectoryReader, Settings
|
||||
from llama_index.core.readers.base import BaseReader
|
||||
from llama_index.node_parser.docling import DoclingNodeParser
|
||||
@@ -7,7 +10,7 @@ import asyncio
|
||||
import os
|
||||
import dotenv
|
||||
from leann.api import LeannBuilder, LeannSearcher, LeannChat
|
||||
import leann_backend_diskann # Import to ensure backend registration
|
||||
import leann_backend_hnsw # Import to ensure backend registration
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
|
||||
@@ -21,7 +24,7 @@ file_extractor: dict[str, BaseReader] = {
|
||||
".xlsx": reader,
|
||||
}
|
||||
node_parser = DoclingNodeParser(
|
||||
chunker=HybridChunker(tokenizer="Qwen/Qwen3-Embedding-4B", max_tokens=256)
|
||||
chunker=HybridChunker(tokenizer="Qwen/Qwen3-Embedding-4B", max_tokens=64)
|
||||
)
|
||||
print("Loading documents...")
|
||||
documents = SimpleDirectoryReader(
|
||||
@@ -32,10 +35,8 @@ documents = SimpleDirectoryReader(
|
||||
required_exts=[".pdf", ".docx", ".pptx", ".xlsx"]
|
||||
).load_data(show_progress=True)
|
||||
print("Documents loaded.")
|
||||
# Extract text from documents and prepare for Leann
|
||||
all_texts = []
|
||||
for doc in documents:
|
||||
# DoclingNodeParser returns Node objects, which have a text attribute
|
||||
nodes = node_parser.get_nodes_from_documents([doc])
|
||||
for node in nodes:
|
||||
all_texts.append(node.text)
|
||||
@@ -43,32 +44,35 @@ for doc in documents:
|
||||
INDEX_DIR = Path("./test_pdf_index")
|
||||
INDEX_PATH = str(INDEX_DIR / "pdf_documents.leann")
|
||||
|
||||
if INDEX_DIR.exists():
|
||||
print(f"--- Cleaning up old index directory: {INDEX_DIR} ---")
|
||||
shutil.rmtree(INDEX_DIR)
|
||||
|
||||
print(f"\n[PHASE 1] Building Leann index...")
|
||||
|
||||
builder = LeannBuilder(
|
||||
backend_name="diskann",
|
||||
embedding_model="facebook/contriever", # Using a common sentence transformer model
|
||||
graph_degree=32,
|
||||
complexity=64
|
||||
)
|
||||
|
||||
print(f"Loaded {len(all_texts)} text chunks from documents.")
|
||||
for chunk_text in all_texts:
|
||||
builder.add_text(chunk_text)
|
||||
if not INDEX_DIR.exists():
|
||||
print(f"--- Index directory not found, building new index ---")
|
||||
|
||||
builder.build_index(INDEX_PATH)
|
||||
print(f"\nLeann index built at {INDEX_PATH}!")
|
||||
print(f"\n[PHASE 1] Building Leann index...")
|
||||
|
||||
# CSR compact mode with recompute
|
||||
builder = LeannBuilder(
|
||||
backend_name="hnsw",
|
||||
embedding_model="facebook/contriever",
|
||||
graph_degree=32,
|
||||
complexity=64,
|
||||
is_compact=True,
|
||||
is_recompute=True
|
||||
)
|
||||
|
||||
print(f"Loaded {len(all_texts)} text chunks from documents.")
|
||||
for chunk_text in all_texts:
|
||||
builder.add_text(chunk_text)
|
||||
|
||||
builder.build_index(INDEX_PATH)
|
||||
print(f"\nLeann index built at {INDEX_PATH}!")
|
||||
else:
|
||||
print(f"--- Using existing index at {INDEX_DIR} ---")
|
||||
|
||||
async def main():
|
||||
print(f"\n[PHASE 2] Starting Leann chat session...")
|
||||
chat = LeannChat(index_path=INDEX_PATH)
|
||||
|
||||
query = "Based on the paper, what are the main techniques LEANN explores to reduce the storage overhead and DLPM explore to achieve Fairness and Efiiciency trade-off?"
|
||||
# query = "What is the Off-policy training in RL?"
|
||||
print(f"You: {query}")
|
||||
chat_response = chat.ask(query, top_k=20, recompute_beighbor_embeddings=True,embedding_model="facebook/contriever")
|
||||
print(f"Leann: {chat_response}")
|
||||
|
||||
@@ -3,11 +3,17 @@ Simple demo showing basic leann usage
|
||||
Run: uv run python examples/simple_demo.py
|
||||
"""
|
||||
|
||||
import argparse
|
||||
from leann import LeannBuilder, LeannSearcher, LeannChat
|
||||
|
||||
|
||||
def main():
|
||||
print("=== Leann Simple Demo ===")
|
||||
parser = argparse.ArgumentParser(description="Simple demo of Leann with selectable embedding models.")
|
||||
parser.add_argument("--embedding_model", type=str, default="sentence-transformers/all-mpnet-base-v2",
|
||||
help="The embedding model to use, e.g., 'sentence-transformers/all-mpnet-base-v2' or 'text-embedding-ada-002'.")
|
||||
args = parser.parse_args()
|
||||
|
||||
print(f"=== Leann Simple Demo with {args.embedding_model} ===")
|
||||
print()
|
||||
|
||||
# Sample knowledge base
|
||||
@@ -24,10 +30,11 @@ def main():
|
||||
|
||||
print("1. Building index (no embeddings stored)...")
|
||||
builder = LeannBuilder(
|
||||
embedding_model="sentence-transformers/all-mpnet-base-v2",
|
||||
prune_ratio=0.7, # Keep 30% of connections
|
||||
embedding_model=args.embedding_model,
|
||||
backend_name="hnsw",
|
||||
)
|
||||
builder.add_chunks(chunks)
|
||||
for chunk in chunks:
|
||||
builder.add_text(chunk)
|
||||
builder.build_index("demo_knowledge.leann")
|
||||
print()
|
||||
|
||||
@@ -49,14 +56,7 @@ def main():
|
||||
print(f" Text: {result.text[:100]}...")
|
||||
print()
|
||||
|
||||
print("3. Memory stats:")
|
||||
stats = searcher.get_memory_stats()
|
||||
print(f" Cache size: {stats.embedding_cache_size}")
|
||||
print(f" Cache memory: {stats.embedding_cache_memory_mb:.1f} MB")
|
||||
print(f" Total chunks: {stats.total_chunks}")
|
||||
print()
|
||||
|
||||
print("4. Interactive chat demo:")
|
||||
print("3. Interactive chat demo:")
|
||||
print(" (Note: Requires OpenAI API key for real responses)")
|
||||
|
||||
chat = LeannChat("demo_knowledge.leann")
|
||||
|
||||
Reference in New Issue
Block a user