merge main

2025-07-06 00:50:58 +00:00
parent e92deee1e8 910927a405
commit df63526503
12 changed files with 1082 additions and 221 deletions
--- a/examples/main_cli_example.py
+++ b/examples/main_cli_example.py
@@ -1,3 +1,6 @@
+import faulthandler
+faulthandler.enable()
+
 from llama_index.core import SimpleDirectoryReader, Settings
 from llama_index.core.readers.base import BaseReader
 from llama_index.node_parser.docling import DoclingNodeParser
@@ -7,7 +10,7 @@ import asyncio
 import os
 import dotenv
 from leann.api import LeannBuilder, LeannSearcher, LeannChat
-import leann_backend_diskann # Import to ensure backend registration
+import leann_backend_hnsw # Import to ensure backend registration
 import shutil
 from pathlib import Path

@@ -21,7 +24,7 @@ file_extractor: dict[str, BaseReader] = {
    ".xlsx": reader,
 }
 node_parser = DoclingNodeParser(
-    chunker=HybridChunker(tokenizer="Qwen/Qwen3-Embedding-4B", max_tokens=256)
+    chunker=HybridChunker(tokenizer="Qwen/Qwen3-Embedding-4B", max_tokens=64)
 )
 print("Loading documents...")
 documents = SimpleDirectoryReader(
@@ -32,10 +35,8 @@ documents = SimpleDirectoryReader(
    required_exts=[".pdf", ".docx", ".pptx", ".xlsx"]
 ).load_data(show_progress=True)
 print("Documents loaded.")
-# Extract text from documents and prepare for Leann
 all_texts = []
 for doc in documents:
-    # DoclingNodeParser returns Node objects, which have a text attribute
    nodes = node_parser.get_nodes_from_documents([doc])
    for node in nodes:
        all_texts.append(node.text)
@@ -43,32 +44,35 @@ for doc in documents:
 INDEX_DIR = Path("./test_pdf_index")
 INDEX_PATH = str(INDEX_DIR / "pdf_documents.leann")

-if INDEX_DIR.exists():
-    print(f"--- Cleaning up old index directory: {INDEX_DIR} ---")
-    shutil.rmtree(INDEX_DIR)
-
-print(f"\n[PHASE 1] Building Leann index...")
-
-builder = LeannBuilder(
-    backend_name="diskann",
-    embedding_model="facebook/contriever", # Using a common sentence transformer model
-    graph_degree=32, 
-    complexity=64
-)
-
-print(f"Loaded {len(all_texts)} text chunks from documents.")
-for chunk_text in all_texts:
-    builder.add_text(chunk_text)
+if not INDEX_DIR.exists():
+    print(f"--- Index directory not found, building new index ---")
    
-builder.build_index(INDEX_PATH)
-print(f"\nLeann index built at {INDEX_PATH}!")
+    print(f"\n[PHASE 1] Building Leann index...")
+
+    # Build the HNSW index in compact CSR mode (is_compact) with embedding recompute enabled (is_recompute)
+    builder = LeannBuilder(
+        backend_name="hnsw",
+        embedding_model="facebook/contriever",
+        graph_degree=32, 
+        complexity=64,
+        is_compact=True,
+        is_recompute=True
+    )
+
+    print(f"Loaded {len(all_texts)} text chunks from documents.")
+    for chunk_text in all_texts:
+        builder.add_text(chunk_text)
+        
+    builder.build_index(INDEX_PATH)
+    print(f"\nLeann index built at {INDEX_PATH}!")
+else:
+    print(f"--- Using existing index at {INDEX_DIR} ---")

 async def main():
    print(f"\n[PHASE 2] Starting Leann chat session...")
    chat = LeannChat(index_path=INDEX_PATH)
    
    query = "Based on the paper, what are the main techniques LEANN explores to reduce the storage overhead and DLPM explore to achieve Fairness and Efiiciency trade-off?"
-    # query = "What is the Off-policy training in RL?"
    print(f"You: {query}")
    chat_response = chat.ask(query, top_k=20, recompute_beighbor_embeddings=True,embedding_model="facebook/contriever")
    print(f"Leann: {chat_response}")
--- a/examples/simple_demo.py
+++ b/examples/simple_demo.py
@@ -3,11 +3,17 @@ Simple demo showing basic leann usage
 Run: uv run python examples/simple_demo.py
 """

+import argparse
 from leann import LeannBuilder, LeannSearcher, LeannChat


 def main():
-    print("=== Leann Simple Demo ===")
+    parser = argparse.ArgumentParser(description="Simple demo of Leann with selectable embedding models.")
+    parser.add_argument("--embedding_model", type=str, default="sentence-transformers/all-mpnet-base-v2",
+                        help="The embedding model to use, e.g., 'sentence-transformers/all-mpnet-base-v2' or 'text-embedding-ada-002'.")
+    args = parser.parse_args()
+
+    print(f"=== Leann Simple Demo with {args.embedding_model} ===")
    print()
    
    # Sample knowledge base
@@ -24,10 +30,11 @@ def main():
    
    print("1. Building index (no embeddings stored)...")
    builder = LeannBuilder(
-        embedding_model="sentence-transformers/all-mpnet-base-v2",
-        prune_ratio=0.7,  # Keep 30% of connections
+        embedding_model=args.embedding_model,
+        backend_name="hnsw",
    )
-    builder.add_chunks(chunks)
+    for chunk in chunks:
+        builder.add_text(chunk)
    builder.build_index("demo_knowledge.leann")
    print()
    
@@ -49,14 +56,7 @@ def main():
            print(f"     Text: {result.text[:100]}...")
        print()
    
-    print("3. Memory stats:")
-    stats = searcher.get_memory_stats()
-    print(f"   Cache size: {stats.embedding_cache_size}")
-    print(f"   Cache memory: {stats.embedding_cache_memory_mb:.1f} MB") 
-    print(f"   Total chunks: {stats.total_chunks}")
-    print()
-    
-    print("4. Interactive chat demo:")
+    print("3. Interactive chat demo:")
    print("   (Note: Requires OpenAI API key for real responses)")
    
    chat = LeannChat("demo_knowledge.leann")