Initial commit

2025-06-30 09:05:05 +00:00
commit 46f6cc100b
1231 changed files with 278432 additions and 0 deletions
@@ -0,0 +1,146 @@
+#!/usr/bin/env python3
+"""
+Document search demo with recompute mode
+"""
+
+import os
+from pathlib import Path
+import shutil
+import time
+
+# Import backend packages to trigger plugin registration
+try:
+    import leann_backend_diskann
+    import leann_backend_hnsw
+    print("INFO: Backend packages imported successfully.")
+except ImportError as e:
+    print(f"WARNING: Could not import backend packages. Error: {e}")
+
+# Import upper-level API from leann-core
+from leann.api import LeannBuilder, LeannSearcher, LeannChat
+
+
+def load_sample_documents():
+    """Return the small in-memory corpus used by this demo.
+
+    Returns:
+        A new list of dicts, each with a "title" and a "content" string.
+    """
+    title_content_pairs = (
+        ("Intro to Python", "Python is a high-level, interpreted language known for simplicity."),
+        ("ML Basics", "Machine learning builds systems that learn from data."),
+        ("Data Structures", "Data structures like arrays, lists, and graphs organize data."),
+    )
+    return [
+        {"title": title, "content": content}
+        for title, content in title_content_pairs
+    ]
+
+def _print_hits(hits):
+    """Print one numbered line (id, score, text, metadata) per search hit."""
+    for rank, hit in enumerate(hits, 1):
+        print(f"  {rank}. ID: {hit['id']}, Score: {hit['score']:.4f}, Text: '{hit['text']}', Metadata: {hit['metadata']}")
+
+
+def main():
+    """Run the end-to-end demo: build an index, search it twice, then chat.
+
+    Phases:
+      1. Delete any existing ``./test_indices`` directory, then build an
+         index from the sample documents with the "diskann" backend.
+      2. Run a plain search for a fixed query and time it.
+      3. Run the same search again with the recompute keyword arguments,
+         then print both timings and the per-position score difference.
+         Any exception in this phase is reported and the demo continues.
+      4. Ask the same query through ``LeannChat`` and print the answer.
+    """
+    print("==========================================================")
+    print("=== Leann Document Search Demo (DiskANN + Recompute) ===")
+    print("==========================================================")
+
+    INDEX_DIR = Path("./test_indices")
+    INDEX_PATH = str(INDEX_DIR / "documents.diskann")
+    BACKEND_TO_TEST = "diskann"
+
+    # Start from a clean slate so a stale index cannot influence the run.
+    if INDEX_DIR.exists():
+        print(f"--- Cleaning up old index directory: {INDEX_DIR} ---")
+        shutil.rmtree(INDEX_DIR)
+
+    # --- 1. Build index ---
+    print(f"\n[PHASE 1] Building index using '{BACKEND_TO_TEST}' backend...")
+
+    builder = LeannBuilder(
+        backend_name=BACKEND_TO_TEST,
+        graph_degree=32,
+        complexity=64,
+    )
+
+    documents = load_sample_documents()
+    print(f"Loaded {len(documents)} sample documents.")
+    for doc in documents:
+        builder.add_text(doc["content"], metadata={"title": doc["title"]})
+
+    builder.build_index(INDEX_PATH)
+    print("\nIndex built!")
+
+    # --- 2. Basic search demo ---
+    print(f"\n[PHASE 2] Basic search using '{BACKEND_TO_TEST}' backend...")
+    searcher = LeannSearcher(index_path=INDEX_PATH)
+
+    query = "What is machine learning?"
+    print(f"\nQuery: '{query}'")
+
+    print("\n--- Basic search mode (PQ computation) ---")
+    # perf_counter is monotonic and high-resolution, so the elapsed value
+    # cannot be skewed by system clock adjustments the way time.time() can.
+    start_time = time.perf_counter()
+    results = searcher.search(query, top_k=2)
+    basic_time = time.perf_counter() - start_time
+
+    print(f"⏱️  Basic search time: {basic_time:.3f} seconds")
+    print(">>> Basic search results <<<")
+    _print_hits(results)
+
+    # --- 3. Recompute search demo ---
+    print("\n[PHASE 3] Recompute search using embedding server...")
+
+    print("\n--- Recompute search mode (get real embeddings via network) ---")
+
+    # Configure recompute parameters
+    # NOTE(review): the first key is spelled "beighbor"; presumably this has
+    # to match the keyword the backend reads, so confirm against the backend
+    # before renaming it.
+    recompute_params = {
+        "recompute_beighbor_embeddings": True,  # Enable network recomputation
+        "USE_DEFERRED_FETCH": False,           # Don't use deferred fetch
+        "skip_search_reorder": True,           # Skip search reordering
+        "dedup_node_dis": True,               # Enable node distance deduplication
+        "prune_ratio": 0.1,                   # Pruning ratio 10%
+        "batch_recompute": False,             # Don't use batch recomputation
+        "global_pruning": False,              # Don't use global pruning
+        "zmq_port": 5555,                     # ZMQ port
+        "embedding_model": "sentence-transformers/all-mpnet-base-v2",
+    }
+
+    print("Recompute parameter configuration:")
+    for key, value in recompute_params.items():
+        print(f"  {key}: {value}")
+
+    print("\n🔄 Executing Recompute search...")
+    # Deliberately broad handler: this phase is best-effort, and a failure
+    # must not prevent the chat phase below from running.
+    try:
+        start_time = time.perf_counter()
+        recompute_results = searcher.search(query, top_k=2, **recompute_params)
+        recompute_time = time.perf_counter() - start_time
+
+        print(f"⏱️  Recompute search time: {recompute_time:.3f} seconds")
+        print(">>> Recompute search results <<<")
+        _print_hits(recompute_results)
+
+        # Compare results
+        print("\n--- Result comparison ---")
+        print(f"Basic search time: {basic_time:.3f} seconds")
+        print(f"Recompute time: {recompute_time:.3f} seconds")
+
+        print("\nBasic search vs Recompute results:")
+        # zip stops at the shorter result list, so only positions present in
+        # both searches are compared.
+        for position, (basic_hit, recompute_hit) in enumerate(zip(results, recompute_results), 1):
+            basic_score = basic_hit['score']
+            recompute_score = recompute_hit['score']
+            score_diff = abs(basic_score - recompute_score)
+            print(f"  Position {position}: PQ={basic_score:.4f}, Recompute={recompute_score:.4f}, Difference={score_diff:.4f}")
+
+        if recompute_time > basic_time:
+            print("✅ Recompute mode working correctly (more accurate but slower)")
+        else:
+            print("ℹ️  Recompute time is unusually fast, network recomputation may not be enabled")
+
+    except Exception as e:
+        print(f"❌ Recompute search failed: {e}")
+        print("This usually indicates an embedding server connection issue")
+
+    # --- 4. Chat demo ---
+    print("\n[PHASE 4] Starting chat session...")
+    chat = LeannChat(index_path=INDEX_PATH)
+    chat_response = chat.ask(query)
+    print(f"You: {query}")
+    print(f"Leann: {chat_response}")
+
+    print("\n==========================================================")
+    print("✅ Demo finished successfully!")
+    print("==========================================================")
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,76 @@
+from llama_index.core import SimpleDirectoryReader, Settings
+from llama_index.core.readers.base import BaseReader
+from llama_index.node_parser.docling import DoclingNodeParser
+from llama_index.readers.docling import DoclingReader
+from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
+import asyncio
+import os
+import dotenv
+from leann.api import LeannBuilder, LeannSearcher, LeannChat
+import leann_backend_diskann # Import to ensure backend registration
+import shutil
+from pathlib import Path
+
+dotenv.load_dotenv()
+
+# A single Docling reader instance is registered for every supported extension.
+reader = DoclingReader(export_type=DoclingReader.ExportType.JSON)
+file_extractor: dict[str, BaseReader] = {
+    extension: reader for extension in (".docx", ".pptx", ".pdf", ".xlsx")
+}
+node_parser = DoclingNodeParser(
+    chunker=HybridChunker(tokenizer="Qwen/Qwen3-Embedding-4B", max_tokens=10240)
+)
+
+# Load every matching file found under examples/data, including subfolders.
+documents = SimpleDirectoryReader(
+    "examples/data",
+    recursive=True,
+    file_extractor=file_extractor,
+    encoding="utf-8",
+    required_exts=[".pdf", ".docx", ".pptx", ".xlsx"],
+).load_data(show_progress=True)
+
+# Parse each document on its own and collect the text of every resulting node.
+all_texts = []
+for document in documents:
+    all_texts.extend(
+        parsed_node.text
+        for parsed_node in node_parser.get_nodes_from_documents([document])
+    )
+
+INDEX_DIR = Path("./test_pdf_index")
+INDEX_PATH = str(INDEX_DIR / "pdf_documents.leann")
+
+# Remove any index left over from a previous run before rebuilding.
+if INDEX_DIR.exists():
+    print(f"--- Cleaning up old index directory: {INDEX_DIR} ---")
+    shutil.rmtree(INDEX_DIR)
+
+print("\n[PHASE 1] Building Leann index...")
+
+builder = LeannBuilder(
+    backend_name="diskann",
+    embedding_model="sentence-transformers/all-mpnet-base-v2",  # Using a common sentence transformer model
+    graph_degree=32,
+    complexity=64,
+)
+
+print(f"Loaded {len(all_texts)} text chunks from documents.")
+for text in all_texts:
+    builder.add_text(text)
+
+builder.build_index(INDEX_PATH)
+print(f"\nLeann index built at {INDEX_PATH}!")
+
+async def main():
+    """Open a chat session on the index at INDEX_PATH and ask one question.
+
+    Nothing is awaited in the body; the function is a coroutine only so the
+    entry point below can drive it with ``asyncio.run``.
+    """
+    print("\n[PHASE 2] Starting Leann chat session...")
+    question = (
+        "Based on the paper, what are the two main techniques LEANN uses "
+        "to achieve low storage overhead and high retrieval accuracy?"
+    )
+    session = LeannChat(index_path=INDEX_PATH)
+
+    print(f"You: {question}")
+    # NOTE(review): the keyword is spelled "beighbor"; presumably it has to
+    # match what the chat/search API reads, so confirm before renaming.
+    answer = session.ask(question, recompute_beighbor_embeddings=True)
+    print(f"Leann: {answer}")
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
@@ -0,0 +1,81 @@
+"""
+Simple demo showing basic leann usage
+Run: uv run python examples/simple_demo.py
+"""
+
+from leann import LeannBuilder, LeannSearcher, LeannChat
+
+
+def main():
+    """Walk through index build, search, memory stats and chat on a tiny corpus."""
+    # The same index file is written by the builder and read by searcher/chat.
+    index_file = "demo_knowledge.leann"
+
+    print("=== Leann Simple Demo ===")
+    print()
+
+    # Sample knowledge base
+    chunks = [
+        "Machine learning is a subset of artificial intelligence that enables computers to learn without being explicitly programmed.",
+        "Deep learning uses neural networks with multiple layers to process data and make decisions.",
+        "Natural language processing helps computers understand and generate human language.",
+        "Computer vision enables machines to interpret and understand visual information from images and videos.",
+        "Reinforcement learning teaches agents to make decisions by receiving rewards or penalties for their actions.",
+        "Data science combines statistics, programming, and domain expertise to extract insights from data.",
+        "Big data refers to extremely large datasets that require special tools and techniques to process.",
+        "Cloud computing provides on-demand access to computing resources over the internet.",
+    ]
+
+    print("1. Building index (no embeddings stored)...")
+    builder = LeannBuilder(
+        embedding_model="sentence-transformers/all-mpnet-base-v2",
+        prune_ratio=0.7,  # Keep 30% of connections
+    )
+    builder.add_chunks(chunks)
+    builder.build_index(index_file)
+    print()
+
+    print("2. Searching with real-time embeddings...")
+    searcher = LeannSearcher(index_file)
+
+    sample_queries = (
+        "What is machine learning?",
+        "How does neural network work?",
+        "Tell me about data processing",
+    )
+    for query in sample_queries:
+        print(f"Query: {query}")
+        for rank, hit in enumerate(searcher.search(query, top_k=2), 1):
+            print(f"  {rank}. Score: {hit.score:.3f}")
+            # Only the first 100 characters of each hit are shown.
+            print(f"     Text: {hit.text[:100]}...")
+        print()
+
+    print("3. Memory stats:")
+    stats = searcher.get_memory_stats()
+    print(f"   Cache size: {stats.embedding_cache_size}")
+    print(f"   Cache memory: {stats.embedding_cache_memory_mb:.1f} MB")
+    print(f"   Total chunks: {stats.total_chunks}")
+    print()
+
+    print("4. Interactive chat demo:")
+    print("   (Note: Requires OpenAI API key for real responses)")
+
+    chat = LeannChat(index_file)
+
+    # Demo questions
+    demo_questions: list[str] = [
+        "What is the difference between machine learning and deep learning?",
+        "How is data science related to big data?",
+    ]
+    for question in demo_questions:
+        answer = chat.ask(question)
+        print(f"   Q: {question}")
+        print(f"   A: {answer}")
+        print()
+
+    print("Demo completed! Try running:")
+    print("   uv run python examples/document_search.py")
+
+
+if __name__ == "__main__":
+    main()