Initial commit

This commit is contained in:
yichuan520030910320
2025-06-30 09:05:05 +00:00
commit 46f6cc100b
1231 changed files with 278432 additions and 0 deletions

View File

File diff suppressed because it is too large Load Diff

146
examples/document_search.py Normal file
View File

@@ -0,0 +1,146 @@
#!/usr/bin/env python3
"""
Document search demo with recompute mode
"""
import os
from pathlib import Path
import shutil
import time
# Import backend packages to trigger plugin registration
try:
import leann_backend_diskann
import leann_backend_hnsw
print("INFO: Backend packages imported successfully.")
except ImportError as e:
print(f"WARNING: Could not import backend packages. Error: {e}")
# Import upper-level API from leann-core
from leann.api import LeannBuilder, LeannSearcher, LeannChat
def load_sample_documents():
"""Create sample documents for demonstration"""
docs = [
{"title": "Intro to Python", "content": "Python is a high-level, interpreted language known for simplicity."},
{"title": "ML Basics", "content": "Machine learning builds systems that learn from data."},
{"title": "Data Structures", "content": "Data structures like arrays, lists, and graphs organize data."},
]
return docs
def main():
print("==========================================================")
print("=== Leann Document Search Demo (DiskANN + Recompute) ===")
print("==========================================================")
INDEX_DIR = Path("./test_indices")
INDEX_PATH = str(INDEX_DIR / "documents.diskann")
BACKEND_TO_TEST = "diskann"
if INDEX_DIR.exists():
print(f"--- Cleaning up old index directory: {INDEX_DIR} ---")
shutil.rmtree(INDEX_DIR)
# --- 1. Build index ---
print(f"\n[PHASE 1] Building index using '{BACKEND_TO_TEST}' backend...")
builder = LeannBuilder(
backend_name=BACKEND_TO_TEST,
graph_degree=32,
complexity=64
)
documents = load_sample_documents()
print(f"Loaded {len(documents)} sample documents.")
for doc in documents:
builder.add_text(doc["content"], metadata={"title": doc["title"]})
builder.build_index(INDEX_PATH)
print(f"\nIndex built!")
# --- 2. Basic search demo ---
print(f"\n[PHASE 2] Basic search using '{BACKEND_TO_TEST}' backend...")
searcher = LeannSearcher(index_path=INDEX_PATH)
query = "What is machine learning?"
print(f"\nQuery: '{query}'")
print("\n--- Basic search mode (PQ computation) ---")
start_time = time.time()
results = searcher.search(query, top_k=2)
basic_time = time.time() - start_time
print(f"⏱️ Basic search time: {basic_time:.3f} seconds")
print(">>> Basic search results <<<")
for i, res in enumerate(results, 1):
print(f" {i}. ID: {res['id']}, Score: {res['score']:.4f}, Text: '{res['text']}', Metadata: {res['metadata']}")
# --- 3. Recompute search demo ---
print(f"\n[PHASE 3] Recompute search using embedding server...")
print("\n--- Recompute search mode (get real embeddings via network) ---")
# Configure recompute parameters
recompute_params = {
"recompute_beighbor_embeddings": True, # Enable network recomputation
"USE_DEFERRED_FETCH": False, # Don't use deferred fetch
"skip_search_reorder": True, # Skip search reordering
"dedup_node_dis": True, # Enable node distance deduplication
"prune_ratio": 0.1, # Pruning ratio 10%
"batch_recompute": False, # Don't use batch recomputation
"global_pruning": False, # Don't use global pruning
"zmq_port": 5555, # ZMQ port
"embedding_model": "sentence-transformers/all-mpnet-base-v2"
}
print("Recompute parameter configuration:")
for key, value in recompute_params.items():
print(f" {key}: {value}")
print(f"\n🔄 Executing Recompute search...")
try:
start_time = time.time()
recompute_results = searcher.search(query, top_k=2, **recompute_params)
recompute_time = time.time() - start_time
print(f"⏱️ Recompute search time: {recompute_time:.3f} seconds")
print(">>> Recompute search results <<<")
for i, res in enumerate(recompute_results, 1):
print(f" {i}. ID: {res['id']}, Score: {res['score']:.4f}, Text: '{res['text']}', Metadata: {res['metadata']}")
# Compare results
print(f"\n--- Result comparison ---")
print(f"Basic search time: {basic_time:.3f} seconds")
print(f"Recompute time: {recompute_time:.3f} seconds")
print("\nBasic search vs Recompute results:")
for i in range(min(len(results), len(recompute_results))):
basic_score = results[i]['score']
recompute_score = recompute_results[i]['score']
score_diff = abs(basic_score - recompute_score)
print(f" Position {i+1}: PQ={basic_score:.4f}, Recompute={recompute_score:.4f}, Difference={score_diff:.4f}")
if recompute_time > basic_time:
print(f"✅ Recompute mode working correctly (more accurate but slower)")
else:
print(f" Recompute time is unusually fast, network recomputation may not be enabled")
except Exception as e:
print(f"❌ Recompute search failed: {e}")
print("This usually indicates an embedding server connection issue")
# --- 4. Chat demo ---
print(f"\n[PHASE 4] Starting chat session...")
chat = LeannChat(index_path=INDEX_PATH)
chat_response = chat.ask(query)
print(f"You: {query}")
print(f"Leann: {chat_response}")
print("\n==========================================================")
print("✅ Demo finished successfully!")
print("==========================================================")
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,76 @@
from llama_index.core import SimpleDirectoryReader, Settings
from llama_index.core.readers.base import BaseReader
from llama_index.node_parser.docling import DoclingNodeParser
from llama_index.readers.docling import DoclingReader
from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
import asyncio
import os
import dotenv
from leann.api import LeannBuilder, LeannSearcher, LeannChat
import leann_backend_diskann # Import to ensure backend registration
import shutil
from pathlib import Path
dotenv.load_dotenv()
reader = DoclingReader(export_type=DoclingReader.ExportType.JSON)
file_extractor: dict[str, BaseReader] = {
".docx": reader,
".pptx": reader,
".pdf": reader,
".xlsx": reader,
}
node_parser = DoclingNodeParser(
chunker=HybridChunker(tokenizer="Qwen/Qwen3-Embedding-4B", max_tokens=10240)
)
documents = SimpleDirectoryReader(
"examples/data",
recursive=True,
file_extractor=file_extractor,
encoding="utf-8",
required_exts=[".pdf", ".docx", ".pptx", ".xlsx"]
).load_data(show_progress=True)
# Extract text from documents and prepare for Leann
all_texts = []
for doc in documents:
# DoclingNodeParser returns Node objects, which have a text attribute
nodes = node_parser.get_nodes_from_documents([doc])
for node in nodes:
all_texts.append(node.text)
INDEX_DIR = Path("./test_pdf_index")
INDEX_PATH = str(INDEX_DIR / "pdf_documents.leann")
if INDEX_DIR.exists():
print(f"--- Cleaning up old index directory: {INDEX_DIR} ---")
shutil.rmtree(INDEX_DIR)
print(f"\n[PHASE 1] Building Leann index...")
builder = LeannBuilder(
backend_name="diskann",
embedding_model="sentence-transformers/all-mpnet-base-v2", # Using a common sentence transformer model
graph_degree=32,
complexity=64
)
print(f"Loaded {len(all_texts)} text chunks from documents.")
for chunk_text in all_texts:
builder.add_text(chunk_text)
builder.build_index(INDEX_PATH)
print(f"\nLeann index built at {INDEX_PATH}!")
async def main():
print(f"\n[PHASE 2] Starting Leann chat session...")
chat = LeannChat(index_path=INDEX_PATH)
query = "Based on the paper, what are the two main techniques LEANN uses to achieve low storage overhead and high retrieval accuracy?"
print(f"You: {query}")
chat_response = chat.ask(query, recompute_beighbor_embeddings=True)
print(f"Leann: {chat_response}")
if __name__ == "__main__":
asyncio.run(main())

81
examples/simple_demo.py Normal file
View File

@@ -0,0 +1,81 @@
"""
Simple demo showing basic leann usage
Run: uv run python examples/simple_demo.py
"""
from leann import LeannBuilder, LeannSearcher, LeannChat
def main():
print("=== Leann Simple Demo ===")
print()
# Sample knowledge base
chunks = [
"Machine learning is a subset of artificial intelligence that enables computers to learn without being explicitly programmed.",
"Deep learning uses neural networks with multiple layers to process data and make decisions.",
"Natural language processing helps computers understand and generate human language.",
"Computer vision enables machines to interpret and understand visual information from images and videos.",
"Reinforcement learning teaches agents to make decisions by receiving rewards or penalties for their actions.",
"Data science combines statistics, programming, and domain expertise to extract insights from data.",
"Big data refers to extremely large datasets that require special tools and techniques to process.",
"Cloud computing provides on-demand access to computing resources over the internet.",
]
print("1. Building index (no embeddings stored)...")
builder = LeannBuilder(
embedding_model="sentence-transformers/all-mpnet-base-v2",
prune_ratio=0.7, # Keep 30% of connections
)
builder.add_chunks(chunks)
builder.build_index("demo_knowledge.leann")
print()
print("2. Searching with real-time embeddings...")
searcher = LeannSearcher("demo_knowledge.leann")
queries = [
"What is machine learning?",
"How does neural network work?",
"Tell me about data processing",
]
for query in queries:
print(f"Query: {query}")
results = searcher.search(query, top_k=2)
for i, result in enumerate(results, 1):
print(f" {i}. Score: {result.score:.3f}")
print(f" Text: {result.text[:100]}...")
print()
print("3. Memory stats:")
stats = searcher.get_memory_stats()
print(f" Cache size: {stats.embedding_cache_size}")
print(f" Cache memory: {stats.embedding_cache_memory_mb:.1f} MB")
print(f" Total chunks: {stats.total_chunks}")
print()
print("4. Interactive chat demo:")
print(" (Note: Requires OpenAI API key for real responses)")
chat = LeannChat("demo_knowledge.leann")
# Demo questions
demo_questions: list[str] = [
"What is the difference between machine learning and deep learning?",
"How is data science related to big data?",
]
for question in demo_questions:
print(f" Q: {question}")
response = chat.ask(question)
print(f" A: {response}")
print()
print("Demo completed! Try running:")
print(" uv run python examples/document_search.py")
if __name__ == "__main__":
main()