docs: embedding pruning

2025-07-06 19:50:01 +00:00
parent b4ae57b2c0
commit 5611f708e9
3 changed files with 239 additions and 4 deletions
--- a/packages/leann-backend-hnsw/leann_backend_hnsw/convert_to_csr.py
+++ b/packages/leann-backend-hnsw/leann_backend_hnsw/convert_to_csr.py
@@ -468,16 +468,27 @@ def convert_hnsw_graph_to_csr(input_filename, output_filename, prune_embeddings=
            # --- Write CSR HNSW graph data using unified function ---
            print(f"[{time.time() - start_time:.2f}s] Writing CSR HNSW graph data in FAISS-compatible order...")
            
-            # Determine storage fourcc based on prune_embeddings
-            output_storage_fourcc = NULL_INDEX_FOURCC if prune_embeddings else (storage_fourcc if 'storage_fourcc' in locals() else NULL_INDEX_FOURCC)
+            # Determine storage fourcc and data based on prune_embeddings
            if prune_embeddings:
                print(f"   Pruning embeddings: Writing NULL storage marker.")
-            storage_data = b''
+                output_storage_fourcc = NULL_INDEX_FOURCC
+                storage_data = b''
+            else:
+                # Keep embeddings - read and preserve original storage data
+                if storage_fourcc and storage_fourcc != NULL_INDEX_FOURCC:
+                    print(f"   Preserving embeddings: Reading original storage data...")
+                    storage_data = f_in.read()  # Read remaining storage data
+                    output_storage_fourcc = storage_fourcc
+                    print(f"   Read {len(storage_data)} bytes of storage data")
+                else:
+                    print(f"   No embeddings found in original file (NULL storage)")
+                    output_storage_fourcc = NULL_INDEX_FOURCC
+                    storage_data = b''
            
            # Use the unified write function
            write_compact_format(f_out, original_hnsw_data, assign_probas_np, cum_nneighbor_per_level_np, 
                               levels_np, compact_level_ptr, compact_node_offsets_np, 
-                               compact_neighbors_data, output_storage_fourcc, storage_data if not prune_embeddings else b'')
+                               compact_neighbors_data, output_storage_fourcc, storage_data)
            
            # Clean up memory
            del assign_probas_np, cum_nneighbor_per_level_np, levels_np
--- a/tests/sanity_checks/README_hnsw_pruning.md
+++ b/tests/sanity_checks/README_hnsw_pruning.md
@@ -0,0 +1,68 @@
+# HNSW Index Storage Optimization
+
+This document explains the storage optimization features available in the HNSW backend.
+
+## Storage Modes
+
+The HNSW backend supports two complementary optimization techniques:
+
+### 1. CSR Compression (`is_compact=True`)
+- Converts the graph structure from standard format to Compressed Sparse Row (CSR) format
+- Reduces memory overhead from graph adjacency storage
+- Maintains all embedding data for direct access
+
+### 2. Embedding Pruning (`is_recompute=True`) 
+- Removes embedding vectors from the index file
+- Replaces them with a NULL storage marker
+- Requires recomputation via embedding server during search
+- Requires `is_compact=True`: embeddings are pruned as part of the CSR conversion
+
+## Performance Impact
+
+**Storage Reduction (100 vectors, 384 dimensions):**
+```
+Standard format:    168 KB (embeddings + graph)
+CSR only:           160 KB (embeddings + compressed graph)
+CSR + Pruned:         6 KB (compressed graph only)
+```
+
+**Key Benefits:**
+- **CSR compression**: ~5% size reduction from graph optimization (168 KB → 160 KB)
+- **Embedding pruning**: ~96% further size reduction by removing embeddings (160 KB → 6 KB)
+- **Combined**: ~96% total storage reduction relative to the standard format (168 KB → 6 KB)
+
+## Usage
+
+```python
+# Standard format (largest)
+builder = LeannBuilder(
+    backend_name="hnsw",
+    is_compact=False,
+    is_recompute=False
+)
+
+# CSR compressed (medium)
+builder = LeannBuilder(
+    backend_name="hnsw", 
+    is_compact=True,
+    is_recompute=False
+)
+
+# CSR + Pruned (smallest, requires embedding server)
+builder = LeannBuilder(
+    backend_name="hnsw",
+    is_compact=True,      # Required for pruning
+    is_recompute=True     # Default: enabled
+)
+```
+
+## Trade-offs
+
+| Mode | Storage | Search Speed | Memory Usage | Setup Complexity |
+|------|---------|--------------|--------------|------------------|
+| Standard | Largest | Fastest | Highest | Simple |
+| CSR | Medium | Fast | Medium | Simple |
+| CSR + Pruned | Smallest | Slower* | Lowest | Complex** |
+
+*Requires network round-trip to embedding server for recomputation  
+**Needs embedding server and passages file for search
--- a/tests/sanity_checks/test_hnsw_pruning.py
+++ b/tests/sanity_checks/test_hnsw_pruning.py
@@ -0,0 +1,156 @@
+#!/usr/bin/env python3
+"""
+Sanity check script to verify HNSW index pruning effectiveness.
+Tests the difference in file sizes between pruned and non-pruned indices.
+"""
+
+import os
+import sys
+import tempfile
+from pathlib import Path
+import numpy as np
+import json
+
+# Add the project root to the Python path
+project_root = Path(__file__).parent.parent.parent
+sys.path.insert(0, str(project_root))
+
+# Import backend packages to trigger plugin registration
+import leann_backend_hnsw
+
+from leann.api import LeannBuilder
+
+def create_sample_documents(num_docs=1000):
+    """Create sample documents for testing"""
+    documents = []
+    for i in range(num_docs):
+        documents.append(f"Sample document {i} with some random text content for testing purposes.")
+    return documents
+
+def build_index(documents, output_dir, is_recompute=True):
+    """Build HNSW index with specified recompute setting"""
+    index_path = os.path.join(output_dir, "test_index.hnsw")
+    
+    builder = LeannBuilder(
+        backend_name="hnsw",
+        embedding_model="sentence-transformers/all-MiniLM-L6-v2",
+        M=16,
+        efConstruction=100,
+        distance_metric="mips",
+        is_compact=True,
+        is_recompute=is_recompute
+    )
+    
+    for doc in documents:
+        builder.add_text(doc)
+    
+    builder.build_index(index_path)
+    
+    return index_path
+
+def get_file_size(filepath):
+    """Get file size in bytes"""
+    return os.path.getsize(filepath)
+
+def main():
+    print("🔍 HNSW Pruning Sanity Check")
+    print("=" * 50)
+    
+    # Create sample data
+    print("📊 Creating sample documents...")
+    documents = create_sample_documents(num_docs=1000)
+    print(f"   Number of documents: {len(documents)}")
+    
+    with tempfile.TemporaryDirectory() as temp_dir:
+        print(f"📁 Working in temporary directory: {temp_dir}")
+        
+        # Build index with pruning (is_recompute=True)
+        print("\n🔨 Building index with pruning enabled (is_recompute=True)...")
+        pruned_dir = os.path.join(temp_dir, "pruned")
+        os.makedirs(pruned_dir, exist_ok=True)
+        
+        pruned_index_path = build_index(documents, pruned_dir, is_recompute=True)
+        # Check what files were actually created
+        print(f"   Looking for index files at: {pruned_index_path}")
+        import glob
+        files = glob.glob(f"{pruned_index_path}*")
+        print(f"   Found files: {files}")
+        
+        # Try to find the actual index file
+        if os.path.exists(f"{pruned_index_path}.index"):
+            pruned_index_file = f"{pruned_index_path}.index"
+        else:
+            # Look for any .index file in the directory
+            index_files = glob.glob(f"{pruned_dir}/*.index")
+            if index_files:
+                pruned_index_file = index_files[0]
+            else:
+                raise FileNotFoundError(f"No .index file found in {pruned_dir}")
+        
+        pruned_size = get_file_size(pruned_index_file)
+        print(f"   ✅ Pruned index built successfully")
+        print(f"   📏 Pruned index size: {pruned_size:,} bytes ({pruned_size/1024:.1f} KB)")
+        
+        # Build index without pruning (is_recompute=False)
+        print("\n🔨 Building index without pruning (is_recompute=False)...")
+        non_pruned_dir = os.path.join(temp_dir, "non_pruned")
+        os.makedirs(non_pruned_dir, exist_ok=True)
+        
+        non_pruned_index_path = build_index(documents, non_pruned_dir, is_recompute=False)
+        # Check what files were actually created
+        print(f"   Looking for index files at: {non_pruned_index_path}")
+        files = glob.glob(f"{non_pruned_index_path}*")
+        print(f"   Found files: {files}")
+        
+        # Try to find the actual index file
+        if os.path.exists(f"{non_pruned_index_path}.index"):
+            non_pruned_index_file = f"{non_pruned_index_path}.index"
+        else:
+            # Look for any .index file in the directory
+            index_files = glob.glob(f"{non_pruned_dir}/*.index")
+            if index_files:
+                non_pruned_index_file = index_files[0]
+            else:
+                raise FileNotFoundError(f"No .index file found in {non_pruned_dir}")
+        
+        non_pruned_size = get_file_size(non_pruned_index_file)
+        print(f"   ✅ Non-pruned index built successfully")
+        print(f"   📏 Non-pruned index size: {non_pruned_size:,} bytes ({non_pruned_size/1024:.1f} KB)")
+        
+        # Compare sizes
+        print("\n📊 Comparison Results:")
+        print("=" * 30)
+        size_diff = non_pruned_size - pruned_size
+        size_ratio = pruned_size / non_pruned_size if non_pruned_size > 0 else 0
+        reduction_percent = (1 - size_ratio) * 100
+        
+        print(f"Non-pruned index: {non_pruned_size:,} bytes ({non_pruned_size/1024:.1f} KB)")
+        print(f"Pruned index:     {pruned_size:,} bytes ({pruned_size/1024:.1f} KB)")
+        print(f"Size difference:  {size_diff:,} bytes ({size_diff/1024:.1f} KB)")
+        print(f"Size ratio:       {size_ratio:.3f}")
+        print(f"Size reduction:   {reduction_percent:.1f}%")
+        
+        # Verify pruning effectiveness
+        print("\n🔍 Verification:")
+        if size_diff > 0:
+            print("   ✅ Pruning is effective - pruned index is smaller")
+            if reduction_percent > 10:
+                print(f"   ✅ Significant size reduction: {reduction_percent:.1f}%")
+            else:
+                print(f"   ⚠️  Small size reduction: {reduction_percent:.1f}%")
+        else:
+            print("   ❌ Pruning appears ineffective - no size reduction")
+        
+        # Check if passages files were created
+        pruned_passages = f"{pruned_index_path}.passages.json"
+        non_pruned_passages = f"{non_pruned_index_path}.passages.json"
+        
+        print(f"\n📄 Passages files:")
+        print(f"   Pruned passages file exists: {os.path.exists(pruned_passages)}")
+        print(f"   Non-pruned passages file exists: {os.path.exists(non_pruned_passages)}")
+        
+        return True
+
+if __name__ == "__main__":
+    success = main()
+    sys.exit(0 if success else 1)