docs: embedding pruning
@@ -468,16 +468,27 @@ def convert_hnsw_graph_to_csr(input_filename, output_filename, prune_embeddings=
     # --- Write CSR HNSW graph data using unified function ---
     print(f"[{time.time() - start_time:.2f}s] Writing CSR HNSW graph data in FAISS-compatible order...")

-    # Determine storage fourcc based on prune_embeddings
-    output_storage_fourcc = NULL_INDEX_FOURCC if prune_embeddings else (storage_fourcc if 'storage_fourcc' in locals() else NULL_INDEX_FOURCC)
+    # Determine storage fourcc and data based on prune_embeddings
+    if prune_embeddings:
+        print(f"  Pruning embeddings: Writing NULL storage marker.")
+        output_storage_fourcc = NULL_INDEX_FOURCC
+        storage_data = b''
+    else:
+        # Keep embeddings - read and preserve original storage data
+        if storage_fourcc and storage_fourcc != NULL_INDEX_FOURCC:
+            print(f"  Preserving embeddings: Reading original storage data...")
+            storage_data = f_in.read()  # Read remaining storage data
+            output_storage_fourcc = storage_fourcc
+            print(f"  Read {len(storage_data)} bytes of storage data")
+        else:
+            print(f"  No embeddings found in original file (NULL storage)")
+            output_storage_fourcc = NULL_INDEX_FOURCC
+            storage_data = b''

     # Use the unified write function
     write_compact_format(f_out, original_hnsw_data, assign_probas_np, cum_nneighbor_per_level_np,
                          levels_np, compact_level_ptr, compact_node_offsets_np,
-                         compact_neighbors_data, output_storage_fourcc, storage_data if not prune_embeddings else b'')
+                         compact_neighbors_data, output_storage_fourcc, storage_data)

     # Clean up memory
     del assign_probas_np, cum_nneighbor_per_level_np, levels_np
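A minimal illustration of the two call paths the branch above serves; the converter's signature comes from the hunk header, while the file names here are hypothetical placeholders:

```python
# Hedged sketch, not part of the commit: exercising both branches of
# convert_hnsw_graph_to_csr. Input/output paths are hypothetical.
convert_hnsw_graph_to_csr("index_in.hnsw", "index_pruned.hnsw",
                          prune_embeddings=True)   # writes NULL storage marker
convert_hnsw_graph_to_csr("index_in.hnsw", "index_compact.hnsw",
                          prune_embeddings=False)  # preserves embedding bytes
```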
tests/sanity_checks/README_hnsw_pruning.md (new file, 68 lines)
@@ -0,0 +1,68 @@
# HNSW Index Storage Optimization

This document explains the storage optimization features available in the HNSW backend.

## Storage Modes

The HNSW backend supports two orthogonal optimization techniques:

### 1. CSR Compression (`is_compact=True`)
- Converts the graph structure from the standard format to Compressed Sparse Row (CSR) format
- Reduces memory overhead from graph adjacency storage
- Maintains all embedding data for direct access

### 2. Embedding Pruning (`is_recompute=True`)
- Removes embedding vectors from the index file
- Replaces them with a NULL storage marker (see the sketch below)
- Requires recomputation via an embedding server during search
- Must be used with `is_compact=True` for efficiency
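The NULL marker makes pruning cheap to detect at load time. Below is a minimal reader-side sketch, assuming a 4-byte marker at the start of the storage section; the function name and the literal marker value are illustrative, not the backend's actual constants:

```python
def load_storage(f_in, null_fourcc=b"NULL"):  # marker value is an assumption
    """Return raw embedding bytes, or None when the index was pruned."""
    fourcc = f_in.read(4)  # assumed 4-byte storage marker
    if fourcc == null_fourcc:
        # Pruned index: no embedding bytes follow; vectors are
        # recomputed via the embedding server at query time.
        return None
    # Embeddings preserved: the storage section follows the marker.
    return f_in.read()
```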
## Performance Impact

**Storage Reduction (100 vectors, 384 dimensions):**
```
Standard format:  168 KB  (embeddings + graph)
CSR only:         160 KB  (embeddings + compressed graph)
CSR + Pruned:       6 KB  (compressed graph only)
```

**Key Benefits:**
- **CSR compression**: ~5% size reduction from graph optimization
- **Embedding pruning**: ~95% size reduction by removing embeddings
- **Combined**: up to 96% total storage reduction
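These figures line up with the raw embedding payload: assuming float32 vectors, 100 × 384 × 4 bytes ≈ 150 KB, which accounts for most of the 162 KB gap between the standard and pruned formats.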
## Usage

```python
# Standard format (largest)
builder = LeannBuilder(
    backend_name="hnsw",
    is_compact=False,
    is_recompute=False
)

# CSR compressed (medium)
builder = LeannBuilder(
    backend_name="hnsw",
    is_compact=True,
    is_recompute=False
)

# CSR + Pruned (smallest, requires embedding server)
builder = LeannBuilder(
    backend_name="hnsw",
    is_compact=True,   # Required for pruning
    is_recompute=True  # Default: enabled
)
```
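Once an index is built with pruning, search needs the recompute path available. A minimal query-side sketch, assuming a `LeannSearcher` counterpart to `LeannBuilder` in `leann.api` (name and parameters assumed, not confirmed by this commit):

```python
from leann.api import LeannSearcher  # assumed counterpart to LeannBuilder

# Pruned indices recompute embeddings on demand, so the embedding
# server (and the passages file) must be available when this runs.
searcher = LeannSearcher(index_path="test_index.hnsw")
results = searcher.search("storage optimization", top_k=5)
```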
## Trade-offs

| Mode | Storage | Search Speed | Memory Usage | Setup Complexity |
|------|---------|--------------|--------------|------------------|
| Standard | Largest | Fastest | Highest | Simple |
| CSR | Medium | Fast | Medium | Simple |
| CSR + Pruned | Smallest | Slower\* | Lowest | Complex\*\* |

\*Requires a network round-trip to the embedding server for recomputation
\*\*Needs an embedding server and a passages file for search
tests/sanity_checks/test_hnsw_pruning.py (new file, 156 lines)
@@ -0,0 +1,156 @@
#!/usr/bin/env python3
"""
Sanity check script to verify HNSW index pruning effectiveness.
Tests the difference in file sizes between pruned and non-pruned indices.
"""
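# Run directly: python tests/sanity_checks/test_hnsw_pruning.py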
import glob
import os
import sys
import tempfile
from pathlib import Path

# Add the project root to the Python path
project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root))

# Import backend packages to trigger plugin registration
import leann_backend_hnsw

from leann.api import LeannBuilder
def create_sample_documents(num_docs=1000):
    """Create sample documents for testing"""
    documents = []
    for i in range(num_docs):
        documents.append(f"Sample document {i} with some random text content for testing purposes.")
    return documents


def build_index(documents, output_dir, is_recompute=True):
    """Build HNSW index with specified recompute setting"""
    index_path = os.path.join(output_dir, "test_index.hnsw")

    builder = LeannBuilder(
        backend_name="hnsw",
        embedding_model="sentence-transformers/all-MiniLM-L6-v2",
        M=16,
        efConstruction=100,
        distance_metric="mips",
        is_compact=True,
        is_recompute=is_recompute
    )

    for doc in documents:
        builder.add_text(doc)

    builder.build_index(index_path)

    return index_path


def get_file_size(filepath):
    """Get file size in bytes"""
    return os.path.getsize(filepath)


def find_index_file(index_path, search_dir):
    """Locate the .index file for index_path, falling back to a
    directory-wide search if the expected name does not exist."""
    print(f"   Looking for index files at: {index_path}")
    files = glob.glob(f"{index_path}*")
    print(f"   Found files: {files}")

    if os.path.exists(f"{index_path}.index"):
        return f"{index_path}.index"
    # Look for any .index file in the directory
    index_files = glob.glob(f"{search_dir}/*.index")
    if index_files:
        return index_files[0]
    raise FileNotFoundError(f"No .index file found in {search_dir}")
def main():
    print("🔍 HNSW Pruning Sanity Check")
    print("=" * 50)

    # Create sample data
    print("📊 Creating sample documents...")
    documents = create_sample_documents(num_docs=1000)
    print(f"   Number of documents: {len(documents)}")

    with tempfile.TemporaryDirectory() as temp_dir:
        print(f"📁 Working in temporary directory: {temp_dir}")

        # Build index with pruning (is_recompute=True)
        print("\n🔨 Building index with pruning enabled (is_recompute=True)...")
        pruned_dir = os.path.join(temp_dir, "pruned")
        os.makedirs(pruned_dir, exist_ok=True)

        pruned_index_path = build_index(documents, pruned_dir, is_recompute=True)
        pruned_index_file = find_index_file(pruned_index_path, pruned_dir)

        pruned_size = get_file_size(pruned_index_file)
        print("   ✅ Pruned index built successfully")
        print(f"   📏 Pruned index size: {pruned_size:,} bytes ({pruned_size/1024:.1f} KB)")

        # Build index without pruning (is_recompute=False)
        print("\n🔨 Building index without pruning (is_recompute=False)...")
        non_pruned_dir = os.path.join(temp_dir, "non_pruned")
        os.makedirs(non_pruned_dir, exist_ok=True)

        non_pruned_index_path = build_index(documents, non_pruned_dir, is_recompute=False)
        non_pruned_index_file = find_index_file(non_pruned_index_path, non_pruned_dir)

        non_pruned_size = get_file_size(non_pruned_index_file)
        print("   ✅ Non-pruned index built successfully")
        print(f"   📏 Non-pruned index size: {non_pruned_size:,} bytes ({non_pruned_size/1024:.1f} KB)")

        # Compare sizes
        print("\n📊 Comparison Results:")
        print("=" * 30)
        size_diff = non_pruned_size - pruned_size
        size_ratio = pruned_size / non_pruned_size if non_pruned_size > 0 else 0
        reduction_percent = (1 - size_ratio) * 100

        print(f"Non-pruned index: {non_pruned_size:,} bytes ({non_pruned_size/1024:.1f} KB)")
        print(f"Pruned index:     {pruned_size:,} bytes ({pruned_size/1024:.1f} KB)")
        print(f"Size difference:  {size_diff:,} bytes ({size_diff/1024:.1f} KB)")
        print(f"Size ratio:       {size_ratio:.3f}")
        print(f"Size reduction:   {reduction_percent:.1f}%")

        # Verify pruning effectiveness
        print("\n🔍 Verification:")
        if size_diff > 0:
            print("   ✅ Pruning is effective - pruned index is smaller")
            if reduction_percent > 10:
                print(f"   ✅ Significant size reduction: {reduction_percent:.1f}%")
            else:
                print(f"   ⚠️ Small size reduction: {reduction_percent:.1f}%")
        else:
            print("   ❌ Pruning appears ineffective - no size reduction")

        # Check if passages files were created
        pruned_passages = f"{pruned_index_path}.passages.json"
        non_pruned_passages = f"{non_pruned_index_path}.passages.json"

        print("\n📄 Passages files:")
        print(f"   Pruned passages file exists: {os.path.exists(pruned_passages)}")
        print(f"   Non-pruned passages file exists: {os.path.exists(non_pruned_passages)}")

    return True


if __name__ == "__main__":
    success = main()
    sys.exit(0 if success else 1)