Files
LEANN/packages/leann-backend-diskann/leann_backend_diskann/graph_partition_simple.py
Andy Lee 575b354976 style: organize imports per ruff; finish py39 Optional changes
- Fix import ordering in embedding servers and graph_partition_simple
- Remove duplicate Optional import
- Complete Optional[...] replacements
2025-08-07 15:06:25 -07:00

138 lines
4.7 KiB
Python

#!/usr/bin/env python3
"""
Simplified Graph Partition Module for LEANN DiskANN Backend
This module provides a simple Python interface for graph partitioning
that directly calls the existing executables.
"""
import os
import subprocess
import tempfile
from pathlib import Path
from typing import Optional
def partition_graph_simple(
index_prefix_path: str, output_dir: Optional[str] = None, **kwargs
) -> tuple[str, str]:
"""
Simple function to partition a graph index.
Args:
index_prefix_path: Path to the index prefix (e.g., "/path/to/index")
output_dir: Output directory (defaults to parent of index_prefix_path)
**kwargs: Additional parameters for graph partitioning
Returns:
Tuple of (disk_graph_index_path, partition_bin_path)
"""
# Set default parameters
params = {
"gp_times": 10,
"lock_nums": 10,
"cut": 100,
"scale_factor": 1,
"data_type": "float",
"thread_nums": 10,
**kwargs,
}
# Determine output directory
if output_dir is None:
output_dir = str(Path(index_prefix_path).parent)
# Find the graph_partition directory
current_file = Path(__file__)
graph_partition_dir = current_file.parent.parent / "third_party" / "DiskANN" / "graph_partition"
if not graph_partition_dir.exists():
raise RuntimeError(f"Graph partition directory not found: {graph_partition_dir}")
# Find input index file
old_index_file = f"{index_prefix_path}_disk_beam_search.index"
if not os.path.exists(old_index_file):
old_index_file = f"{index_prefix_path}_disk.index"
if not os.path.exists(old_index_file):
raise RuntimeError(f"Index file not found: {old_index_file}")
# Create temporary directory for processing
with tempfile.TemporaryDirectory() as temp_dir:
temp_data_dir = Path(temp_dir) / "data"
temp_data_dir.mkdir(parents=True, exist_ok=True)
# Set up paths for temporary files
graph_path = temp_data_dir / "starling" / "_M_R_L_B" / "GRAPH"
graph_gp_path = (
graph_path
/ f"GP_TIMES_{params['gp_times']}_LOCK_{params['lock_nums']}_GP_USE_FREQ0_CUT{params['cut']}_SCALE{params['scale_factor']}"
)
graph_gp_path.mkdir(parents=True, exist_ok=True)
# Run the build script with our parameters
cmd = [str(graph_partition_dir / "build.sh"), "release", "split_graph", index_prefix_path]
# Set environment variables for parameters
env = os.environ.copy()
env.update(
{
"GP_TIMES": str(params["gp_times"]),
"GP_LOCK_NUMS": str(params["lock_nums"]),
"GP_CUT": str(params["cut"]),
"GP_SCALE_F": str(params["scale_factor"]),
"DATA_TYPE": params["data_type"],
"GP_T": str(params["thread_nums"]),
}
)
print(f"Running graph partition with command: {' '.join(cmd)}")
print(f"Working directory: {graph_partition_dir}")
# Run the command
result = subprocess.run(
cmd, env=env, capture_output=True, text=True, cwd=graph_partition_dir
)
if result.returncode != 0:
print(f"Command failed with return code {result.returncode}")
print(f"stdout: {result.stdout}")
print(f"stderr: {result.stderr}")
raise RuntimeError(
f"Graph partitioning failed with return code {result.returncode}.\n"
f"stdout: {result.stdout}\n"
f"stderr: {result.stderr}"
)
# Check if output files were created
disk_graph_path = Path(output_dir) / "_disk_graph.index"
partition_bin_path = Path(output_dir) / "_partition.bin"
if not disk_graph_path.exists():
raise RuntimeError(f"Expected output file not found: {disk_graph_path}")
if not partition_bin_path.exists():
raise RuntimeError(f"Expected output file not found: {partition_bin_path}")
print("✅ Partitioning completed successfully!")
print(f" Disk graph index: {disk_graph_path}")
print(f" Partition binary: {partition_bin_path}")
return str(disk_graph_path), str(partition_bin_path)
# Example usage
if __name__ == "__main__":
try:
disk_graph_path, partition_bin_path = partition_graph_simple(
"/Users/yichuan/Desktop/release2/leann/diskannbuild/test_doc_files",
gp_times=5,
lock_nums=5,
cut=50,
)
print("Success! Output files:")
print(f" - {disk_graph_path}")
print(f" - {partition_bin_path}")
except Exception as e:
print(f"Error: {e}")