This commit is contained in:
yichuan520030910320
2025-08-05 23:23:07 -07:00
parent a72090d2ab
commit 4a1353761a
3 changed files with 124 additions and 144 deletions

View File

@@ -4,9 +4,4 @@ from . import graph_partition
# Export main classes and functions # Export main classes and functions
from .graph_partition import GraphPartitioner, partition_graph from .graph_partition import GraphPartitioner, partition_graph
__all__ = [ __all__ = ["diskann_backend", "graph_partition", "GraphPartitioner", "partition_graph"]
"diskann_backend",
"graph_partition",
"GraphPartitioner",
"partition_graph"
]

View File

@@ -8,11 +8,10 @@ performance.
""" """
import os import os
import shutil
import subprocess import subprocess
import tempfile import tempfile
import shutil
from pathlib import Path from pathlib import Path
from typing import Optional, Tuple
class GraphPartitioner: class GraphPartitioner:
@@ -51,14 +50,16 @@ class GraphPartitioner:
try: try:
self._get_executable_path("partitioner") self._get_executable_path("partitioner")
self._get_executable_path("index_relayout") self._get_executable_path("index_relayout")
except FileNotFoundError as e: except FileNotFoundError:
# Try to build the executables automatically # Try to build the executables automatically
print("Executables not found, attempting to build them...") print("Executables not found, attempting to build them...")
self._build_executables() self._build_executables()
def _build_executables(self): def _build_executables(self):
"""Build the required executables.""" """Build the required executables."""
graph_partition_dir = Path(__file__).parent.parent / "third_party" / "DiskANN" / "graph_partition" graph_partition_dir = (
Path(__file__).parent.parent / "third_party" / "DiskANN" / "graph_partition"
)
original_dir = os.getcwd() original_dir = os.getcwd()
try: try:
@@ -70,12 +71,7 @@ class GraphPartitioner:
# Run the build script # Run the build script
cmd = ["./build.sh", self.build_type, "split_graph", "/tmp/dummy"] cmd = ["./build.sh", self.build_type, "split_graph", "/tmp/dummy"]
result = subprocess.run( subprocess.run(cmd, capture_output=True, text=True, cwd=graph_partition_dir)
cmd,
capture_output=True,
text=True,
cwd=graph_partition_dir
)
# Check if executables were created # Check if executables were created
partitioner_path = self._get_executable_path("partitioner") partitioner_path = self._get_executable_path("partitioner")
@@ -90,11 +86,8 @@ class GraphPartitioner:
os.chdir(original_dir) os.chdir(original_dir)
def partition_graph( def partition_graph(
self, self, index_prefix_path: str, output_dir: str | None = None, **kwargs
index_prefix_path: str, ) -> tuple[str, str]:
output_dir: Optional[str] = None,
**kwargs
) -> Tuple[str, str]:
""" """
Partition a disk-based index for improved performance. Partition a disk-based index for improved performance.
@@ -123,7 +116,7 @@ class GraphPartitioner:
"scale_factor": 1, "scale_factor": 1,
"data_type": "float", "data_type": "float",
"thread_nums": 10, "thread_nums": 10,
**kwargs **kwargs,
} }
# Determine output directory # Determine output directory
@@ -137,7 +130,9 @@ class GraphPartitioner:
# Create temporary directory for processing # Create temporary directory for processing
with tempfile.TemporaryDirectory() as temp_dir: with tempfile.TemporaryDirectory() as temp_dir:
# Change to the graph_partition directory for temporary files # Change to the graph_partition directory for temporary files
graph_partition_dir = Path(__file__).parent.parent / "third_party" / "DiskANN" / "graph_partition" graph_partition_dir = (
Path(__file__).parent.parent / "third_party" / "DiskANN" / "graph_partition"
)
original_dir = os.getcwd() original_dir = os.getcwd()
try: try:
@@ -149,7 +144,10 @@ class GraphPartitioner:
# Set up paths for temporary files # Set up paths for temporary files
graph_path = temp_data_dir / "starling" / "_M_R_L_B" / "GRAPH" graph_path = temp_data_dir / "starling" / "_M_R_L_B" / "GRAPH"
graph_gp_path = graph_path / f"GP_TIMES_{params['gp_times']}_LOCK_{params['lock_nums']}_GP_USE_FREQ0_CUT{params['cut']}_SCALE{params['scale_factor']}" graph_gp_path = (
graph_path
/ f"GP_TIMES_{params['gp_times']}_LOCK_{params['lock_nums']}_GP_USE_FREQ0_CUT{params['cut']}_SCALE{params['scale_factor']}"
)
graph_gp_path.mkdir(parents=True, exist_ok=True) graph_gp_path.mkdir(parents=True, exist_ok=True)
# Find input index file # Find input index file
@@ -164,21 +162,25 @@ class GraphPartitioner:
gp_file_path = graph_gp_path / "_part.bin" gp_file_path = graph_gp_path / "_part.bin"
partitioner_cmd = [ partitioner_cmd = [
partitioner_path, partitioner_path,
"--index_file", old_index_file, "--index_file",
"--data_type", params["data_type"], old_index_file,
"--gp_file", str(gp_file_path), "--data_type",
"-T", str(params["thread_nums"]), params["data_type"],
"--ldg_times", str(params["gp_times"]), "--gp_file",
"--scale", str(params["scale_factor"]), str(gp_file_path),
"--mode", "1" "-T",
str(params["thread_nums"]),
"--ldg_times",
str(params["gp_times"]),
"--scale",
str(params["scale_factor"]),
"--mode",
"1",
] ]
print(f"Running partitioner: {' '.join(partitioner_cmd)}") print(f"Running partitioner: {' '.join(partitioner_cmd)}")
result = subprocess.run( result = subprocess.run(
partitioner_cmd, partitioner_cmd, capture_output=True, text=True, cwd=graph_partition_dir
capture_output=True,
text=True,
cwd=graph_partition_dir
) )
if result.returncode != 0: if result.returncode != 0:
@@ -195,15 +197,12 @@ class GraphPartitioner:
old_index_file, old_index_file,
str(gp_file_path), str(gp_file_path),
params["data_type"], params["data_type"],
"1" "1",
] ]
print(f"Running relayout: {' '.join(relayout_cmd)}") print(f"Running relayout: {' '.join(relayout_cmd)}")
result = subprocess.run( result = subprocess.run(
relayout_cmd, relayout_cmd, capture_output=True, text=True, cwd=graph_partition_dir
capture_output=True,
text=True,
cwd=graph_partition_dir
) )
if result.returncode != 0: if result.returncode != 0:
@@ -245,16 +244,13 @@ class GraphPartitioner:
return { return {
"file_size": stat.st_size, "file_size": stat.st_size,
"file_path": partition_bin_path, "file_path": partition_bin_path,
"modified_time": stat.st_mtime "modified_time": stat.st_mtime,
} }
def partition_graph( def partition_graph(
index_prefix_path: str, index_prefix_path: str, output_dir: str | None = None, build_type: str = "release", **kwargs
output_dir: Optional[str] = None, ) -> tuple[str, str]:
build_type: str = "release",
**kwargs
) -> Tuple[str, str]:
""" """
Convenience function to partition a graph index. Convenience function to partition a graph index.
@@ -276,12 +272,9 @@ if __name__ == "__main__":
# Example: partition an index # Example: partition an index
try: try:
disk_graph_path, partition_bin_path = partition_graph( disk_graph_path, partition_bin_path = partition_graph(
"/path/to/your/index_prefix", "/path/to/your/index_prefix", gp_times=10, lock_nums=10, cut=100
gp_times=10,
lock_nums=10,
cut=100
) )
print(f"Partitioning completed successfully!") print("Partitioning completed successfully!")
print(f"Disk graph index: {disk_graph_path}") print(f"Disk graph index: {disk_graph_path}")
print(f"Partition binary: {partition_bin_path}") print(f"Partition binary: {partition_bin_path}")
except Exception as e: except Exception as e:

View File

@@ -9,16 +9,12 @@ that directly calls the existing executables.
import os import os
import subprocess import subprocess
import tempfile import tempfile
import shutil
from pathlib import Path from pathlib import Path
from typing import Optional, Tuple
def partition_graph_simple( def partition_graph_simple(
index_prefix_path: str, index_prefix_path: str, output_dir: str | None = None, **kwargs
output_dir: Optional[str] = None, ) -> tuple[str, str]:
**kwargs
) -> Tuple[str, str]:
""" """
Simple function to partition a graph index. Simple function to partition a graph index.
@@ -38,7 +34,7 @@ def partition_graph_simple(
"scale_factor": 1, "scale_factor": 1,
"data_type": "float", "data_type": "float",
"thread_nums": 10, "thread_nums": 10,
**kwargs **kwargs,
} }
# Determine output directory # Determine output directory
@@ -67,38 +63,34 @@ def partition_graph_simple(
# Set up paths for temporary files # Set up paths for temporary files
graph_path = temp_data_dir / "starling" / "_M_R_L_B" / "GRAPH" graph_path = temp_data_dir / "starling" / "_M_R_L_B" / "GRAPH"
graph_gp_path = graph_path / f"GP_TIMES_{params['gp_times']}_LOCK_{params['lock_nums']}_GP_USE_FREQ0_CUT{params['cut']}_SCALE{params['scale_factor']}" graph_gp_path = (
graph_path
/ f"GP_TIMES_{params['gp_times']}_LOCK_{params['lock_nums']}_GP_USE_FREQ0_CUT{params['cut']}_SCALE{params['scale_factor']}"
)
graph_gp_path.mkdir(parents=True, exist_ok=True) graph_gp_path.mkdir(parents=True, exist_ok=True)
# Run the build script with our parameters # Run the build script with our parameters
cmd = [ cmd = [str(graph_partition_dir / "build.sh"), "release", "split_graph", index_prefix_path]
str(graph_partition_dir / "build.sh"),
"release",
"split_graph",
index_prefix_path
]
# Set environment variables for parameters # Set environment variables for parameters
env = os.environ.copy() env = os.environ.copy()
env.update({ env.update(
"GP_TIMES": str(params["gp_times"]), {
"GP_LOCK_NUMS": str(params["lock_nums"]), "GP_TIMES": str(params["gp_times"]),
"GP_CUT": str(params["cut"]), "GP_LOCK_NUMS": str(params["lock_nums"]),
"GP_SCALE_F": str(params["scale_factor"]), "GP_CUT": str(params["cut"]),
"DATA_TYPE": params["data_type"], "GP_SCALE_F": str(params["scale_factor"]),
"GP_T": str(params["thread_nums"]), "DATA_TYPE": params["data_type"],
}) "GP_T": str(params["thread_nums"]),
}
)
print(f"Running graph partition with command: {' '.join(cmd)}") print(f"Running graph partition with command: {' '.join(cmd)}")
print(f"Working directory: {graph_partition_dir}") print(f"Working directory: {graph_partition_dir}")
# Run the command # Run the command
result = subprocess.run( result = subprocess.run(
cmd, cmd, env=env, capture_output=True, text=True, cwd=graph_partition_dir
env=env,
capture_output=True,
text=True,
cwd=graph_partition_dir
) )
if result.returncode != 0: if result.returncode != 0:
@@ -121,7 +113,7 @@ def partition_graph_simple(
if not partition_bin_path.exists(): if not partition_bin_path.exists():
raise RuntimeError(f"Expected output file not found: {partition_bin_path}") raise RuntimeError(f"Expected output file not found: {partition_bin_path}")
print(f"✅ Partitioning completed successfully!") print("✅ Partitioning completed successfully!")
print(f" Disk graph index: {disk_graph_path}") print(f" Disk graph index: {disk_graph_path}")
print(f" Partition binary: {partition_bin_path}") print(f" Partition binary: {partition_bin_path}")
@@ -135,9 +127,9 @@ if __name__ == "__main__":
"/Users/yichuan/Desktop/release2/leann/diskannbuild/test_doc_files", "/Users/yichuan/Desktop/release2/leann/diskannbuild/test_doc_files",
gp_times=5, gp_times=5,
lock_nums=5, lock_nums=5,
cut=50 cut=50,
) )
print(f"Success! Output files:") print("Success! Output files:")
print(f" - {disk_graph_path}") print(f" - {disk_graph_path}")
print(f" - {partition_bin_path}") print(f" - {partition_bin_path}")
except Exception as e: except Exception as e: