merge
This commit is contained in:
@@ -4,9 +4,4 @@ from . import graph_partition
|
||||
# Export main classes and functions
|
||||
from .graph_partition import GraphPartitioner, partition_graph
|
||||
|
||||
__all__ = [
|
||||
"diskann_backend",
|
||||
"graph_partition",
|
||||
"GraphPartitioner",
|
||||
"partition_graph"
|
||||
]
|
||||
__all__ = ["diskann_backend", "graph_partition", "GraphPartitioner", "partition_graph"]
|
||||
|
||||
@@ -8,11 +8,10 @@ performance.
|
||||
"""
|
||||
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
import tempfile
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
from typing import Optional, Tuple
|
||||
|
||||
|
||||
class GraphPartitioner:
|
||||
@@ -51,14 +50,16 @@ class GraphPartitioner:
|
||||
try:
|
||||
self._get_executable_path("partitioner")
|
||||
self._get_executable_path("index_relayout")
|
||||
except FileNotFoundError as e:
|
||||
except FileNotFoundError:
|
||||
# Try to build the executables automatically
|
||||
print("Executables not found, attempting to build them...")
|
||||
self._build_executables()
|
||||
|
||||
def _build_executables(self):
|
||||
"""Build the required executables."""
|
||||
graph_partition_dir = Path(__file__).parent.parent / "third_party" / "DiskANN" / "graph_partition"
|
||||
graph_partition_dir = (
|
||||
Path(__file__).parent.parent / "third_party" / "DiskANN" / "graph_partition"
|
||||
)
|
||||
original_dir = os.getcwd()
|
||||
|
||||
try:
|
||||
@@ -70,12 +71,7 @@ class GraphPartitioner:
|
||||
|
||||
# Run the build script
|
||||
cmd = ["./build.sh", self.build_type, "split_graph", "/tmp/dummy"]
|
||||
result = subprocess.run(
|
||||
cmd,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
cwd=graph_partition_dir
|
||||
)
|
||||
subprocess.run(cmd, capture_output=True, text=True, cwd=graph_partition_dir)
|
||||
|
||||
# Check if executables were created
|
||||
partitioner_path = self._get_executable_path("partitioner")
|
||||
@@ -90,11 +86,8 @@ class GraphPartitioner:
|
||||
os.chdir(original_dir)
|
||||
|
||||
def partition_graph(
|
||||
self,
|
||||
index_prefix_path: str,
|
||||
output_dir: Optional[str] = None,
|
||||
**kwargs
|
||||
) -> Tuple[str, str]:
|
||||
self, index_prefix_path: str, output_dir: str | None = None, **kwargs
|
||||
) -> tuple[str, str]:
|
||||
"""
|
||||
Partition a disk-based index for improved performance.
|
||||
|
||||
@@ -123,7 +116,7 @@ class GraphPartitioner:
|
||||
"scale_factor": 1,
|
||||
"data_type": "float",
|
||||
"thread_nums": 10,
|
||||
**kwargs
|
||||
**kwargs,
|
||||
}
|
||||
|
||||
# Determine output directory
|
||||
@@ -137,7 +130,9 @@ class GraphPartitioner:
|
||||
# Create temporary directory for processing
|
||||
with tempfile.TemporaryDirectory() as temp_dir:
|
||||
# Change to the graph_partition directory for temporary files
|
||||
graph_partition_dir = Path(__file__).parent.parent / "third_party" / "DiskANN" / "graph_partition"
|
||||
graph_partition_dir = (
|
||||
Path(__file__).parent.parent / "third_party" / "DiskANN" / "graph_partition"
|
||||
)
|
||||
original_dir = os.getcwd()
|
||||
|
||||
try:
|
||||
@@ -149,7 +144,10 @@ class GraphPartitioner:
|
||||
|
||||
# Set up paths for temporary files
|
||||
graph_path = temp_data_dir / "starling" / "_M_R_L_B" / "GRAPH"
|
||||
graph_gp_path = graph_path / f"GP_TIMES_{params['gp_times']}_LOCK_{params['lock_nums']}_GP_USE_FREQ0_CUT{params['cut']}_SCALE{params['scale_factor']}"
|
||||
graph_gp_path = (
|
||||
graph_path
|
||||
/ f"GP_TIMES_{params['gp_times']}_LOCK_{params['lock_nums']}_GP_USE_FREQ0_CUT{params['cut']}_SCALE{params['scale_factor']}"
|
||||
)
|
||||
graph_gp_path.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Find input index file
|
||||
@@ -164,21 +162,25 @@ class GraphPartitioner:
|
||||
gp_file_path = graph_gp_path / "_part.bin"
|
||||
partitioner_cmd = [
|
||||
partitioner_path,
|
||||
"--index_file", old_index_file,
|
||||
"--data_type", params["data_type"],
|
||||
"--gp_file", str(gp_file_path),
|
||||
"-T", str(params["thread_nums"]),
|
||||
"--ldg_times", str(params["gp_times"]),
|
||||
"--scale", str(params["scale_factor"]),
|
||||
"--mode", "1"
|
||||
"--index_file",
|
||||
old_index_file,
|
||||
"--data_type",
|
||||
params["data_type"],
|
||||
"--gp_file",
|
||||
str(gp_file_path),
|
||||
"-T",
|
||||
str(params["thread_nums"]),
|
||||
"--ldg_times",
|
||||
str(params["gp_times"]),
|
||||
"--scale",
|
||||
str(params["scale_factor"]),
|
||||
"--mode",
|
||||
"1",
|
||||
]
|
||||
|
||||
print(f"Running partitioner: {' '.join(partitioner_cmd)}")
|
||||
result = subprocess.run(
|
||||
partitioner_cmd,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
cwd=graph_partition_dir
|
||||
partitioner_cmd, capture_output=True, text=True, cwd=graph_partition_dir
|
||||
)
|
||||
|
||||
if result.returncode != 0:
|
||||
@@ -195,15 +197,12 @@ class GraphPartitioner:
|
||||
old_index_file,
|
||||
str(gp_file_path),
|
||||
params["data_type"],
|
||||
"1"
|
||||
"1",
|
||||
]
|
||||
|
||||
print(f"Running relayout: {' '.join(relayout_cmd)}")
|
||||
result = subprocess.run(
|
||||
relayout_cmd,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
cwd=graph_partition_dir
|
||||
relayout_cmd, capture_output=True, text=True, cwd=graph_partition_dir
|
||||
)
|
||||
|
||||
if result.returncode != 0:
|
||||
@@ -245,16 +244,13 @@ class GraphPartitioner:
|
||||
return {
|
||||
"file_size": stat.st_size,
|
||||
"file_path": partition_bin_path,
|
||||
"modified_time": stat.st_mtime
|
||||
"modified_time": stat.st_mtime,
|
||||
}
|
||||
|
||||
|
||||
def partition_graph(
|
||||
index_prefix_path: str,
|
||||
output_dir: Optional[str] = None,
|
||||
build_type: str = "release",
|
||||
**kwargs
|
||||
) -> Tuple[str, str]:
|
||||
index_prefix_path: str, output_dir: str | None = None, build_type: str = "release", **kwargs
|
||||
) -> tuple[str, str]:
|
||||
"""
|
||||
Convenience function to partition a graph index.
|
||||
|
||||
@@ -276,12 +272,9 @@ if __name__ == "__main__":
|
||||
# Example: partition an index
|
||||
try:
|
||||
disk_graph_path, partition_bin_path = partition_graph(
|
||||
"/path/to/your/index_prefix",
|
||||
gp_times=10,
|
||||
lock_nums=10,
|
||||
cut=100
|
||||
"/path/to/your/index_prefix", gp_times=10, lock_nums=10, cut=100
|
||||
)
|
||||
print(f"Partitioning completed successfully!")
|
||||
print("Partitioning completed successfully!")
|
||||
print(f"Disk graph index: {disk_graph_path}")
|
||||
print(f"Partition binary: {partition_bin_path}")
|
||||
except Exception as e:
|
||||
|
||||
@@ -9,16 +9,12 @@ that directly calls the existing executables.
|
||||
import os
|
||||
import subprocess
|
||||
import tempfile
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
from typing import Optional, Tuple
|
||||
|
||||
|
||||
def partition_graph_simple(
|
||||
index_prefix_path: str,
|
||||
output_dir: Optional[str] = None,
|
||||
**kwargs
|
||||
) -> Tuple[str, str]:
|
||||
index_prefix_path: str, output_dir: str | None = None, **kwargs
|
||||
) -> tuple[str, str]:
|
||||
"""
|
||||
Simple function to partition a graph index.
|
||||
|
||||
@@ -38,7 +34,7 @@ def partition_graph_simple(
|
||||
"scale_factor": 1,
|
||||
"data_type": "float",
|
||||
"thread_nums": 10,
|
||||
**kwargs
|
||||
**kwargs,
|
||||
}
|
||||
|
||||
# Determine output directory
|
||||
@@ -67,38 +63,34 @@ def partition_graph_simple(
|
||||
|
||||
# Set up paths for temporary files
|
||||
graph_path = temp_data_dir / "starling" / "_M_R_L_B" / "GRAPH"
|
||||
graph_gp_path = graph_path / f"GP_TIMES_{params['gp_times']}_LOCK_{params['lock_nums']}_GP_USE_FREQ0_CUT{params['cut']}_SCALE{params['scale_factor']}"
|
||||
graph_gp_path = (
|
||||
graph_path
|
||||
/ f"GP_TIMES_{params['gp_times']}_LOCK_{params['lock_nums']}_GP_USE_FREQ0_CUT{params['cut']}_SCALE{params['scale_factor']}"
|
||||
)
|
||||
graph_gp_path.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Run the build script with our parameters
|
||||
cmd = [
|
||||
str(graph_partition_dir / "build.sh"),
|
||||
"release",
|
||||
"split_graph",
|
||||
index_prefix_path
|
||||
]
|
||||
cmd = [str(graph_partition_dir / "build.sh"), "release", "split_graph", index_prefix_path]
|
||||
|
||||
# Set environment variables for parameters
|
||||
env = os.environ.copy()
|
||||
env.update({
|
||||
"GP_TIMES": str(params["gp_times"]),
|
||||
"GP_LOCK_NUMS": str(params["lock_nums"]),
|
||||
"GP_CUT": str(params["cut"]),
|
||||
"GP_SCALE_F": str(params["scale_factor"]),
|
||||
"DATA_TYPE": params["data_type"],
|
||||
"GP_T": str(params["thread_nums"]),
|
||||
})
|
||||
env.update(
|
||||
{
|
||||
"GP_TIMES": str(params["gp_times"]),
|
||||
"GP_LOCK_NUMS": str(params["lock_nums"]),
|
||||
"GP_CUT": str(params["cut"]),
|
||||
"GP_SCALE_F": str(params["scale_factor"]),
|
||||
"DATA_TYPE": params["data_type"],
|
||||
"GP_T": str(params["thread_nums"]),
|
||||
}
|
||||
)
|
||||
|
||||
print(f"Running graph partition with command: {' '.join(cmd)}")
|
||||
print(f"Working directory: {graph_partition_dir}")
|
||||
|
||||
# Run the command
|
||||
result = subprocess.run(
|
||||
cmd,
|
||||
env=env,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
cwd=graph_partition_dir
|
||||
cmd, env=env, capture_output=True, text=True, cwd=graph_partition_dir
|
||||
)
|
||||
|
||||
if result.returncode != 0:
|
||||
@@ -121,7 +113,7 @@ def partition_graph_simple(
|
||||
if not partition_bin_path.exists():
|
||||
raise RuntimeError(f"Expected output file not found: {partition_bin_path}")
|
||||
|
||||
print(f"✅ Partitioning completed successfully!")
|
||||
print("✅ Partitioning completed successfully!")
|
||||
print(f" Disk graph index: {disk_graph_path}")
|
||||
print(f" Partition binary: {partition_bin_path}")
|
||||
|
||||
@@ -135,9 +127,9 @@ if __name__ == "__main__":
|
||||
"/Users/yichuan/Desktop/release2/leann/diskannbuild/test_doc_files",
|
||||
gp_times=5,
|
||||
lock_nums=5,
|
||||
cut=50
|
||||
cut=50,
|
||||
)
|
||||
print(f"Success! Output files:")
|
||||
print("Success! Output files:")
|
||||
print(f" - {disk_graph_path}")
|
||||
print(f" - {partition_bin_path}")
|
||||
except Exception as e:
|
||||
|
||||
Reference in New Issue
Block a user