Initial commit
This commit is contained in:
81
research/paper_plot/cache_degree_data.py
Normal file
81
research/paper_plot/cache_degree_data.py
Normal file
@@ -0,0 +1,81 @@
|
||||
import numpy as np
|
||||
import os
|
||||
|
||||
# --- Input configuration (per the original note, mirrors the plotting script) ---

# Name of the per-index text file that holds the degree distribution.
STATS_FILE_NAME = "degree_distribution.txt"

# Index directories to read from; entry i is paired with BIG_GRAPH_LABELS[i].
BIG_GRAPH_PATHS = [
    "/opt/dlami/nvme/scaling_out/indices/rpj_wiki/facebook/contriever-msmarco/hnsw/",
    "/opt/dlami/nvme/scaling_out/embeddings/facebook/contriever-msmarco/rpj_wiki/1-shards/indices/99_4_degree_based_hnsw_IP_M32_efC256/",
    "/opt/dlami/nvme/scaling_out/embeddings/facebook/contriever-msmarco/rpj_wiki/1-shards/indices/d9_hnsw_IP_M8_efC128/",
    "/opt/dlami/nvme/scaling_out/embeddings/facebook/contriever-msmarco/rpj_wiki/1-shards/indices/half_edges_IP_M32_efC128/",
]

# Method names, one per path above; each is used as an array key in the cache file.
BIG_GRAPH_LABELS = [
    "HNSW-Base",
    "DegreeGuide",
    "HNSW-D9",
    "RandCut",
]

# NOTE: average degrees are static, so they are not cached here; only the
# per-graph degree arrays are. Values previously noted for reference
# (same order as the labels): 18, 9, 9, 9.

# --- Output (cache file) configuration ---

# .npz is used because it can hold several named arrays in one file.
CACHE_FILE_NAME = "big_graph_degree_data.npz"

# Relative path: resolved against the current working directory at run time.
DATA_CACHE_DIR = "./paper_plot/data/"
|
||||
|
||||
def create_degree_data_cache(
    graph_paths=None,
    graph_labels=None,
    *,
    stats_file_name=None,
    cache_dir=None,
    cache_file_name=None,
) -> None:
    """
    Reads degree distribution data from text files and saves it into a
    compressed NumPy (.npz) cache file, one array per graph label.

    Every argument is optional. When an argument is None the corresponding
    module-level constant is used, so a call without arguments processes the
    configured graphs exactly as before.

    Args:
        graph_paths: Sequence of directories that each contain a stats file.
            Defaults to BIG_GRAPH_PATHS.
        graph_labels: Sequence of labels, paired one-to-one with
            ``graph_paths``; each label becomes an array key in the cache.
            Defaults to BIG_GRAPH_LABELS.
        stats_file_name: File name looked up inside every graph directory.
            Defaults to STATS_FILE_NAME.
        cache_dir: Directory the cache file is written to (created if
            needed). Defaults to DATA_CACHE_DIR.
        cache_file_name: Name of the .npz file. Defaults to CACHE_FILE_NAME.

    Raises:
        ValueError: If ``graph_paths`` and ``graph_labels`` differ in length.

    A graph whose stats file is missing, unreadable or empty is reported and
    stored as an empty integer array so that its key still exists in the
    cache. If no graph yields any data at all, no cache file is written.
    """
    # Resolve defaults at call time so the module constants stay the single
    # source of truth for the no-argument call.
    paths = list(BIG_GRAPH_PATHS if graph_paths is None else graph_paths)
    labels = list(BIG_GRAPH_LABELS if graph_labels is None else graph_labels)
    stats_name = STATS_FILE_NAME if stats_file_name is None else stats_file_name
    out_dir = DATA_CACHE_DIR if cache_dir is None else cache_dir
    out_name = CACHE_FILE_NAME if cache_file_name is None else cache_file_name

    # A length mismatch would silently mislabel or drop graphs; fail loudly.
    if len(paths) != len(labels):
        raise ValueError(
            f"Got {len(paths)} graph paths but {len(labels)} labels; "
            "they must be paired one-to-one."
        )

    cache_file_path = os.path.join(out_dir, out_name)
    cached_data = {}
    print(f"Starting data caching process for {len(paths)} graph types...")

    for base_path, method_label in zip(paths, labels):
        degree_file_path = os.path.join(base_path, stats_name)
        print(f"Processing: {method_label} from {degree_file_path}")

        try:
            # np.loadtxt returns a 0-d array for a file with a single value;
            # atleast_1d keeps such a file usable as a one-element array.
            degrees = np.atleast_1d(np.loadtxt(degree_file_path, dtype=int))
        except FileNotFoundError:
            print(f" [ERROR] Degree file not found: {degree_file_path}. Skipping {method_label}.")
            # An empty array (rather than None) keeps the .npz loadable
            # without special handling on the reading side.
            degrees = np.array([], dtype=int)
        except Exception as e:
            # Best-effort: one unreadable file must not abort the other graphs.
            print(f" [ERROR] An error occurred loading {degree_file_path} for {method_label}: {e}")
            degrees = np.array([], dtype=int)
        else:
            if degrees.size == 0:
                print(f" [WARN] Degree file is empty: {degree_file_path}. Storing as empty array for {method_label}.")
                degrees = np.array([], dtype=int)
            else:
                print(f" [INFO] Loaded {degrees.size} degrees for {method_label}. Max degree: {np.max(degrees)}")

        cached_data[method_label] = degrees

    # Every label gets an entry even on failure, so "nothing loaded" has to
    # be detected from the array sizes, not from the dict being empty.
    if not any(arr.size for arr in cached_data.values()):
        print("[ERROR] No data was successfully processed or loaded. Cache file will not be created.")
        return

    # Create the output directory only once there is something to store.
    os.makedirs(out_dir, exist_ok=True)

    try:
        # Save all collected degree arrays into a single compressed .npz file.
        np.savez_compressed(cache_file_path, **cached_data)
        print(f"\n[SUCCESS] Degree distribution data successfully cached to: {os.path.abspath(cache_file_path)}")
        print("Cached arrays (keys):", list(cached_data.keys()))
    except Exception as e:
        print(f"\n[ERROR] Failed to save data to cache file {cache_file_path}: {e}")
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: announce the run, build the cache with the
    # module-level configuration, then announce completion.
    start_banner = "--- Degree Distribution Data Caching Script ---"
    end_banner = "--- Caching script finished. ---"

    print(start_banner)
    create_degree_data_cache()
    print(end_banner)
|
||||
Reference in New Issue
Block a user