LEANN/research/paper_plot/cache_degree_data.py

import numpy as np
import os

# --- Configuration for Data Paths and Labels (Mirrors plotting script for consistency) ---
BIG_GRAPH_PATHS = [
    "/opt/dlami/nvme/scaling_out/indices/rpj_wiki/facebook/contriever-msmarco/hnsw/",
    "/opt/dlami/nvme/scaling_out/embeddings/facebook/contriever-msmarco/rpj_wiki/1-shards/indices/99_4_degree_based_hnsw_IP_M32_efC256/",
    "/opt/dlami/nvme/scaling_out/embeddings/facebook/contriever-msmarco/rpj_wiki/1-shards/indices/d9_hnsw_IP_M8_efC128/",
    "/opt/dlami/nvme/scaling_out/embeddings/facebook/contriever-msmarco/rpj_wiki/1-shards/indices/half_edges_IP_M32_efC128/"
]
STATS_FILE_NAME = "degree_distribution.txt"
BIG_GRAPH_LABELS = [  # These will be used as keys in the cached file
    "HNSW-Base",
    "DegreeGuide",
    "HNSW-D9",
    "RandCut",
]
# Average degrees are static and can be directly used in the plotting script or also cached.
# For simplicity here, we'll focus on caching the dynamic degree arrays.
# BIG_GRAPH_AVG_DEG = [18, 9, 9, 9]

# --- Cache File Configuration ---
DATA_CACHE_DIR = "./paper_plot/data/"
CACHE_FILE_NAME = "big_graph_degree_data.npz" # Using .npz for multiple arrays

def create_degree_data_cache():
    """
    Reads degree distribution data from specified text files and saves it
    into a compressed NumPy (.npz) cache file.
    """
    os.makedirs(DATA_CACHE_DIR, exist_ok=True)
    cache_file_path = os.path.join(DATA_CACHE_DIR, CACHE_FILE_NAME)

    cached_data = {}
    print(f"Starting data caching process for {len(BIG_GRAPH_PATHS)} graph types...")

    for i, base_path in enumerate(BIG_GRAPH_PATHS):
        method_label = BIG_GRAPH_LABELS[i]
        degree_file_path = os.path.join(base_path, STATS_FILE_NAME)

        print(f"Processing: {method_label} from {degree_file_path}")

        try:
            # Load degrees as integers
            degrees = np.loadtxt(degree_file_path, dtype=int)

            if degrees.size == 0:
                print(f"  [WARN] Degree file is empty: {degree_file_path}. Storing as empty array for {method_label}.")
                # Store an empty array or handle as needed. For npz, an empty array is fine.
                cached_data[method_label] = np.array([], dtype=int)
            else:
                # Store the loaded degrees array with the method label as the key
                cached_data[method_label] = degrees
                print(f"  [INFO] Loaded {len(degrees)} degrees for {method_label}. Max degree: {np.max(degrees) if degrees.size > 0 else 'N/A'}")

        except FileNotFoundError:
            print(f"  [ERROR] Degree file not found: {degree_file_path}. Skipping {method_label}.")
            # Optionally store a placeholder or skip. For robustness, store None or an empty array.
            # Storing None might require special handling when loading. Empty array is safer for np.load.
            cached_data[method_label] = np.array([], dtype=int) # Store empty array if file not found
        except Exception as e:
            print(f"  [ERROR] An error occurred loading {degree_file_path} for {method_label}: {e}")
            cached_data[method_label] = np.array([], dtype=int) # Store empty array on other errors

    if not cached_data:
        print("[ERROR] No data was successfully processed or loaded. Cache file will not be created.")
        return

    try:
        # Save all collected degree arrays into a single .npz file.
        # Using savez_compressed for potentially smaller file size.
        np.savez_compressed(cache_file_path, **cached_data)
        print(f"\n[SUCCESS] Degree distribution data successfully cached to: {os.path.abspath(cache_file_path)}")
        print("Cached arrays (keys):", list(cached_data.keys()))
    except Exception as e:
        print(f"\n[ERROR] Failed to save data to cache file {cache_file_path}: {e}")

if __name__ == "__main__":
    print("--- Degree Distribution Data Caching Script ---")
    create_degree_data_cache()
    print("--- Caching script finished. ---")