#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Motto: Were It to Benefit My Country, I Would Lay Down My Life!
# \file: /hnsw_degree_visit_plot_binned_academic.py
# \brief: Generates a binned bar plot of HNSW node average per-query visit probability
#         per degree bin, styled for academic publications, with caching.
# Author: raphael hao (Original script by user, styling and caching adapted by Gemini)

# %%
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
from collections import Counter
import os  # For robust filepath manipulation
import math  # For calculating scaling factor
import pickle  # For caching data

# %%
# --- Matplotlib parameters for academic paper style (from reference) ---
plt.rcParams["font.family"] = "Helvetica"
plt.rcParams["ytick.direction"] = "in"
plt.rcParams["hatch.linewidth"] = 1.5
plt.rcParams["font.weight"] = "bold"
plt.rcParams["axes.labelweight"] = "bold"
plt.rcParams["text.usetex"] = True  # Use LaTeX for text rendering (if available)

# --- Define styles from reference ---
edgecolors_ref = ["dimgrey", "#63B8B6", "tomato", "silver", "slategray"]

# %%
# --- File Paths ---
degree_file = '/opt/dlami/nvme/scaling_out/indices/rpj_wiki/facebook/contriever-msmarco/hnsw/degree_distribution.txt'
visit_log_file = './re.log'
output_image_file = './paper_plot/figures/hnsw_visit_count_per_degree_corrected.pdf'
# --- CACHE FILE PATH: Keep this consistent ---
CACHE_FILE_PATH = './binned_plot_data_cache.pkl'

# --- Configuration ---
# Set to True to bypass the cache and force recomputation.
# Otherwise, delete CACHE_FILE_PATH manually to force recomputation.
FORCE_RECOMPUTE = False
NUMBER_OF_QUERIES = 1000.0  # Number of queries the visit counts are based on
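# Cache layout note: the pickle written below stores a dict of the form
# {'data': <binned DataFrame>, 'bin_size': <int>}; delete the file (or set
# FORCE_RECOMPUTE = True) if the expected layout changes.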
|
|
# Create directory for figures if it doesn't exist
output_dir = os.path.dirname(output_image_file)
if output_dir and not os.path.exists(output_dir):
    os.makedirs(output_dir)
    print(f"Created directory: {output_dir}")
|
|
# %%
# --- Attempt to load data from cache or compute ---
df_plot_data = None
bin_size_for_plot = None  # Will hold the bin_size associated with df_plot_data
|
|
if not FORCE_RECOMPUTE and os.path.exists(CACHE_FILE_PATH):
    try:
        with open(CACHE_FILE_PATH, 'rb') as f:
            cache_content = pickle.load(f)
        df_plot_data = cache_content['data']
        bin_size_for_plot = cache_content['bin_size']
        # Basic validation of the cached data.
        # Expecting 'average_visit_count_per_node_in_bin' (raw average over NUMBER_OF_QUERIES).
        if not isinstance(df_plot_data, pd.DataFrame) or \
                'degree_bin_label' not in df_plot_data.columns or \
                'average_visit_count_per_node_in_bin' not in df_plot_data.columns or \
                not isinstance(bin_size_for_plot, int):
            print("Cached data is not in the expected format or is missing 'average_visit_count_per_node_in_bin'. Recomputing.")
            df_plot_data = None  # Invalidate to trigger recomputation
        else:
            print(f"Successfully loaded binned data from cache: {CACHE_FILE_PATH}")

        # --- Adjust the label loaded from cache for display purposes ---
        # This adjustment only happens when data is loaded from cache and meets specific conditions.
        # Assumption: if the cached bin_size_for_plot is 5, the label "0-4" actually
        # represents nodes with degree 1-4, since the graph contains no 0-degree nodes.
        if df_plot_data is not None and 'degree_bin_label' in df_plot_data.columns and bin_size_for_plot == 5:
            # Check whether the "0-4" label exists
            if '0-4' in df_plot_data['degree_bin_label'].values:
                # Use .loc to ensure the modification is applied to the original DataFrame
                df_plot_data.loc[df_plot_data['degree_bin_label'] == '0-4', 'degree_bin_label'] = '1-4'
                print("Modified degree_bin_label from '0-4' to '1-4' for display purposes.")
    except Exception as e:
        print(f"Error loading from cache: {e}. Recomputing.")
        df_plot_data = None  # Invalidate to trigger recomputation
|
|
|
if df_plot_data is None:
    print("Cache not found, invalid, or recompute forced. Computing data from scratch...")
    # --- 1. Read Degree Distribution File ---
    degrees_data = []
    try:
        with open(degree_file, 'r') as f:
            for i, line in enumerate(f):
                line_stripped = line.strip()
                if line_stripped:
                    degrees_data.append({'node_id': i, 'degree': int(line_stripped)})
    except FileNotFoundError:
        print(f"Error: Degree file '{degree_file}' not found. Using dummy data for degrees.")
        degrees_data = [{'node_id': i, 'degree': (i % 20) + 1} for i in range(200)]
        degrees_data.extend([{'node_id': 200 + i, 'degree': i} for i in range(58, 67)])  # For the 60-64 bin
        degrees_data.extend([{'node_id': 300 + i, 'degree': (i % 5) + 1} for i in range(10)])  # Low degrees
        degrees_data.extend([{'node_id': 400 + i, 'degree': 80 + (i % 5)} for i in range(10)])  # High degrees

    if not degrees_data:
        print("Critical Error: No data loaded or generated for degrees. Exiting.")
        raise SystemExit(1)
    df_degrees = pd.DataFrame(degrees_data)
    print(f"Successfully loaded/generated {len(df_degrees)} degree entries.")
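    # Note: the degree file is assumed to hold one integer degree per line, with
    # the zero-based line index serving as the node id (as the loop above implies).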
|
|
    # --- 2. Read Visit Log File and Count Frequencies ---
    visit_counts = Counter()
    # Matches lines such as "Visited node: 12345"; the optional "i" presumably
    # also tolerates the misspelling "Visted" in some logs.
    node_id_pattern = re.compile(r"Vis(?:i)?ted node: (\d+)")
    try:
        with open(visit_log_file, 'r') as f_log:
            for line_num, line in enumerate(f_log, 1):
                match = node_id_pattern.search(line)
                if match:
                    try:
                        node_id = int(match.group(1))
                        visit_counts[node_id] += 1  # Increment visit count for the node
                    except ValueError:
                        print(f"Warning: Non-integer node_id in log '{visit_log_file}' line {line_num}: {line.strip()}")
    except FileNotFoundError:
        print(f"Warning: Visit log file '{visit_log_file}' not found. Using dummy visit counts.")
        if not df_degrees.empty:
            for node_id_val in df_degrees['node_id'].sample(frac=0.9, random_state=1234):  # Seed for reproducibility
                degree_val = df_degrees[df_degrees['node_id'] == node_id_val]['degree'].iloc[0]
                # Generate visit counts to test different probability magnitudes
                if node_id_val % 23 == 0:  # Very low probability
                    lambda_val = 0.0005 * (100 / (max(1, degree_val) + 1))  # avg visits over 1k queries
                elif node_id_val % 11 == 0:  # Low probability
                    lambda_val = 0.05 * (100 / (max(1, degree_val) + 1))
                elif node_id_val % 5 == 0:  # Moderate probability
                    lambda_val = 2.5 * (100 / (max(1, degree_val) + 1))
                else:  # Higher probability (but still usually < 1000 visits for a single node)
                    lambda_val = 50 * (100 / (max(1, degree_val) + 1))
                # Poisson draws are already non-negative, so no clamping is needed.
                visit_counts[node_id_val] = np.random.poisson(lambda_val)
|
|
    if not visit_counts:
        print("Warning: No visit data parsed/generated. Plot may show zero visits.")
        df_visits = pd.DataFrame(columns=['node_id', 'visit_count'])
    else:
        df_visits_list = [{'node_id': nid, 'visit_count': count} for nid, count in visit_counts.items()]
        df_visits = pd.DataFrame(df_visits_list)
        print(f"Parsed/generated {len(df_visits)} unique visited nodes, totaling {sum(visit_counts.values())} visits (over {NUMBER_OF_QUERIES:.0f} queries).")
|
|
    # --- 3. Merge Degree Data with Visit Data ---
    df_merged = pd.merge(df_degrees, df_visits, on='node_id', how='left')
    df_merged['visit_count'] = df_merged['visit_count'].fillna(0).astype(float)  # visit_count is the total over NUMBER_OF_QUERIES
    print(f"Merged data contains {len(df_merged)} entries.")
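    # Nodes that never appear in the visit log are absent from df_visits, so the
    # left join leaves NaN for them; fillna(0) records those nodes as unvisited.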
|
|
    # --- 4. Binning Degrees and Calculating Average Visit Count per Node in Bin (over NUMBER_OF_QUERIES) ---
    current_bin_size = 5
    bin_size_for_plot = current_bin_size

    if not df_degrees.empty:
        print(f"\nBinning degrees into groups of {current_bin_size} for average visit count calculation...")

        df_merged_with_bins = df_merged.copy()
        df_merged_with_bins['degree_bin_start'] = (df_merged_with_bins['degree'] // current_bin_size) * current_bin_size

        df_binned_analysis = df_merged_with_bins.groupby('degree_bin_start').agg(
            total_visit_count_in_bin=('visit_count', 'sum'),
            node_count_in_bin=('node_id', 'nunique')
        ).reset_index()

        # This is the average number of times a node in this bin was visited over NUMBER_OF_QUERIES queries.
        # This value is what gets cached.
        df_binned_analysis['average_visit_count_per_node_in_bin'] = 0.0
        df_binned_analysis.loc[df_binned_analysis['node_count_in_bin'] > 0, 'average_visit_count_per_node_in_bin'] = \
            df_binned_analysis['total_visit_count_in_bin'] / df_binned_analysis['node_count_in_bin']

        df_binned_analysis['degree_bin_label'] = df_binned_analysis['degree_bin_start'].astype(str) + '-' + \
            (df_binned_analysis['degree_bin_start'] + current_bin_size - 1).astype(str)
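        # Example: with current_bin_size = 5, a node of degree 13 gets
        # degree_bin_start = (13 // 5) * 5 = 10 and falls in the "10-14" bin.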
|
|
        bin_to_drop_label = '60-64'
        original_length = len(df_binned_analysis)
        df_plot_data_intermediate = df_binned_analysis[df_binned_analysis['degree_bin_label'] != bin_to_drop_label].copy()
        if len(df_plot_data_intermediate) < original_length:
            print(f"\nManually dropped the bin: '{bin_to_drop_label}'")
        else:
            print(f"\nNote: Bin '{bin_to_drop_label}' not found for dropping, or it was already removed.")

        df_plot_data = df_plot_data_intermediate

        print(f"\nBinned data (average visit count per node in bin over {NUMBER_OF_QUERIES:.0f} queries) prepared for plotting:")
        print(df_plot_data[['degree_bin_label', 'average_visit_count_per_node_in_bin']].head())
|
|
        if df_plot_data is not None and not df_plot_data.empty:
            try:
                with open(CACHE_FILE_PATH, 'wb') as f:
                    pickle.dump({'data': df_plot_data, 'bin_size': bin_size_for_plot}, f)
                print(f"Saved computed binned data to cache: {CACHE_FILE_PATH}")
            except Exception as e:
                print(f"Error saving data to cache: {e}")
        else:
            print("Computed data for binned plot is empty, not saving to cache.")
    else:
        print("Degree data (df_degrees) is empty. Cannot perform binning.")
        df_plot_data = pd.DataFrame()
        bin_size_for_plot = current_bin_size
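# A minimal sketch for reusing the cached result from another script, assuming
# the {'data': DataFrame, 'bin_size': int} layout saved above:
#     with open(CACHE_FILE_PATH, 'rb') as f:
#         cached = pickle.load(f)
#     df_binned, bin_size = cached['data'], cached['bin_size']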
|
|
# %%
# --- 5. Plotting (Binned Bar Chart - Academic Style) ---

if df_plot_data is not None and not df_plot_data.empty and 'average_visit_count_per_node_in_bin' in df_plot_data.columns:
    base_name, ext = os.path.splitext(output_image_file)
    # --- OUTPUT PDF FILE NAME: Keep this consistent ---
    binned_output_image_file = base_name + ext
|
|
    fig, ax = plt.subplots(figsize=(6, 2.5))  # Adjusted figure size

    df_plot_data_plotting = df_plot_data.copy()
    # Calculate per-query probability: (avg visits over N queries) / N
    df_plot_data_plotting['per_query_visit_probability'] = \
        df_plot_data_plotting['average_visit_count_per_node_in_bin'] / NUMBER_OF_QUERIES

    max_probability = df_plot_data_plotting['per_query_visit_probability'].max()
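    # Example: an average of 3.5 visits per node over NUMBER_OF_QUERIES = 1000
    # queries corresponds to a per-query visit probability of 3.5 / 1000 = 0.0035.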
|
|
    y_axis_values_to_plot = df_plot_data_plotting['per_query_visit_probability']
    y_axis_label = r"Per-Query Node Visit Probability in Bin"  # Base label

    apply_scaling_to_label_and_values = False  # Initialize flag
    exponent_for_label_display = 0  # Initialize exponent
|
|
    if pd.notna(max_probability) and max_probability > 0:
        potential_exponent = math.floor(math.log10(max_probability))

        # Scale only when the magnitude is very small (<= 1e-4) or >= 1;
        # exponents in [-3, -1] are plotted directly without label scaling.
        if potential_exponent <= -4 or potential_exponent >= 0:
            apply_scaling_to_label_and_values = True
            exponent_for_label_display = potential_exponent

        if apply_scaling_to_label_and_values:
            y_axis_label = rf"Visit Probability ($\times 10^{{{exponent_for_label_display}}}$)"
            y_axis_values_to_plot = df_plot_data_plotting['per_query_visit_probability'] / (10 ** exponent_for_label_display)
            print(f"Plotting with max per-query probability: {max_probability:.2e}, exponent for label: {exponent_for_label_display}. Y-axis values scaled for plot.")
        else:
            print(f"Plotting with max per-query probability: {max_probability:.2e}. Plotting direct probabilities without label scaling (exponent {potential_exponent} is within the no-scale range [-3, -1]).")

    elif pd.notna(max_probability) and max_probability == 0:
        print("Max per-query probability is 0. Plotting direct probabilities (all zeros).")
    else:
        print(f"Max per-query probability is NaN or invalid ({max_probability}). Plotting direct probabilities without scaling if possible.")
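    # Example: max_probability = 3.2e-5 gives potential_exponent = floor(log10(3.2e-5)) = -5,
    # so the label becomes "Visit Probability ($\times 10^{-5}$)" and 3.2e-5 is plotted as 3.2.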
|
|
    ax.bar(
        df_plot_data_plotting['degree_bin_label'],
        y_axis_values_to_plot,
        color='white',
        edgecolor=edgecolors_ref[0],
        linewidth=1.5,
        width=0.8
    )

    ax.set_xlabel('Node Degree', fontsize=10.5, labelpad=6)
    # labelpad moves the y-axis label further to the left
    ax.set_ylabel(y_axis_label, fontsize=10.5, labelpad=10)

    # Plain numeric ticks; the probability scale is carried in the y-axis label.
    # (A "%" suffix here would contradict the probability/exponent labelling.)
    ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda x, pos: f"{x:g}"))
|
|
    num_bins = len(df_plot_data_plotting)
    if num_bins > 12:
        # Re-setting the ticks before relabelling avoids a matplotlib FixedLocator warning
        ax.set_xticks(ax.get_xticks())
        ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right", fontsize=9)
    elif num_bins > 8:
        ax.tick_params(axis='x', labelsize=9)
    else:
        ax.tick_params(axis='x', labelsize=10)

    ax.tick_params(axis='y', labelsize=10)
|
|
    padding_factor = 0.05
    current_max_y_on_axis = y_axis_values_to_plot.max()

    upper_y_limit = 0.1  # Default small upper limit
    if pd.notna(current_max_y_on_axis):
        if current_max_y_on_axis > 0:
            # Adjust the minimum visible range based on whether scaling was applied and on the exponent
            min_meaningful_limit = 0.01
            if apply_scaling_to_label_and_values and exponent_for_label_display >= 0:
                # Axis numbers shrink under positive-exponent scaling: e.g. 2500 (2.5 x 10^3)
                # is plotted as 2.5, so a floor of 0.1 is adequate.
                min_meaningful_limit = 0.1
            elif not apply_scaling_to_label_and_values and pd.notna(max_probability) and max_probability >= 1:
                # Direct large values: e.g. a max of 2.5 plots as 2.5 and needs a larger floor.
                min_meaningful_limit = 1

            upper_y_limit = max(min_meaningful_limit, current_max_y_on_axis * (1 + padding_factor))

        else:  # current_max_y_on_axis is 0
            upper_y_limit = 0.1
        ax.set_ylim(0, upper_y_limit)
    else:
        ax.set_ylim(0, 1.0)  # Default for empty or NaN data
|
|
    plt.tight_layout()
    plt.savefig(binned_output_image_file, bbox_inches="tight", dpi=300)
    print(f"Binned bar chart saved to {binned_output_image_file}")
    plt.show()
    plt.close(fig)
else:
    if df_plot_data is None:
        print("Data for plotting (df_plot_data) is None. Skipping plot generation.")
    elif df_plot_data.empty:
        print("Data for plotting (df_plot_data) is empty. Skipping plot generation.")
    elif 'average_visit_count_per_node_in_bin' not in df_plot_data.columns:
        print("Essential column 'average_visit_count_per_node_in_bin' is missing from df_plot_data. Skipping plot generation.")
|
|
# %%
print("Script finished.")