Initial commit

This commit is contained in:
yichuan520030910320
2025-06-30 09:05:05 +00:00
commit 46f6cc100b
1231 changed files with 278432 additions and 0 deletions

View File

@@ -0,0 +1,441 @@
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.lines as mlines
import pandas as pd
import numpy as np
from matplotlib.patches import FancyArrowPatch
sns.set_theme(style="ticks", font_scale=1.2)
plt.rcParams['axes.grid'] = True
plt.rcParams['axes.grid.which'] = 'major'
plt.rcParams['grid.linestyle'] = '--'
plt.rcParams['grid.color'] = 'gray'
plt.rcParams['grid.alpha'] = 0.3
plt.rcParams['xtick.minor.visible'] = False
plt.rcParams['ytick.minor.visible'] = False
plt.rcParams["font.family"] = "Helvetica"
plt.rcParams["text.usetex"] = True
plt.rcParams["font.weight"] = "bold"
plt.rcParams["axes.labelweight"] = "bold"
# Generation(LLama 1B) Generation(LLama 3B) Generation(LLama 7B)
# 0.085s 0.217s 0.472s
# llm_inference_time=[0.085, 0.217, 0.472, 0] # Will be replaced by CSV data
# llm_inference_time_for_mac = [0.316, 0.717, 1.468, 0] # Will be replaced by CSV data
def parse_latency_data(csv_path):
df = pd.read_csv(csv_path)
latency_data = {}
llm_gen_times = {} # To store LLM generation times: (dataset, hardware) -> time
for _, row in df.iterrows():
dataset = row['Dataset']
hardware = row['Hardware']
recall_target_str = row['Recall_target'].replace('%', '')
try:
recall_target = float(recall_target_str)
except ValueError:
print(f"Warning: Could not parse recall_target '{row['Recall_target']}'. Skipping row.")
continue
if (dataset, hardware) not in llm_gen_times: # Read once per (dataset, hardware)
llm_time_val = pd.to_numeric(row.get('LLM_Gen_Time_1B'), errors='coerce')
if not pd.isna(llm_time_val):
llm_gen_times[(dataset, hardware)] = llm_time_val
else:
llm_gen_times[(dataset, hardware)] = np.nan # Store NaN if unparsable/missing
cols_to_skip = ['Dataset', 'Hardware', 'Recall_target',
'LLM_Gen_Time_1B', 'LLM_Gen_Time_3B', 'LLM_Gen_Time_7B']
for col in df.columns:
if col not in cols_to_skip:
method_name = col
key = (dataset, hardware, method_name)
if key not in latency_data:
latency_data[key] = []
try:
latency_value = float(row[method_name])
latency_data[key].append((recall_target, latency_value))
except ValueError:
# Handle cases where latency might be non-numeric (e.g., 'N/A' or empty)
print(f"Warning: Could not parse latency for {method_name} at {dataset}/{hardware}/Recall {recall_target} ('{row[method_name]}'). Skipping this point.")
latency_data[key].append((recall_target, np.nan)) # Or skip appending
# Sort by recall for consistent plotting
for key in latency_data:
latency_data[key].sort(key=lambda x: x[0])
return latency_data, llm_gen_times
def parse_storage_data(csv_path):
df = pd.read_csv(csv_path)
storage_data = {}
# Assuming the first column is 'MetricType' (RAM/Storage) and subsequent columns are methods
# And the header row is like: MetricType, Method1, Method2, ...
# Transpose to make methods as rows for easier lookup might be an option,
# but let's try direct parsing.
# Find the row for RAM and Storage
ram_row = df[df.iloc[:, 0] == 'RAM'].iloc[0]
storage_row = df[df.iloc[:, 0] == 'Storage'].iloc[0]
methods = df.columns[1:] # First column is the metric type label
for method in methods:
storage_data[method] = {
'RAM': pd.to_numeric(ram_row[method], errors='coerce'),
'Storage': pd.to_numeric(storage_row[method], errors='coerce')
}
return storage_data
# Load data
latency_csv_path = 'paper_plot/data/main_latency.csv'
storage_csv_path = 'paper_plot/data/ram_storage.csv'
latency_data, llm_generation_times = parse_latency_data(latency_csv_path)
storage_info = parse_storage_data(storage_csv_path)
# --- Determine unique Datasets and Hardware combinations to plot for ---
unique_dataset_hardware_configs = sorted(list(set((d, h) for d, h, m in latency_data.keys())))
if not unique_dataset_hardware_configs:
print("Error: No (Dataset, Hardware) combinations found in latency data. Check CSV paths and content.")
exit()
# --- Define constants for plotting ---
all_method_names = sorted(list(set(m for d,h,m in latency_data.keys())))
if not all_method_names:
# Fallback if latency_data is empty but storage_info might have method names
all_method_names = sorted(list(storage_info.keys()))
if not all_method_names:
print("Error: No method names found in data. Cannot proceed with plotting.")
exit()
method_markers = {
'HNSW': 'o',
'IVF': 'X',
'DiskANN': 's',
'IVF-Disk': 'P',
'IVF-Recompute': '^',
'Our': '*',
'BM25': "v"
# Add more if necessary, or make it dynamic
}
method_display_names = {
'IVF-Recompute': 'IVF-Recompute (EdgeRAG)',
# 其他方法保持原名
}
# Ensure all methods have a marker
default_markers = ['^', 'v', '<', '>', 'H', 'h', '+', 'x', '|', '_']
next_default_marker = 0
for mn in all_method_names:
if mn not in method_markers:
print(f"mn: {mn}")
method_markers[mn] = default_markers[next_default_marker % len(default_markers)]
next_default_marker +=1
recall_levels_present = sorted(list(set(r for key in latency_data for r, l in latency_data[key])))
# Define colors for up to a few common recall levels, add more if needed
base_recall_colors = {
85.0: "#1f77b4", # Blue
90.0: "#ff7f0e", # Orange
95.0: "#2ca02c", # Green
# Add more if other recall % values exist
}
recall_colors = {}
color_palette = sns.color_palette("viridis", n_colors=len(recall_levels_present))
for idx, r_level in enumerate(recall_levels_present):
recall_colors[r_level] = base_recall_colors.get(r_level, color_palette[idx % len(color_palette)])
# --- Determine global x (latency) and y (storage) limits for consistent axes ---
all_latency_values = []
all_storage_values = []
raw_data_size = 76 # Raw data size in GB
for ds_hw_key in unique_dataset_hardware_configs:
current_ds, current_hw = ds_hw_key
for method_name in all_method_names:
# Get storage for this method
disk_storage = storage_info.get(method_name, {}).get('Storage', np.nan)
if not np.isnan(disk_storage):
all_storage_values.append(disk_storage)
# Get latencies for this method under current_ds, current_hw
latency_key = (current_ds, current_hw, method_name)
if latency_key in latency_data:
for recall, latency in latency_data[latency_key]:
if not np.isnan(latency):
all_latency_values.append(latency)
# Add padding to limits
min_lat = min(all_latency_values) if all_latency_values else 0.001
max_lat = max(all_latency_values) if all_latency_values else 1
min_store = min(all_storage_values) if all_storage_values else 0
max_store = max(all_storage_values) if all_storage_values else 1
# Convert storage values to proportion of raw data
min_store_proportion = min_store / raw_data_size if all_storage_values else 0
max_store_proportion = max_store / raw_data_size if all_storage_values else 0.1
# Padding for log scale latency - adjust minimum to be more reasonable
lat_log_min = -1 # Changed from -2 to -1 to set minimum to 10^-1 (0.1s)
lat_log_max = np.log10(max_lat) if max_lat > 0 else 3 # default to 1000 s
lat_padding = (lat_log_max - lat_log_min) * 0.05
global_xlim = [10**(lat_log_min - lat_padding), 10**(lat_log_max + lat_padding)]
if global_xlim[0] <= 0: global_xlim[0] = 0.1 # Changed from 0.01 to 0.1
# Padding for linear scale storage proportion
store_padding = (max_store_proportion - min_store_proportion) * 0.05
global_ylim = [max(0, min_store_proportion - store_padding), max_store_proportion + store_padding]
if global_ylim[0] >= global_ylim[1]: # Avoid inverted or zero range
global_ylim[1] = global_ylim[0] + 0.1
# After loading the data and before plotting, add this code to reorder the datasets
# Find where you define all_datasets (around line 95)
# Original code:
all_datasets = sorted(list(set(ds for ds, _ in unique_dataset_hardware_configs)))
# Replace with this to specify the exact order:
all_datasets_unsorted = list(set(ds for ds, _ in unique_dataset_hardware_configs))
desired_order = ['NQ', 'TriviaQA', 'GPQA','HotpotQA']
all_datasets = [ds for ds in desired_order if ds in all_datasets_unsorted]
# Add any datasets that might be in the data but not in our desired_order list
all_datasets.extend([ds for ds in all_datasets_unsorted if ds not in desired_order])
# Then the rest of your code remains the same:
a10_configs = [(ds, 'A10') for ds in all_datasets if (ds, 'A10') in unique_dataset_hardware_configs]
mac_configs = [(ds, 'MAC') for ds in all_datasets if (ds, 'MAC') in unique_dataset_hardware_configs]
# Create two figures - one for A10 and one for MAC
hardware_configs = [a10_configs, mac_configs]
hardware_names = ['A10', 'MAC']
for fig_idx, configs_for_this_figure in enumerate(hardware_configs):
if not configs_for_this_figure:
continue
num_cols_this_figure = len(configs_for_this_figure)
# 1 row, num_cols_this_figure columns
fig, axs = plt.subplots(1, num_cols_this_figure, figsize=(7 * num_cols_this_figure, 6), sharex=True, sharey=True, squeeze=False)
# fig.suptitle(f"Latency vs. Storage ({hardware_names[fig_idx]})", fontsize=18, y=0.98)
for subplot_idx, (current_ds, current_hw) in enumerate(configs_for_this_figure):
ax = axs[0, subplot_idx] # Accessing column in the first row
ax.set_title(f"{current_ds}", fontsize=25) # No need to show hardware in title since it's in suptitle
for method_name in all_method_names:
marker = method_markers.get(method_name, '+')
disk_storage = storage_info.get(method_name, {}).get('Storage', np.nan)
latency_points_key = (current_ds, current_hw, method_name)
if latency_points_key in latency_data:
points_for_method = latency_data[latency_points_key]
print(f"points_for_method: {points_for_method}")
for recall, latency in points_for_method:
# Only skip if latency is invalid (since we need log scale for x-axis)
# But allow zero storage since y-axis is now linear
if np.isnan(latency) or np.isnan(disk_storage) or latency <= 0:
continue
# Add LLM generation time from CSV
current_llm_add_time = llm_generation_times.get((current_ds, current_hw))
if current_llm_add_time is not None and not np.isnan(current_llm_add_time):
latency = latency + current_llm_add_time
else:
raise ValueError(f"No LLM generation time found for {current_ds} on {current_hw}")
# Special handling for BM25
if method_name == 'BM25':
# BM25 is only valid for 85% recall points (other points are 0)
if recall != 85.0:
continue
color = 'grey'
else:
# Use the color for target recall
color = recall_colors.get(recall, 'grey')
# Convert storage to proportion
disk_storage_proportion = disk_storage / raw_data_size
size = 80
x_offset = -50
if current_ds == 'GPQA':
x_offset = -32
# Apply a small vertical offset to IVF-Recompute points to make them more visible
if method_name == 'IVF-Recompute':
# Add a small vertical offset (adjust the 0.05 value as needed)
disk_storage_proportion += 0.07
size = 80
if method_name == 'DiskANN':
size = 50
if method_name == 'Our':
size = 140
disk_storage_proportion += 0.05
# Add "Pareto Frontier" label to Our method points
if recall == 95:
ax.annotate('Ours',
(latency, disk_storage_proportion),
xytext=(x_offset, 25), # Increased leftward offset from -65 to -120
textcoords='offset points',
fontsize=20,
color='red',
weight='bold',
bbox=dict(boxstyle="round,pad=0.3", fc="white", ec="red", alpha=0.7))
# Increase size for BM25 points
if method_name == 'BM25':
size = 70
size*=5
ax.scatter(latency, disk_storage_proportion, marker=marker, color=color,
s=size, alpha=0.85, edgecolors='black', linewidths=0.7)
ax.set_xscale("log")
ax.set_yscale("linear") # CHANGED from log scale to linear scale for Y-axis
# Generate appropriate powers of 10 based on your data range
min_power = -1
max_power = 4
log_ticks = [10**i for i in range(min_power, max_power+1)]
# Set custom tick positions
ax.set_xticks(log_ticks)
# Create custom bold LaTeX labels with 10^n format
log_tick_labels = [fr'$\mathbf{{10^{{{i}}}}}$' for i in range(min_power, max_power+1)]
ax.set_xticklabels(log_tick_labels, fontsize=24)
# Apply global limits
if subplot_idx == 0:
ax.set_xlim(global_xlim)
ax.set_ylim(global_ylim)
ax.grid(True, which="major", linestyle="--", linewidth=0.6, alpha=0.7)
# Remove minor grid lines completely
ax.grid(False, which="minor")
# Remove ticks
# First set the shared parameters for both axes
ax.tick_params(axis='both', which='both', length=0, labelsize=24)
# Then set the padding only for the x-axis
ax.tick_params(axis='x', which='both', pad=10)
if subplot_idx == 0: # Y-label only for the leftmost subplot
ax.set_ylabel("Proportional Size", fontsize=24)
# X-label for all subplots in a 1xN layout can be okay, or just the middle/last one.
# Let's put it on all for now.
ax.set_xlabel("Latency (s)", fontsize=25)
# Display 100%, 200%, 300% for yaxis
ax.set_yticks([1, 2, 3])
ax.set_yticklabels(['100\%', '200\\%', '300\\%'])
# Create a custom arrow with "Better" text inside
# Create the arrow patch with a wider shaft
arrow = FancyArrowPatch(
(0.8, 0.8), # Start point (top-right)
(0.65, 0.6), # End point (toward bottom-left)
transform=ax.transAxes,
arrowstyle='simple,head_width=40,head_length=35,tail_width=20', # Increased arrow dimensions
facecolor='white',
edgecolor='black',
linewidth=3, # Thicker outline
zorder=5
)
# Add the arrow to the plot
ax.add_patch(arrow)
# Calculate the midpoint of the arrow for text placement
mid_x = (0.8 + 0.65) / 2 + 0.002 + 0.01
mid_y = (0.8 + 0.6) / 2 + 0.01
# Add the "Better" text at the midpoint of the arrow
ax.text(mid_x, mid_y, 'Better',
transform=ax.transAxes,
ha='center',
va='center',
fontsize=16, # Increased font size from 12 to 16
fontweight='bold',
rotation=40, # Rotate to match arrow direction
zorder=6) # Ensure text is on top of arrow
# Create legends (once per figure)
method_legend_handles = []
for method, marker_style in method_markers.items():
if method in all_method_names:
print(f"method: {method}")
# Use black color for BM25 in the legend
if method == 'BM25':
method_legend_handles.append(mlines.Line2D([], [], color='black', marker=marker_style, linestyle='None',
markersize=10, label=method))
else:
if method in method_display_names:
method = method_display_names[method]
method_legend_handles.append(mlines.Line2D([], [], color='black', marker=marker_style, linestyle='None',
markersize=10, label=method))
recall_legend_handles = []
sorted_recall_levels = sorted(recall_colors.keys())
for r_level in sorted_recall_levels:
recall_legend_handles.append(mlines.Line2D([], [], color=recall_colors[r_level], marker='o', linestyle='None',
markersize=20, label=f"Target Recall={r_level:.0f}\%"))
# 将图例分成两行:第一行是方法,第二行是召回率
if fig_idx == 0:
# 从方法列表中先排除'Our'
other_methods = [m for m in all_method_names if m != 'Our']
# 按照需要的顺序创建方法列表(将'Our'放在最后)
ordered_methods = other_methods + (['Our'] if 'Our' in all_method_names else [])
# 按照新顺序创建方法图例句柄
method_legend_handles = []
for method in ordered_methods:
if method in method_markers:
marker_style = method_markers[method]
# 使用显示名称映射
display_name = method_display_names.get(method, method)
color = 'black'
marker_size = 22
if method == 'Our':
marker_size = 27
elif 'IVF-Recompute' in method or 'EdgeRAG' in method:
marker_size = 17
elif 'DiskANN' in method:
marker_size = 19
elif 'BM25' in method:
marker_size = 20
method_legend_handles.append(mlines.Line2D([], [], color=color, marker=marker_style,
linestyle='None', markersize=marker_size, label=display_name))
# 创建召回率图例(第二行)- 注意位置调整,放在方法图例下方
recall_legend = fig.legend(handles=recall_legend_handles,
loc='upper center', bbox_to_anchor=(0.5, 1.05), # y坐标降低放在第一行下方
ncol=len(recall_legend_handles), fontsize=28)
# 创建方法图例(第一行)
method_legend = fig.legend(handles=method_legend_handles,
loc='upper center', bbox_to_anchor=(0.5, 0.91),
ncol=len(method_legend_handles), fontsize=28)
# 添加图例到渲染器
fig.add_artist(method_legend)
fig.add_artist(recall_legend)
# 调整布局,为顶部的两行图例留出更多空间
plt.tight_layout(rect=(0, 0, 1.0, 0.74)) # 顶部空间从0.9调整到0.85,给两行图例留出更多空间
save_path = f'./paper_plot/figures/main_exp_fig_{fig_idx+1}.pdf'
plt.savefig(save_path, dpi=300, bbox_inches='tight')
print(f"Saved figure {fig_idx+1} to {save_path}")
plt.show()