# LEANN/benchmark_embeddings_simulated.py
import time
import matplotlib.pyplot as plt
import mlx.core as mx
import numpy as np
import torch
from sentence_transformers import SentenceTransformer

# --- Configuration ---
MODEL_NAME_TORCH = "Qwen/Qwen3-Embedding-0.6B"
BATCH_SIZES = [1, 8, 16, 32, 64, 128, 256]
NUM_RUNS = 10
WARMUP_RUNS = 2
SEQ_LENGTH = 256
EMBED_DIM = 768  # Dimension of the simulated MLX embedding table (sized for all-mpnet-base-v2; the Qwen model's own output dimension may differ)

# --- Generate Dummy Data ---
DUMMY_SENTENCES = ["This is a test sentence for benchmarking." * 5] * max(BATCH_SIZES)

# --- PyTorch Benchmark Function ---
def benchmark_torch(model, sentences):
    start_time = time.perf_counter()
    model.encode(sentences, convert_to_numpy=True)
    if torch.backends.mps.is_available():
        torch.mps.synchronize()  # Ensure async MPS work has finished before stopping the clock
    end_time = time.perf_counter()
    return (end_time - start_time) * 1000  # Return time in ms
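
# A small helper, not part of the original script: converting per-batch latency
# into throughput (sentences/second) makes cross-batch-size comparisons easier
# to read. Assumes latency_ms is the average wall-clock time for one batch.
def throughput_sentences_per_sec(batch_size, latency_ms):
    return batch_size / (latency_ms / 1000.0)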

# --- Simulated MLX Benchmark Function ---
def benchmark_mlx_simulated(dummy_embedding_table, sentences):
    # 1. Simulate tokenization (only the shapes matter)
    batch_size = len(sentences)
    input_ids = mx.random.randint(0, 30000, (batch_size, SEQ_LENGTH))
    attention_mask = mx.ones((batch_size, SEQ_LENGTH))
    mx.eval(input_ids, attention_mask)  # Materialize inputs so lazy setup is excluded from the timing
    start_time = time.perf_counter()
    # 2. Simulate embedding lookup
    embeddings = dummy_embedding_table[input_ids]
    # 3. Simulate mean pooling
    mask = mx.expand_dims(attention_mask, -1)
    sum_embeddings = (embeddings * mask).sum(axis=1)
    sum_mask = mask.sum(axis=1)
    pooled = sum_embeddings / sum_mask
    mx.eval(pooled)  # mx.eval() with no arguments is a no-op; pass the output to actually run the graph
    end_time = time.perf_counter()
    return (end_time - start_time) * 1000  # Return time in ms
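
# NOTE: this simulation times only the embedding lookup and mean pooling. A
# faithful MLX benchmark would also cover tokenization and the transformer
# forward pass, which dominate real encode() latency, so the simulated MLX
# numbers are best read as a lower bound rather than a like-for-like
# comparison with SentenceTransformer.encode().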

# --- Main Execution ---
def main():
    print("--- Initializing Models ---")
    # Load the real PyTorch model
    print(f"Loading PyTorch model: {MODEL_NAME_TORCH}")
    device = "mps" if torch.backends.mps.is_available() else "cpu"
    if device == "cpu":
        print("Warning: MPS not available for PyTorch. Benchmark will run on CPU.")
    model_torch = SentenceTransformer(MODEL_NAME_TORCH, device=device)
    print(f"PyTorch model loaded on: {device}")

    # Create the dummy MLX embedding table
    print("Creating simulated MLX model...")
    dummy_vocab_size = 30522  # Typical BERT vocab size
    dummy_embedding_table_mlx = mx.random.normal((dummy_vocab_size, EMBED_DIM))
    mx.eval(dummy_embedding_table_mlx)  # Materialize the table before benchmarking
    print("Simulated MLX model created.")

    # --- Warm-up ---
    print("\n--- Performing Warm-up Runs ---")
    for _ in range(WARMUP_RUNS):
        benchmark_torch(model_torch, DUMMY_SENTENCES[:1])
        benchmark_mlx_simulated(dummy_embedding_table_mlx, DUMMY_SENTENCES[:1])
    print("Warm-up complete.")

    # --- Benchmarking ---
    print("\n--- Starting Benchmark ---")
    results_torch = []
    results_mlx = []
    for batch_size in BATCH_SIZES:
        print(f"Benchmarking batch size: {batch_size}")
        sentences_batch = DUMMY_SENTENCES[:batch_size]
        # Benchmark PyTorch
        torch_times = [benchmark_torch(model_torch, sentences_batch) for _ in range(NUM_RUNS)]
        results_torch.append(np.mean(torch_times))
        # Benchmark MLX
        mlx_times = [
            benchmark_mlx_simulated(dummy_embedding_table_mlx, sentences_batch)
            for _ in range(NUM_RUNS)
        ]
        results_mlx.append(np.mean(mlx_times))

    print("\n--- Benchmark Results (Average time per batch in ms) ---")
    print(f"Batch Sizes: {BATCH_SIZES}")
    print(f"PyTorch ({device}): {[f'{t:.2f}' for t in results_torch]}")
    print(f"MLX (simulated): {[f'{t:.2f}' for t in results_mlx]}")

    # --- Plotting ---
    print("\n--- Generating Plot ---")
    plt.figure(figsize=(10, 6))
    plt.plot(BATCH_SIZES, results_torch, marker="o", linestyle="-", label=f"PyTorch ({device})")
    plt.plot(BATCH_SIZES, results_mlx, marker="s", linestyle="-", label="MLX (Simulated)")
    plt.title("Simulated Embedding Performance: MLX vs PyTorch")
    plt.xlabel("Batch Size")
    plt.ylabel("Average Time per Batch (ms)")
    plt.xticks(BATCH_SIZES)
    plt.grid(True)
    plt.legend()
    output_filename = "embedding_benchmark_simulated.png"
    plt.savefig(output_filename)
    print(f"Plot saved to {output_filename}")

if __name__ == "__main__":
    main()
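
# Usage sketch (assumes an Apple Silicon Mac with torch, mlx,
# sentence-transformers, matplotlib, and numpy installed):
#   python benchmark_embeddings_simulated.py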