# LEANN/test/simple_mac_tpt_test.py

import time
from dataclasses import dataclass
from typing import Dict, List, Optional

import numpy as np
import torch
from torch import nn
from tqdm import tqdm
from transformers import AutoModel, BitsAndBytesConfig

# MLX is an optional dependency. Record whether it could be imported so the
# MLX benchmark can be skipped instead of breaking the whole script at import
# time. When the import fails, `mx` and `load` stay undefined.
try:
    import mlx.core as mx
    from mlx_lm.utils import load
    MLX_AVAILABLE = True
except ImportError:
    print("MLX not available. Install with: uv pip install mlx mlx-lm")
    MLX_AVAILABLE = False

@dataclass
class BenchmarkConfig:
    """Settings for one throughput benchmark run.

    Field order is part of the constructor signature and must not change.
    """

    # Model identifier handed to the model loader.
    model_path: str = "facebook/contriever"
    # Batch sizes to sweep; None selects the default sweep (see __post_init__).
    batch_sizes: Optional[List[int]] = None
    # Number of token ids per generated sequence.
    seq_length: int = 256
    # Timed forward passes per batch size.
    num_runs: int = 5
    # Cast the PyTorch model to half precision after loading.
    use_fp16: bool = True
    # NOTE(review): the flags below are not read by the benchmark code in this
    # file; they look like placeholders for optimisation variants.
    use_int4: bool = False
    use_int8: bool = False
    use_cuda_graphs: bool = False
    use_flash_attention: bool = False
    use_linear8bitlt: bool = False
    use_mlx: bool = False  # New flag for MLX testing

    def __post_init__(self):
        """Replace a missing ``batch_sizes`` with a fresh default sweep."""
        # A new list is built per instance, so configs never share state.
        if self.batch_sizes is None:
            self.batch_sizes = [1, 2, 4, 8, 16, 32, 64]

class MLXBenchmark:
    """MLX-specific benchmark for embedding models.

    The constructor loads the model eagerly; ``run`` then times forward
    passes over random token ids for every configured batch size.
    """

    # Untimed forward passes issued before measuring each batch size.
    _WARMUP_RUNS = 3

    def __init__(self, config: BenchmarkConfig):
        self.config = config
        self.model, self.tokenizer = self._load_model()

    def _load_model(self):
        """Load the MLX model and tokenizer for ``config.model_path``.

        Returns:
            The ``(model, tokenizer)`` pair produced by ``load``.

        Raises:
            Exception: whatever ``load`` raised; it is reported on stdout
                and then re-raised unchanged.
        """
        print(f"Loading MLX model from {self.config.model_path}...")
        try:
            model, tokenizer = load(self.config.model_path)
        except Exception as e:
            print(f"Error loading MLX model: {e}")
            raise
        print("MLX model loaded successfully")
        return model, tokenizer

    def _create_random_batch(self, batch_size: int) -> torch.Tensor:
        """Build a ``(batch_size, seq_length)`` tensor of random ids in [0, 1000).

        The batch is generated with PyTorch so both benchmarks draw their
        inputs the same way.
        """
        return torch.randint(
            0, 1000,
            (batch_size, self.config.seq_length),
            dtype=torch.long
        )

    def _run_inference(self, input_ids: torch.Tensor) -> float:
        """Time one pass: convert the input, run the model, pool, export.

        The timed region deliberately covers the tensor -> MLX conversion and
        the export to NumPy, not just the model call.

        Returns:
            Elapsed seconds, or ``float('inf')`` when any step raised. The
            error is printed rather than propagated.
        """
        # perf_counter is monotonic and high resolution, unlike wall-clock
        # time.time(), which can jump and is coarse on some platforms.
        start_time = time.perf_counter()
        try:
            # Convert PyTorch tensor to MLX array
            input_ids_mlx = mx.array(input_ids.numpy())

            model_output = self.model(input_ids_mlx)

            # Mean pooling over axis 1.
            pooled = model_output.mean(axis=1)

            # Exporting to Python/NumPy data materialises the result, so the
            # computation finishes inside the timed region.
            np.array(pooled.tolist(), dtype=np.float32)
        except Exception as e:
            print(f"MLX inference error: {e}")
            return float('inf')

        return time.perf_counter() - start_time

    def run(self) -> Dict[int, Dict[str, float]]:
        """Run the MLX benchmark across all batch sizes.

        Returns:
            A mapping from batch size to its ``avg_time``, ``std_time``,
            ``throughput`` (sequences per second), ``min_time`` and
            ``max_time``. Batch sizes with no successful run are omitted.
        """
        results = {}
        # Sentinel returned by _run_inference when a pass failed.
        failed = float('inf')

        print(f"Starting MLX benchmark with model: {self.config.model_path}")
        print(f"Testing batch sizes: {self.config.batch_sizes}")

        for batch_size in self.config.batch_sizes:
            print(f"\n=== Testing MLX batch size: {batch_size} ===")

            # Create input batch (same as PyTorch)
            input_ids = self._create_random_batch(batch_size)

            # Warm up on at most two sequences. _run_inference reports its
            # own errors, so stopping early avoids repeating the message.
            print("Warming up...")
            for _ in range(self._WARMUP_RUNS):
                if self._run_inference(input_ids[:2]) == failed:
                    break

            # Timed runs: stop at the first failure and keep what succeeded.
            times = []
            for _ in tqdm(range(self.config.num_runs), desc=f"MLX Batch size {batch_size}"):
                elapsed_time = self._run_inference(input_ids)
                if elapsed_time == failed:
                    break
                times.append(elapsed_time)

            if not times:
                print(f"Skipping batch size {batch_size} due to errors")
                continue

            # Calculate statistics once and reuse them for storage and output.
            avg_time = np.mean(times)
            std_time = np.std(times)
            min_time = np.min(times)
            max_time = np.max(times)
            throughput = batch_size / avg_time

            results[batch_size] = {
                "avg_time": avg_time,
                "std_time": std_time,
                "throughput": throughput,
                "min_time": min_time,
                "max_time": max_time,
            }

            print(f"MLX Results for batch size {batch_size}:")
            print(f"  Avg Time: {avg_time:.4f}s ± {std_time:.4f}s")
            print(f"  Min Time: {min_time:.4f}s")
            print(f"  Max Time: {max_time:.4f}s")
            print(f"  Throughput: {throughput:.2f} sequences/second")

        return results

class Benchmark:
    """PyTorch throughput benchmark for an embedding model.

    The device is chosen once at construction (CUDA, then MPS, then CPU) and
    the model is loaded eagerly; ``run`` times forward passes over random
    token ids for every configured batch size.
    """

    # Untimed forward passes issued before measuring each batch size.
    _WARMUP_RUNS = 3

    def __init__(self, config: BenchmarkConfig):
        self.config = config
        if torch.cuda.is_available():
            self.device = "cuda"
        elif torch.backends.mps.is_available():
            self.device = "mps"
        else:
            self.device = "cpu"
        self.model = self._load_model()

    def _load_model(self) -> nn.Module:
        """Load, optionally halve, compile and place the model.

        Returns:
            The compiled model on ``self.device``, switched to eval mode.
        """
        print(f"Loading model from {self.config.model_path}...")

        model = AutoModel.from_pretrained(self.config.model_path)
        if self.config.use_fp16:
            model = model.half()
        model = torch.compile(model)
        model = model.to(self.device)

        model.eval()
        return model

    def _create_random_batch(self, batch_size: int) -> torch.Tensor:
        """Build a ``(batch_size, seq_length)`` tensor of random ids in [0, 1000)."""
        return torch.randint(
            0, 1000,
            (batch_size, self.config.seq_length),
            device=self.device,
            dtype=torch.long
        )

    def _synchronize(self) -> None:
        """Block until all work queued on the accelerator has finished.

        CUDA and MPS run kernels asynchronously, so a timer that stops without
        this barrier measures launch overhead rather than execution time.
        Does nothing on CPU.
        """
        if self.device == "cuda":
            torch.cuda.synchronize()
        elif self.device == "mps":
            torch.mps.synchronize()

    def _run_inference(self, input_ids: torch.Tensor) -> float:
        """Time one forward pass with an all-ones attention mask.

        Returns:
            Elapsed seconds, including the wait for the device to finish.

        Raises:
            Exception: anything the model call raises propagates to the caller.
        """
        attention_mask = torch.ones_like(input_ids)

        # Drain earlier queued work so it is not billed to this run.
        self._synchronize()
        start_time = time.perf_counter()
        with torch.no_grad():
            self.model(input_ids=input_ids, attention_mask=attention_mask)
        # Wait for the kernels launched above before stopping the clock.
        self._synchronize()

        return time.perf_counter() - start_time

    def run(self) -> Dict[int, Dict[str, float]]:
        """Run the benchmark across all configured batch sizes.

        Returns:
            A mapping from batch size to its ``avg_time``, ``std_time``,
            ``throughput`` (sequences per second) and ``peak_memory_gb``.
            Batch sizes whose inference failed are omitted. The peak-memory
            figure is the maximum over the whole run, so every entry carries
            the same value (0.0 without CUDA).
        """
        results = {}

        if torch.cuda.is_available():
            torch.cuda.reset_peak_memory_stats()

        for batch_size in self.config.batch_sizes:
            print(f"\nTesting batch size: {batch_size}")

            input_ids = self._create_random_batch(batch_size)

            # Untimed passes absorb one-off costs for this input shape, such
            # as the compilation torch.compile defers to the first call, so
            # they do not inflate the measured average.
            try:
                for _ in range(self._WARMUP_RUNS):
                    self._run_inference(input_ids)
            except Exception as e:
                print(f"Warmup error: {e}")
                continue

            times = []
            for _ in tqdm(range(self.config.num_runs), desc=f"Batch size {batch_size}"):
                try:
                    elapsed_time = self._run_inference(input_ids)
                except Exception as e:
                    print(f"Error during inference: {e}")
                    break
                times.append(elapsed_time)

            if not times:
                continue

            avg_time = np.mean(times)
            std_time = np.std(times)
            throughput = batch_size / avg_time

            results[batch_size] = {
                "avg_time": avg_time,
                "std_time": std_time,
                "throughput": throughput,
            }

            print(f"Avg Time: {avg_time:.4f}s ± {std_time:.4f}s")
            print(f"Throughput: {throughput:.2f} sequences/second")

        if torch.cuda.is_available():
            peak_memory_gb = torch.cuda.max_memory_allocated() / (1024 ** 3)
        else:
            peak_memory_gb = 0.0

        for stats in results.values():
            stats["peak_memory_gb"] = peak_memory_gb

        return results

def run_benchmark():
    """Run the PyTorch benchmark with the default configuration.

    Returns:
        A dict that always holds ``"max_throughput"`` and
        ``"avg_throughput"``. On success it also holds ``"results"``, the
        per-batch-size statistics. On failure both throughputs are 0.0 and
        ``"error"`` carries the reason.
    """
    config = BenchmarkConfig()

    try:
        benchmark = Benchmark(config)
        results = benchmark.run()

        # Every batch size may have been skipped. Report that explicitly
        # rather than letting max() fail on an empty sequence.
        if not results:
            return {
                "max_throughput": 0.0,
                "avg_throughput": 0.0,
                "error": "No valid results"
            }

        throughputs = [stats["throughput"] for stats in results.values()]

        return {
            "max_throughput": max(throughputs),
            "avg_throughput": np.mean(throughputs),
            "results": results
        }

    except Exception as e:
        # Top-level boundary: turn any failure into an error payload so the
        # caller can still print a summary.
        print(f"Benchmark failed: {e}")
        return {
            "max_throughput": 0.0,
            "avg_throughput": 0.0,
            "error": str(e)
        }

def run_mlx_benchmark():
    """Run the MLX benchmark against a fixed 4-bit MiniLM checkpoint.

    Returns:
        A dict that always holds ``"max_throughput"`` and
        ``"avg_throughput"``. On success it also holds ``"results"``, the
        per-batch-size statistics. When MLX is missing, nothing could be
        measured, or the run raised, both throughputs are 0.0 and ``"error"``
        carries the reason.
    """
    def failure(reason):
        # Common shape of every unsuccessful outcome.
        return {
            "max_throughput": 0.0,
            "avg_throughput": 0.0,
            "error": reason
        }

    if not MLX_AVAILABLE:
        print("MLX not available, skipping MLX benchmark")
        return failure("MLX not available")

    config = BenchmarkConfig(
        model_path="mlx-community/all-MiniLM-L6-v2-4bit",
        use_mlx=True
    )

    try:
        results = MLXBenchmark(config).run()

        if not results:
            return failure("No valid results")

        throughputs = [stats["throughput"] for stats in results.values()]

        return {
            "max_throughput": max(throughputs),
            "avg_throughput": np.mean(throughputs),
            "results": results
        }

    except Exception as e:
        print(f"MLX benchmark failed: {e}")
        return failure(str(e))

if __name__ == "__main__":
    # Script entry point: run both backends back to back, then compare their
    # best throughput.
    print("=== PyTorch Benchmark ===")
    pytorch_result = run_benchmark()
    pytorch_peak = pytorch_result["max_throughput"]
    pytorch_mean = pytorch_result["avg_throughput"]
    print(f"PyTorch Max throughput: {pytorch_peak:.2f} sequences/second")
    print(f"PyTorch Average throughput: {pytorch_mean:.2f} sequences/second")

    print("\n=== MLX Benchmark ===")
    mlx_result = run_mlx_benchmark()
    mlx_peak = mlx_result["max_throughput"]
    mlx_mean = mlx_result["avg_throughput"]
    print(f"MLX Max throughput: {mlx_peak:.2f} sequences/second")
    print(f"MLX Average throughput: {mlx_mean:.2f} sequences/second")

    # A failed or skipped backend reports 0.0, which makes the ratio
    # meaningless; compare only when both produced a positive peak.
    both_measured = pytorch_peak > 0 and mlx_peak > 0
    if both_measured:
        speedup = mlx_peak / pytorch_peak
        verdict = "faster" if speedup > 1 else "slower"
        print("\n=== Comparison ===")
        print(f"MLX is {speedup:.2f}x {verdict} than PyTorch")