first commit

2026-03-22 17:26:26 -04:00
commit c05cb71816
15 changed files with 2644 additions and 0 deletions

77
.gitignore vendored Normal file

@@ -0,0 +1,77 @@
# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
# Virtual Environments
.venv
.vllm/
venv/
ENV/
env/
# IDEs
.vscode/
.idea/
*.swp
*.swo
*~
# OS
.DS_Store
Thumbs.db
# Logs
*.log
vllm-server.log
*.out
*.err
# Build artifacts
*.o
*.a
*.so
*.dylib
*.dll
CMakeCache.txt
CMakeFiles/
cmake_install.cmake
Makefile
# CUDA
*.ptx
*.cubin
# Local installation directories
triton/
vllm/
.cache/
# Temporary files
tmp/
temp/
*.tmp
*.bak
# Model downloads
models/
*.safetensors
*.bin
*.gguf

380
CLUSTER.md Normal file

@@ -0,0 +1,380 @@
# vLLM Cluster Mode Setup for DGX Spark
This guide covers setting up multi-node vLLM deployment on DGX Spark systems using distributed inference.
## Prerequisites
- Multiple DGX Spark systems with vLLM installed (use `install.sh` on each node)
- All nodes on the same network with direct connectivity
- SSH access between nodes (passwordless SSH recommended)
- Same CUDA and vLLM versions across all nodes
## Architecture
```
        ┌─────────────────────┐
        │     spark-alpha     │
        │    (Master/Head)    │
        │  - API Server       │
        │  - Request Router   │
        │  - Model Weights    │
        └──────────┬──────────┘
                   │
        ┌──────────┴──────────┐
        │                     │
┌───────▼─────────┐   ┌───────▼─────────┐
│   spark-omega   │   │   spark-gamma   │
│   (Worker 1)    │   │   (Worker 2)    │
│  - Inference    │   │  - Inference    │
│  - GPU Compute  │   │  - GPU Compute  │
└─────────────────┘   └─────────────────┘
```
## Step 1: Install vLLM on All Nodes
Run the installer on each node:
```bash
# On spark-alpha (master)
curl -fsSL https://raw.githubusercontent.com/eelbaz/dgx-spark-vllm-setup/main/install.sh | bash
# On spark-omega (worker 1)
ssh spark-omega.local
curl -fsSL https://raw.githubusercontent.com/eelbaz/dgx-spark-vllm-setup/main/install.sh | bash
# On spark-gamma (worker 2)
ssh spark-gamma.local
curl -fsSL https://raw.githubusercontent.com/eelbaz/dgx-spark-vllm-setup/main/install.sh | bash
```
## Step 2: Configure Network Settings
Ensure all nodes can communicate on the required ports:
- **8000**: vLLM API server (master only)
- **29500**: PyTorch distributed backend (all nodes)
- **Random ports**: Ray cluster communication
Open firewall if needed:
```bash
# On all nodes
sudo ufw allow 8000/tcp
sudo ufw allow 29500/tcp
sudo ufw allow 6379/tcp # Ray GCS
sudo ufw allow 8265/tcp # Ray Dashboard
```
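Before starting Ray, it can save time to confirm these ports are actually reachable from a worker. This is only a sanity-check sketch: it assumes `nc` (netcat) is installed, uses the example hostnames from this guide, and the Ray/vLLM ports will only answer once those services are running:
```bash
# Run from a worker node once the head node's services are up
for port in 6379 8000 8265 29500; do
    nc -zv -w 3 spark-alpha.local "$port" \
        && echo "port $port reachable" \
        || echo "port $port NOT reachable"
done
```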
## Step 3: Set Up Passwordless SSH (Optional but Recommended)
```bash
# On master node
ssh-keygen -t rsa -b 4096 -f ~/.ssh/id_rsa -N ""
# Copy to worker nodes
ssh-copy-id spark-omega.local
ssh-copy-id spark-gamma.local
# Verify
ssh spark-omega.local "echo 'Connection successful'"
ssh spark-gamma.local "echo 'Connection successful'"
```
## Step 4: Start Ray Cluster
### On Master Node (spark-alpha)
```bash
# Assuming vllm-install is in your home directory
source ~/vllm-install/vllm_env.sh
# Start Ray head node
ray start --head \
--port=6379 \
--dashboard-host=0.0.0.0 \
--dashboard-port=8265 \
--num-gpus=1
# Note the output: "To connect to this Ray cluster, use: ray start --address='MASTER_IP:6379'"
```
### On Worker Nodes (spark-omega, spark-gamma)
```bash
source ~/vllm-install/vllm_env.sh
# Replace MASTER_IP with spark-alpha's IP address
ray start --address='MASTER_IP:6379' --num-gpus=1
```
Verify cluster status:
```bash
ray status
```
You should see all nodes listed.
## Step 5: Start vLLM with Tensor Parallelism
### Method 1: Tensor Parallelism (Recommended for Large Models)
Tensor parallelism splits model layers across multiple GPUs.
```bash
# On master node
source ~/vllm-install/vllm_env.sh
vllm serve \
--model "meta-llama/Llama-3.1-70B-Instruct" \
--tensor-parallel-size 2 \
--trust-remote-code \
--host 0.0.0.0 \
--port 8000
```
This will automatically distribute the model across 2 GPUs in the Ray cluster.
### Method 2: Pipeline Parallelism
Pipeline parallelism splits model stages across GPUs.
```bash
vllm serve \
--model "meta-llama/Llama-3.1-70B-Instruct" \
--pipeline-parallel-size 2 \
--trust-remote-code \
--host 0.0.0.0 \
--port 8000
```
### Method 3: Combined Parallelism
For very large models, combine tensor and pipeline parallelism:
```bash
vllm serve \
--model "meta-llama/Llama-3.1-405B-Instruct" \
--tensor-parallel-size 4 \
--pipeline-parallel-size 2 \
--trust-remote-code \
--host 0.0.0.0 \
--port 8000
```
## Step 6: Test Cluster Inference
```bash
# Test from master node
curl http://localhost:8000/v1/models
# Test from external machine
curl http://spark-alpha.local:8000/v1/completions \
-H "Content-Type: application/json" \
-d '{
"model": "meta-llama/Llama-3.1-70B-Instruct",
"prompt": "Explain distributed inference in 3 sentences.",
"max_tokens": 100,
"temperature": 0.7
}'
```
## Step 7: Monitor Cluster
### Ray Dashboard
Access at: http://spark-alpha.local:8265
Shows:
- Node status and resources
- Task execution
- GPU utilization
- Memory usage
### vLLM Metrics
```bash
# On master node
tail -f ~/vllm-install/vllm-server.log
# Check GPU usage across cluster
ray exec 'nvidia-smi'
```
### System Monitoring
```bash
# Check Ray cluster status
ray status
# Monitor GPU usage on specific node
ssh spark-omega.local nvidia-smi -l 1
```
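To sample GPU load on every node in one pass, a small loop over the example hostnames also works (this assumes the passwordless SSH from Step 3):
```bash
# Query GPU name, utilization, and memory on each node in the cluster
for node in spark-alpha.local spark-omega.local spark-gamma.local; do
    echo "=== $node ==="
    ssh "$node" nvidia-smi \
        --query-gpu=name,utilization.gpu,memory.used,memory.total \
        --format=csv,noheader
done
```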
## Troubleshooting
### Workers Not Connecting
**Problem**: Workers can't connect to Ray head node
**Solutions**:
1. Check firewall: `sudo ufw status`
2. Verify head node IP: `ray status` on master
3. Check network connectivity: `ping spark-alpha.local`
4. Ensure the same Ray version on all nodes: `ray --version` (a combined check over the workers is sketched below)
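The same checks can be run from the head node against every worker in one pass (hostnames as used throughout this guide, passwordless SSH assumed):
```bash
# Verify head-node reachability and Ray version from each worker
for node in spark-omega.local spark-gamma.local; do
    echo "=== $node ==="
    ssh "$node" "ping -c 1 -W 2 spark-alpha.local > /dev/null \
        && echo 'head node reachable' || echo 'head node NOT reachable'"
    ssh "$node" "ray --version"
done
```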
### OOM Errors with Large Models
**Problem**: Out of memory when loading large models
**Solutions**:
1. Increase tensor parallelism: `--tensor-parallel-size 4`
2. Reduce memory utilization: `--gpu-memory-utilization 0.8`
3. Enable CPU offloading: `--cpu-offload-gb 8`
4. Use quantization: `--quantization awq` or `--quantization gptq`
### Model Loading Hangs
**Problem**: Model download/loading takes forever
**Solutions**:
1. Pre-download model on all nodes:
```bash
# On each node
python -c "from transformers import AutoModel; AutoModel.from_pretrained('meta-llama/Llama-3.1-70B-Instruct')"
```
2. Use shared storage (NFS) for model cache
3. Check network bandwidth between nodes
### Uneven GPU Utilization
**Problem**: Some GPUs idle while others maxed out
**Solutions**:
1. Verify tensor parallel configuration
2. Check Ray resource allocation: `ray status`
3. Ensure balanced request distribution
4. Monitor with: `ray exec 'nvidia-smi'`
## Advanced Configuration
### Custom Ray Resources
Assign custom resources to nodes for fine-grained control:
```bash
# On worker with high memory
ray start --address='MASTER_IP:6379' \
--num-gpus=1 \
--resources='{"highmem": 1}'
# Use in vLLM
vllm serve --model "..." --placement-group-resources='{"highmem": 1}'
```
### Distributed Model Cache
Share model weights via NFS to avoid redundant downloads:
```bash
# On NFS server (e.g., master)
sudo apt install nfs-kernel-server
echo "$HOME/.cache/huggingface *(rw,sync,no_subtree_check)" | sudo tee -a /etc/exports
sudo exportfs -a
# On workers
sudo apt install nfs-common
sudo mkdir -p $HOME/.cache/huggingface
sudo mount spark-alpha.local:$HOME/.cache/huggingface $HOME/.cache/huggingface
```
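To confirm the mount took effect on a worker, check that the cache directory is now served by the head node (hostname as in the example above). Note that a mount done this way does not survive a reboot; add an `/etc/fstab` entry if it should persist:
```bash
showmount -e spark-alpha.local       # lists the directories exported by the head node
df -h "$HOME/.cache/huggingface"     # Filesystem column should show spark-alpha.local:...
```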
### Load Balancing with nginx
For production deployments, use nginx to load balance across multiple vLLM instances:
```nginx
upstream vllm_cluster {
least_conn;
server spark-alpha.local:8000;
server spark-omega.local:8000;
server spark-gamma.local:8000;
}
server {
listen 80;
location / {
proxy_pass http://vllm_cluster;
proxy_set_header Host $host;
}
}
```
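To put this config into service on a stock Ubuntu nginx layout, save the block above as a site file and reload. The filename `vllm-cluster.conf` is just an example; adjust the paths if your distro arranges nginx differently:
```bash
# Assumes the upstream/server block above was saved locally as vllm-cluster.conf
sudo cp vllm-cluster.conf /etc/nginx/sites-available/vllm-cluster
sudo ln -sf /etc/nginx/sites-available/vllm-cluster /etc/nginx/sites-enabled/vllm-cluster
sudo nginx -t && sudo systemctl reload nginx
```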
## Cluster Management Scripts
### Start Cluster
Create `start-cluster.sh`:
```bash
#!/bin/bash
# Start Ray cluster on all nodes
ssh spark-alpha.local "source ~/vllm-install/vllm_env.sh && ray start --head --port=6379"
sleep 5
MASTER_IP=$(ssh spark-alpha.local "hostname -I | awk '{print \$1}'")
ssh spark-omega.local "source ~/vllm-install/vllm_env.sh && ray start --address='${MASTER_IP}:6379'"
ssh spark-gamma.local "source ~/vllm-install/vllm_env.sh && ray start --address='${MASTER_IP}:6379'"
echo "Cluster started. Check status with: ray status"
```
### Stop Cluster
Create `stop-cluster.sh`:
```bash
#!/bin/bash
# Stop Ray cluster on all nodes
for node in spark-alpha.local spark-omega.local spark-gamma.local; do
echo "Stopping Ray on $node..."
ssh $node "ray stop --force"
done
echo "Cluster stopped."
```
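Make both scripts executable before first use:
```bash
chmod +x start-cluster.sh stop-cluster.sh
```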
## Performance Tuning
### For Maximum Throughput
```bash
vllm serve \
--model "meta-llama/Llama-3.1-70B-Instruct" \
--tensor-parallel-size 2 \
--max-num-seqs 256 \
--max-num-batched-tokens 8192 \
--gpu-memory-utilization 0.95
```
### For Low Latency
```bash
vllm serve \
--model "meta-llama/Llama-3.1-70B-Instruct" \
--tensor-parallel-size 2 \
--max-num-seqs 32 \
--disable-log-requests
```
## References
- [vLLM Distributed Inference](https://docs.vllm.ai/en/latest/serving/distributed_serving.html)
- [Ray Cluster Setup](https://docs.ray.io/en/latest/cluster/getting-started.html)
- [PyTorch Distributed](https://pytorch.org/tutorials/beginner/dist_overview.html)
## Support
For issues specific to DGX Spark cluster setup, please open an issue on GitHub.

134
CRITICAL_FIX_ANALYSIS.md Normal file

@@ -0,0 +1,134 @@
# Critical Blackwell GB10 Fixes for vLLM
## Overview
Three critical fixes are required for vLLM on Blackwell GB10 (sm_121a) GPUs with CUDA 13.0+:
1. **CMakeLists.txt SM120 Support** - Add missing architecture
2. **vLLM Commit Version** - Use commit with Blackwell/Triton fixes
3. **Triton Version Pinning** - Use tested working commit
## Fix 1: CMakeLists.txt SM120 Support
### Root Cause
vLLM v0.11.1rc3 CMakeLists.txt has **incomplete architecture support** for Blackwell GB10 (sm_121a) MOE kernels when using CUDA 13.0+.
### The Problem
For CUDA 13.0+, the code uses these branches:
- **Line 490**: Regular MOE kernels
- **Line 671**: Grouped MM MOE kernels
Original v0.11.1rc3:
```cmake
# Line 490
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
# Line 671
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
```
**BOTH lines are missing `12.0f` (SM120) support!**
### The Fix
Both lines need `12.0f` added:
```cmake
# Line 490
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
# Line 671
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
```
### Error Symptoms
Without this fix:
```
ImportError: undefined symbol: _Z20cutlass_moe_mm_sm100RN2at6TensorERKS0_S3_S3_S3_S3_S3_S3_S3_S3_bb
```
The MOE kernels for SM100/SM120 aren't compiled, causing import failures.
### Why install.sh Works
The sed command on line 323:
```bash
sed -i 's/cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f"/cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f"/' CMakeLists.txt
```
This replaces **ALL** occurrences, fixing both lines 490 and 671 in one command.
### Verified Solution
Tested on NVIDIA DGX Spark with Blackwell GB10, CUDA 13.0:
- [OK] Line 490 fixed: `"10.0f;11.0f;12.0f"`
- [OK] Line 671 fixed: `"10.0f;11.0f;12.0f"`
- [OK] vLLM imports successfully
- [OK] No cutlass_moe_mm_sm100 symbol errors
- [OK] Build time: ~19 minutes
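A quick way to confirm both occurrences carry the fix in a local vLLM checkout:
```bash
# Both hits (around lines 490 and 671) should now include 12.0f
grep -n 'SCALED_MM_ARCHS "10.0f;11.0f' CMakeLists.txt
```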
## Fix 2: vLLM Commit Version
### Issue
vLLM tag `v0.11.1rc3` lacks critical Triton/PyTorch Inductor fixes for Blackwell.
### Solution
Use commit `66a168a197ba214a5b70a74fa2e713c9eeb3251a` (6 commits ahead of v0.11.1rc3):
- Contains Triton JIT compilation fixes
- Includes PyTorch Inductor optimizations for Blackwell
- Adds proper backend registration handling
### Installation
```bash
cd vllm
git checkout 66a168a197ba214a5b70a74fa2e713c9eeb3251a
git submodule update --init --recursive
```
## Fix 3: Triton Version Pinning
### Issue
Latest Triton main branch (as of late October 2025) has intermittent JITFunction compilation issues with PyTorch Inductor on Blackwell.
### Solution
Pin to tested working commit: `4caa0328bf8df64896dd5f6fb9df41b0eb2e750a` (October 25, 2025)
- Verified stable with Blackwell GB10
- Passes all compilation tests
- No JITFunction.constexprs errors
### Installation
```bash
cd triton
git checkout 4caa0328bf8df64896dd5f6fb9df41b0eb2e750a
git submodule update --init --recursive
python -m pip install --no-build-isolation -v .
```
## Complete Verified Configuration
| Component | Version/Commit | Notes |
|-----------|---------------|-------|
| **vLLM** | `66a168a197ba214a5b70a74fa2e713c9eeb3251a` | 6 commits ahead of v0.11.1rc3 |
| **Triton** | `4caa0328bf8df64896dd5f6fb9df41b0eb2e750a` | October 25, 2025 |
| **PyTorch** | `2.9.0+cu130` | From vLLM requirements |
| **CUDA** | `13.0` (V13.0.88) | System CUDA |
| **Python** | `3.12.3` | |
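To compare a live install against this table, the following prints the relevant versions (run inside the activated `.vllm` environment):
```bash
python -c "import torch, triton, vllm; print('torch ', torch.__version__); print('triton', triton.__version__); print('vllm  ', vllm.__version__)"
nvcc --version | grep release
python --version
```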
## Testing
Verified working with:
```bash
python -c "from vllm import LLM, SamplingParams; \
llm = LLM(model='Qwen/Qwen2.5-0.5B-Instruct', max_model_len=512); \
print(llm.generate(['Hello'], SamplingParams(max_tokens=20)))"
```
**All tests pass**: Import, compilation, CUDA graphs, and text generation all work correctly.

21
LICENSE Normal file

@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2025 DGX Spark Community
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

312
README.md Normal file

@@ -0,0 +1,312 @@
# vLLM Setup for NVIDIA DGX Spark (Blackwell GB10)
**One-command installation** of vLLM for NVIDIA DGX Spark systems with GB10 GPUs (Blackwell architecture, sm_121).
This repository provides a DGX Spark-tested, ready-to-run setup script that handles the complexities of building vLLM on the DGX Spark platform, including:
- CUDA 13.0 support with Blackwell-specific optimizations
- Critical fixes for SM100/SM120 MOE kernel compilation
- Triton 3.5.0 from main branch (required for sm_121a support)
- PyTorch 2.9.0 with CUDA 13.0 bindings
- All necessary build fixes and workarounds
## Quick Start
**One-command installation** - installs to `./vllm-install` in your current directory:
```bash
curl -fsSL https://raw.githubusercontent.com/eelbaz/dgx-spark-vllm-setup/main/install.sh | bash
```
Or specify a custom directory:
```bash
curl -fsSL https://raw.githubusercontent.com/eelbaz/dgx-spark-vllm-setup/main/install.sh | bash -s -- --install-dir ~/my/custom/path
```
**Installation time:** ~20-30 minutes (mostly compilation)
### Alternative: Clone and Install
```bash
git clone https://github.com/eelbaz/dgx-spark-vllm-setup.git
cd dgx-spark-vllm-setup
./install.sh
```
### Installation Options
```bash
./install.sh [OPTIONS]
Options:
--install-dir DIR Installation directory (default: ./vllm-install)
--vllm-version REF vLLM git tag or commit (default: 66a168a19, Blackwell-tested commit)
--python-version VER Python version (default: 3.12)
--skip-tests Skip post-installation tests
--help Show help message
```
## System Requirements
- **Hardware:** NVIDIA DGX Spark with GB10 GPU (Blackwell sm_121)
- **OS:** Ubuntu 22.04+ (tested on Linux 6.11.0 ARM64)
- **CUDA:** 13.0 or later (driver 580.95.05+)
- **Disk Space:** ~50GB free
- **RAM:** 8GB+ recommended during build
## What Gets Installed
Installed to `./vllm-install` (or your custom directory):
- **Python 3.12** virtual environment at `.vllm/`
- **PyTorch 2.9.0+cu130** with full CUDA 13.0 support
- **Triton 3.5.0+git** from main branch (pre-release with Blackwell support)
- **vLLM 0.11.1rc3+** with all Blackwell-specific patches
- **Helper scripts** for managing vLLM server
- **Environment activation** script (`vllm_env.sh`)
## Usage
All examples assume you're in the installation directory (default: `./vllm-install`).
### Activate Environment
```bash
cd vllm-install
source vllm_env.sh
```
### Start vLLM Server
```bash
./vllm-serve.sh # Default: Qwen2.5-0.5B on port 8000
./vllm-serve.sh "facebook/opt-125m" 8001 # Custom model and port
```
### Check Server Status
```bash
./vllm-status.sh
```
### Stop Server
```bash
./vllm-stop.sh
```
### Test API
```bash
# List models
curl http://localhost:8000/v1/models
# Generate completion
curl http://localhost:8000/v1/completions \
-H "Content-Type: application/json" \
-d '{
"model": "Qwen/Qwen2.5-0.5B-Instruct",
"prompt": "Hello, how are you?",
"max_tokens": 50
}'
```
### Python API
```python
from vllm import LLM, SamplingParams
llm = LLM(
    model="Qwen/Qwen2.5-0.5B-Instruct",
    trust_remote_code=True,
    gpu_memory_utilization=0.9
)

prompts = ["Tell me about DGX Spark"]
sampling_params = SamplingParams(temperature=0.7, max_tokens=100)

outputs = llm.generate(prompts, sampling_params)
print(outputs[0].outputs[0].text)
```
## Critical Fixes Applied
This installer automatically applies the following critical fixes:
### 1. CMakeLists.txt SM100/SM120 MOE Kernel Fix
**Issue:** vLLM's MOE kernels for SM100/SM120 Blackwell architectures were incomplete
**Fix:** Added `12.0f` and `12.1a` to SCALED_MM_ARCHS in CMakeLists.txt
```cmake
# CUDA 13.0+ path (line ~671)
# Before
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
# After
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
# Older CUDA path (line ~673)
# Before
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
# After
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;12.1a" "${CUDA_ARCHS}")
```
### 2. pyproject.toml License Field Format
**Issue:** Newer setuptools requires structured license format
**Fix:** Convert license string to dict format in both vLLM and flashinfer-python
```toml
# Before
license = "Apache-2.0"
license-files = ["LICENSE"]
# After
license = {text = "Apache-2.0"}
```
**Applied to:**
- vLLM's pyproject.toml
- flashinfer-python's pyproject.toml (patched during build)
### 3. GPT-OSS Triton MOE Kernels for Qwen3/gpt-oss Support
**Issue:** vLLM's GPT-OSS MOE kernel implementation uses deprecated Triton routing API
**Fix:** Update to new Triton kernel API (topk and SparseMatrix)
**Changes:**
- Replace deprecated `routing()` with `triton_topk()`
- Replace deprecated `routing_from_bitmatrix()` with `SparseMatrix()`
- Add support for `GatherIndx`, `ScatterIndx`, and new ragged tensor metadata
**Enables support for:**
- Qwen3 models with MOE architecture
- gpt-oss models using Triton kernels
- Latest Triton kernel optimizations for Blackwell
### 4. Triton Main Branch Requirement
**Issue:** Official Triton 3.5.0 release has bugs with sm_121a
**Fix:** Build Triton from main branch with latest Blackwell fixes
## Architecture-Specific Configuration
The installer sets these critical environment variables:
```bash
TORCH_CUDA_ARCH_LIST=12.1a # Blackwell sm_121
VLLM_USE_FLASHINFER_MXFP4_MOE=1 # Enable FlashInfer MOE optimization
TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas # CUDA PTX assembler
TIKTOKEN_CACHE_DIR=$INSTALL_DIR/.tiktoken_cache # Cache tiktoken encodings locally
```
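After sourcing `vllm_env.sh` you can confirm they are in effect:
```bash
source vllm-install/vllm_env.sh
env | grep -E 'TORCH_CUDA_ARCH_LIST|VLLM_USE_FLASHINFER_MXFP4_MOE|TRITON_PTXAS_PATH|TIKTOKEN_CACHE_DIR'
```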
## Cluster Mode Setup
To set up multi-node vLLM cluster:
1. Run this installer on all nodes
2. Follow [CLUSTER.md](./CLUSTER.md) for configuration
## Troubleshooting
### Build Fails with "TypeError: can only concatenate str (not 'NoneType') to str"
This is a known Triton editable-mode build issue. The installer works around this by:
- Building Triton in non-editable mode
- Or copying pre-built Triton from another node
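For reference, this is the non-editable build the installer performs (run from the Triton checkout inside the installation directory):
```bash
cd vllm-install/triton
source ../vllm_env.sh
python -m pip install --no-build-isolation -v .
```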
### Symbol Error: cutlass_moe_mm_sm100
**Symptom:** `ImportError: undefined symbol: _Z20cutlass_moe_mm_sm100`
**Solution:** Ensure CMakeLists.txt fix is applied (done automatically by installer)
### PyTorch CUDA Capability Warning
**Symptom:** Warning about GPU capability 12.1 vs PyTorch max 12.0
**Status:** Harmless warning - PyTorch 2.9.0+cu130 works correctly with GB10
### ImportError: No module named 'vllm'
**Solution:**
```bash
source vllm-install/vllm_env.sh
python -c "import vllm; print(vllm.__version__)"
```
## File Structure
```
vllm-install/
├── .vllm/ # Python virtual environment
├── vllm/ # vLLM source (editable install)
├── triton/ # Triton source
├── vllm_env.sh # Environment activation script
├── vllm-serve.sh # Start server
├── vllm-stop.sh # Stop server
├── vllm-status.sh # Check status
└── vllm-server.log # Server logs
```
## Manual Installation
If you prefer to understand each step:
```bash
# 1. Install uv package manager
curl -LsSf https://astral.sh/uv/install.sh | sh
export PATH="$HOME/.local/bin:$PATH"
# 2. Create installation directory and Python virtual environment
mkdir -p vllm-install && cd vllm-install
uv venv .vllm --python 3.12
source .vllm/bin/activate
# 3. Install PyTorch with CUDA 13.0
uv pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu130
# 4. Clone and build Triton from main
git clone https://github.com/triton-lang/triton.git
cd triton
uv pip install pip cmake ninja pybind11
TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas python -m pip install --no-build-isolation .
# 5. Install additional dependencies
uv pip install xgrammar setuptools-scm apache-tvm-ffi==0.1.0b15 --prerelease=allow
# 6. Clone vLLM
cd ..
git clone --recursive https://github.com/vllm-project/vllm.git
cd vllm
git checkout v0.11.1rc3
# 7. Apply fixes (see scripts/apply-fixes.sh)
# 8. Build vLLM (see install.sh for full process)
```
## Version Information
- **vLLM:** 0.11.1rc4.dev6+g66a168a19.d20251026
- **PyTorch:** 2.9.0+cu130
- **Triton:** 3.5.0+git4caa0328
- **CUDA:** 13.0
- **Python:** 3.12.3
- **Target Architecture:** sm_121 (Blackwell GB10)
## Contributing
Issues and pull requests welcome! This installer is maintained by the DGX Spark community.
## References
- [NVIDIA Forum Discussion](https://forums.developer.nvidia.com/t/run-vllm-in-spark/348862)
- [vLLM GitHub](https://github.com/vllm-project/vllm)
- [Triton GitHub](https://github.com/triton-lang/triton)
## License
MIT License - See [LICENSE](./LICENSE)
## Acknowledgments
Developed and tested on NVIDIA DGX Spark systems. Special thanks to the vLLM and Triton communities.

246
SUMMARY.md Normal file

@@ -0,0 +1,246 @@
# Repository Summary
## Overview
This repository provides a **production-ready, one-command installation** of vLLM for NVIDIA DGX Spark systems with Blackwell GB10 GPUs (sm_121 architecture).
## What's Included
### Core Files
1. **install.sh** (500+ lines)
- Fully automated installation script
- Pre-flight system checks
- 8-step installation pipeline
- Post-installation testing
- Command-line argument support
2. **README.md** (300+ lines)
- Quick start guide
- System requirements
- Usage examples
- Critical fixes documentation
- Troubleshooting guide
3. **CLUSTER.md** (~380 lines)
- Multi-node setup instructions
- Ray cluster configuration
- Tensor/pipeline parallelism
- Performance tuning
- Load balancing examples
4. **requirements.txt**
- Complete dependency list
- PyTorch 2.9.0+cu130
- All required packages
### Helper Scripts (scripts/)
- **vllm-serve.sh** - Start vLLM server with configurable model/port
- **vllm-stop.sh** - Gracefully stop server
- **vllm-status.sh** - Check server status and logs
### Examples (examples/)
- **basic_inference.py** - Simple Python API usage
- **api_client.py** - OpenAI-compatible REST API client
- **README.md** - Usage instructions and API examples
### Configuration
- **.gitignore** - Excludes build artifacts, venvs, logs
- **LICENSE** - MIT license
## Technical Specifications
### Target Platform
- **Hardware:** NVIDIA DGX Spark with GB10 GPU
- **Architecture:** Blackwell sm_121 (compute capability 12.1)
- **OS:** Ubuntu 22.04+ ARM64
- **CUDA:** 13.0+ (driver 580.95.05+)
### Software Stack
- **Python:** 3.12.3
- **PyTorch:** 2.9.0+cu130
- **Triton:** 3.5.0+git (from main branch)
- **vLLM:** 0.11.1rc4+
- **Package Manager:** uv (fast Python package installer)
### Critical Fixes Applied
1. **CMakeLists.txt (lines 490 and 671)**
- Added `12.0f` to SCALED_MM_ARCHS for SM100 MOE kernels
- Enables Blackwell GPU compilation
2. **pyproject.toml**
- Changed `license = "Apache-2.0"` to `license = {text = "Apache-2.0"}`
- Removed deprecated `license-files` field
- Compatible with setuptools 77.0+
3. **Triton Build**
- Must use main branch (not release 3.5.0)
- Non-editable install to avoid setuptools bug
- Custom PTXAS path for CUDA integration
### Environment Variables
```bash
TORCH_CUDA_ARCH_LIST=12.1a # Blackwell architecture
VLLM_USE_FLASHINFER_MXFP4_MOE=1 # Enable FlashInfer optimization
TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas # CUDA PTX assembler
```
## Installation Overview
The `install.sh` script performs these steps:
1. **Pre-flight Checks**
- Verify ARM64 architecture
- Check NVIDIA GPU (GB10)
- Validate CUDA 13.0+
- Ensure 50GB+ disk space
2. **Install uv Package Manager**
- Fast Python package installer
- Required for efficient dependency resolution
3. **Create Virtual Environment**
- Python 3.12 virtual environment
- Isolated from system packages
4. **Install PyTorch**
- PyTorch 2.9.0 with CUDA 13.0 bindings
- Verify CUDA availability
5. **Build Triton**
- Clone from GitHub main branch
- Build with Blackwell support
- Non-editable install
6. **Install Dependencies**
- xgrammar, setuptools-scm
- apache-tvm-ffi (prerelease)
- Build tools
7. **Clone and Fix vLLM**
- Clone v0.11.1rc3
- Apply CMakeLists.txt fix
- Apply pyproject.toml fix
- Configure use_existing_torch
8. **Build vLLM**
- 15-20 minute compilation
- All CUDA kernels for Blackwell
- Editable install for development
9. **Create Helper Scripts**
- Environment activation script
- Server management scripts
- Logging configuration
10. **Post-Installation Tests**
- Import vLLM
- Check CUDA availability
- Verify GPU detection
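A typical invocation of that pipeline with non-default options (the directory is just an example) looks like:
```bash
./install.sh --install-dir /data/vllm-install --skip-tests
```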
## Quick Start
```bash
# One-command installation
curl -fsSL https://raw.githubusercontent.com/eelbaz/dgx-spark-vllm-setup/main/install.sh | bash
# Or clone and run
git clone https://github.com/eelbaz/dgx-spark-vllm-setup.git
cd dgx-spark-vllm-setup
./install.sh
# Activate environment (assuming installation in current directory)
cd vllm-install
source vllm_env.sh
# Start server
./vllm-serve.sh
# Test API
curl http://localhost:8000/v1/models
```
## Repository Structure
```
dgx-spark-vllm-setup/
├── README.md # Main documentation
├── CLUSTER.md # Multi-node setup guide
├── SUMMARY.md # This file
├── LICENSE # MIT license
├── .gitignore # Git ignore rules
├── install.sh # Main installation script
├── requirements.txt # Python dependencies
├── scripts/
│ ├── vllm-serve.sh # Start vLLM server
│ ├── vllm-stop.sh # Stop server
│ └── vllm-status.sh # Check status
└── examples/
├── README.md # Examples documentation
├── basic_inference.py # Python API example
└── api_client.py # REST API example
```
## Known Issues & Workarounds
### Triton Editable Build Fails
**Error:** `TypeError: can only concatenate str (not 'NoneType') to str`
**Workaround:** Use non-editable install (`uv pip install --no-build-isolation .`)
### PyTorch CUDA Capability Warning
**Warning:** GPU capability 12.1 vs PyTorch max 12.0
**Status:** Harmless - PyTorch 2.9.0+cu130 works correctly with GB10
### apache-tvm-ffi Prerelease
**Error:** `No solution found when resolving dependencies`
**Fix:** Use `--prerelease=allow` flag with uv pip install
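For reference, the exact form the installer uses:
```bash
uv pip install xgrammar setuptools-scm apache-tvm-ffi==0.1.0b15 --prerelease=allow
```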
## Testing Status
- [OK] Single-node installation on spark-alpha.local
- [OK] Single-node installation on spark-omega.local
- [OK] vLLM server startup and API functionality
- [OK] Model inference (Qwen/Qwen2.5-0.5B-Instruct)
- [IN PROGRESS] Multi-node cluster mode (documented, not yet tested)
## Future Enhancements
- [ ] Add cluster mode testing results
- [ ] Include performance benchmarks
- [ ] Add Dockerfile for containerized deployment
- [ ] Create Ansible playbook for multi-node automation
- [ ] Add monitoring and logging setup (Prometheus/Grafana)
- [ ] Include model quantization examples (AWQ, GPTQ)
## Contributing
Contributions welcome! Please open issues or pull requests on GitHub.
## Community & Support
- **GitHub Issues:** Report bugs and feature requests
- **NVIDIA Forum:** [DGX Spark vLLM Discussion](https://forums.developer.nvidia.com/t/run-vllm-in-spark/348862)
- **vLLM Docs:** [Official Documentation](https://docs.vllm.ai/)
## License
MIT License - See LICENSE file for details.
## Acknowledgments
Developed and tested on NVIDIA DGX Spark systems. Special thanks to:
- vLLM project team
- Triton compiler team
- NVIDIA DGX Spark community
- Claude Code (AI assistant) for documentation automation
---
**Version:** 1.0.0
**Last Updated:** 2025-10-26
**Tested On:** DGX Spark with GB10, CUDA 13.0, Ubuntu 22.04 ARM64

225
examples/README.md Normal file

@@ -0,0 +1,225 @@
# vLLM Examples for DGX Spark
This directory contains example scripts demonstrating various ways to use vLLM on DGX Spark systems.
## Prerequisites
Ensure vLLM is installed and the environment is activated:
```bash
# Assuming vllm-install is in your home directory
source ~/vllm-install/vllm_env.sh
```
## Examples
### 1. Basic Inference (`basic_inference.py`)
Simple text generation using the vLLM Python API.
**Usage:**
```bash
python basic_inference.py
```
**What it demonstrates:**
- Loading a model with vLLM
- Configuring sampling parameters
- Generating multiple completions
- Batch processing
### 2. API Client (`api_client.py`)
Using vLLM's OpenAI-compatible REST API.
**Prerequisites:**
Start the vLLM server first:
```bash
cd ~/vllm-install
./vllm-serve.sh
```
**Usage:**
```bash
python api_client.py
```
**What it demonstrates:**
- Listing available models
- Simple text completion
- Chat completion
- Streaming responses
- HTTP API interaction
### 3. Batch Processing (`batch_processing.py`)
Efficient processing of large batches of prompts.
**Usage:**
```bash
python batch_processing.py
```
**What it demonstrates:**
- High-throughput batch inference
- Dynamic batching
- Memory-efficient processing
- Performance monitoring
## Customization
### Change Model
Edit the model name in any example:
```python
llm = LLM(
    model="meta-llama/Llama-3.1-8B-Instruct",  # Change this
    trust_remote_code=True,
    gpu_memory_utilization=0.9
)
```
### Adjust Sampling Parameters
Modify `SamplingParams` for different generation behavior:
```python
sampling_params = SamplingParams(
    temperature=0.7,          # Lower = more deterministic (0.0-1.0)
    top_p=0.95,               # Nucleus sampling threshold
    max_tokens=100,           # Maximum tokens to generate
    top_k=50,                 # Top-k sampling
    repetition_penalty=1.1    # Penalize repetition
)
```
### GPU Memory Management
Adjust memory utilization:
```python
llm = LLM(
    model="...",
    gpu_memory_utilization=0.9,  # Use 90% of GPU memory (0.0-1.0)
    max_model_len=2048           # Maximum sequence length
)
```
## API Server Examples
### cURL Examples
**List models:**
```bash
curl http://localhost:8000/v1/models
```
**Simple completion:**
```bash
curl http://localhost:8000/v1/completions \
-H "Content-Type: application/json" \
-d '{
"model": "Qwen/Qwen2.5-0.5B-Instruct",
"prompt": "The meaning of life is",
"max_tokens": 50,
"temperature": 0.7
}'
```
**Chat completion:**
```bash
curl http://localhost:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "Qwen/Qwen2.5-0.5B-Instruct",
"messages": [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "What is DGX Spark?"}
],
"max_tokens": 100,
"temperature": 0.7
}'
```
**Streaming completion:**
```bash
curl http://localhost:8000/v1/completions \
-H "Content-Type: application/json" \
-d '{
"model": "Qwen/Qwen2.5-0.5B-Instruct",
"prompt": "Write a story about",
"max_tokens": 100,
"stream": true
}'
```
## Tested Models
These models work well on DGX Spark GB10:
- `Qwen/Qwen2.5-0.5B-Instruct` (small, fast)
- `Qwen/Qwen2.5-7B-Instruct` (balanced)
- `meta-llama/Llama-3.1-8B-Instruct` (high quality)
- `meta-llama/Llama-3.1-70B-Instruct` (requires tensor parallelism)
## Performance Tips
1. **Use GPU memory efficiently:**
- Set `gpu_memory_utilization=0.95` for maximum throughput
- Lower for models close to GPU memory limit
2. **Batch processing:**
- Process multiple prompts together
- vLLM automatically optimizes batch sizes
3. **Quantization:**
- For larger models, use quantization:
```python
llm = LLM(model="...", quantization="awq")
```
4. **Tensor parallelism:**
- For models > 20GB, use multiple GPUs:
```python
llm = LLM(model="...", tensor_parallel_size=2)
```
## Troubleshooting
### Out of Memory
Reduce `max_model_len` or `gpu_memory_utilization`:
```python
llm = LLM(
    model="...",
    gpu_memory_utilization=0.8,
    max_model_len=2048
)
```
### Slow Generation
Check that vLLM imports correctly and that the GPU is actually being used:
```bash
python -c "import vllm; print(vllm.__version__)"
nvidia-smi # Check GPU utilization
```
### Connection Refused (API)
Ensure server is running:
```bash
cd ~/vllm-install
./vllm-status.sh
```
## More Resources
- [vLLM Documentation](https://docs.vllm.ai/)
- [OpenAI API Compatibility](https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html)
- [Main README](../README.md)
- [Cluster Setup](../CLUSTER.md)

160
examples/api_client.py Normal file

@@ -0,0 +1,160 @@
#!/usr/bin/env python3
"""
vLLM OpenAI-Compatible API Client Example
Demonstrates using vLLM's OpenAI-compatible API endpoints
"""
import requests
import json
from typing import Dict, List
class VLLMClient:
    """Simple client for vLLM OpenAI-compatible API"""

    def __init__(self, base_url: str = "http://localhost:8000"):
        self.base_url = base_url.rstrip('/')

    def list_models(self) -> List[Dict]:
        """List available models"""
        response = requests.get(f"{self.base_url}/v1/models")
        response.raise_for_status()
        return response.json()

    def complete(
        self,
        prompt: str,
        model: str = None,
        max_tokens: int = 100,
        temperature: float = 0.7,
        stream: bool = False
    ) -> Dict:
        """Generate completion"""
        # Get model name if not specified
        if model is None:
            models = self.list_models()
            model = models['data'][0]['id']
        payload = {
            "model": model,
            "prompt": prompt,
            "max_tokens": max_tokens,
            "temperature": temperature,
            "stream": stream
        }
        response = requests.post(
            f"{self.base_url}/v1/completions",
            json=payload,
            headers={"Content-Type": "application/json"},
            stream=stream
        )
        response.raise_for_status()
        if stream:
            return response.iter_lines()
        else:
            return response.json()

    def chat(
        self,
        messages: List[Dict[str, str]],
        model: str = None,
        max_tokens: int = 100,
        temperature: float = 0.7,
        stream: bool = False
    ) -> Dict:
        """Generate chat completion"""
        # Get model name if not specified
        if model is None:
            models = self.list_models()
            model = models['data'][0]['id']
        payload = {
            "model": model,
            "messages": messages,
            "max_tokens": max_tokens,
            "temperature": temperature,
            "stream": stream
        }
        response = requests.post(
            f"{self.base_url}/v1/chat/completions",
            json=payload,
            headers={"Content-Type": "application/json"},
            stream=stream
        )
        response.raise_for_status()
        if stream:
            return response.iter_lines()
        else:
            return response.json()


def main():
    # Initialize client
    client = VLLMClient("http://localhost:8000")
    print("="*60)
    print("vLLM API Client Examples")
    print("="*60)

    # Example 1: List models
    print("\n1. Listing available models...")
    models = client.list_models()
    for model in models['data']:
        print(f"  - {model['id']}")

    # Example 2: Simple completion
    print("\n2. Simple completion...")
    result = client.complete(
        prompt="The capital of France is",
        max_tokens=10,
        temperature=0.0
    )
    print(f"  Prompt: The capital of France is")
    print(f"  Response: {result['choices'][0]['text']}")

    # Example 3: Chat completion
    print("\n3. Chat completion...")
    messages = [
        {"role": "system", "content": "You are a helpful AI assistant."},
        {"role": "user", "content": "What is the Blackwell GPU architecture?"}
    ]
    result = client.chat(
        messages=messages,
        max_tokens=100,
        temperature=0.7
    )
    print(f"  User: {messages[1]['content']}")
    print(f"  Assistant: {result['choices'][0]['message']['content']}")

    # Example 4: Streaming completion
    print("\n4. Streaming completion...")
    print("  Prompt: Write a short poem about AI")
    print("  Response: ", end="", flush=True)
    stream = client.complete(
        prompt="Write a short poem about AI",
        max_tokens=50,
        temperature=0.8,
        stream=True
    )
    for line in stream:
        if line:
            try:
                data = json.loads(line.decode('utf-8').removeprefix('data: '))
                if 'choices' in data and len(data['choices']) > 0:
                    token = data['choices'][0].get('text', '')
                    print(token, end="", flush=True)
            except (json.JSONDecodeError, AttributeError):
                pass
    print("\n")
    print("="*60)


if __name__ == "__main__":
    main()

48
examples/basic_inference.py Normal file

@@ -0,0 +1,48 @@
#!/usr/bin/env python3
"""
Basic vLLM Inference Example for DGX Spark
Demonstrates simple text generation using the vLLM Python API
"""
from vllm import LLM, SamplingParams


def main():
    # Initialize the model
    # Use a smaller model for testing, replace with your preferred model
    print("Loading model...")
    llm = LLM(
        model="Qwen/Qwen2.5-0.5B-Instruct",
        trust_remote_code=True,
        gpu_memory_utilization=0.9,
        max_model_len=2048
    )

    # Define prompts
    prompts = [
        "What is the NVIDIA DGX Spark?",
        "Explain the Blackwell GPU architecture in simple terms.",
        "Write a haiku about artificial intelligence."
    ]

    # Configure sampling parameters
    sampling_params = SamplingParams(
        temperature=0.7,
        top_p=0.95,
        max_tokens=100,
        stop=["</s>", "\n\n\n"]
    )

    # Generate responses
    print("\nGenerating responses...\n")
    outputs = llm.generate(prompts, sampling_params)

    # Print results
    for i, output in enumerate(outputs):
        print(f"{'='*60}")
        print(f"Prompt {i+1}: {prompts[i]}")
        print(f"{'-'*60}")
        print(f"Response: {output.outputs[0].text}")
        print(f"{'='*60}\n")


if __name__ == "__main__":
    main()

777
install.sh Normal file

@@ -0,0 +1,777 @@
#!/bin/bash
################################################################################
# vLLM Installation Script for NVIDIA DGX Spark (Blackwell GB10)
# Version: 1.1.0
# Author: DGX Spark Community
# License: MIT
#
# This script automates the complete installation of vLLM on DGX Spark systems
# with Blackwell GB10 GPUs, including all necessary fixes and optimizations.
#
# Usage: ./install.sh [OPTIONS]
# Can also be run via: curl -fsSL <url>/install.sh | bash
#
# Options:
# --install-dir DIR Installation directory (default: $PWD/vllm-install)
# --vllm-version HASH vLLM git commit (default: 66a168a19 - tested with Blackwell)
# --python-version VER Python version (default: 3.12)
# --skip-tests Skip post-installation tests
# --help Show this help message
################################################################################
set -e # Exit on error
set -o pipefail # Catch errors in pipes
# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
# Default configuration
INSTALL_DIR="$PWD/vllm-install"
VLLM_VERSION="66a168a197ba214a5b70a74fa2e713c9eeb3251a" # vLLM commit with Blackwell fixes
TRITON_VERSION="4caa0328bf8df64896dd5f6fb9df41b0eb2e750a" # Triton commit that works with Blackwell
PYTHON_VERSION="3.12"
SKIP_TESTS=false
# GitHub raw URL for downloading repo assets when run outside the repo
REPO_RAW_URL="https://raw.githubusercontent.com/eelbaz/dgx-spark-vllm-setup/main"
# Script directory (only meaningful when run from a local clone)
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" 2>/dev/null && pwd || echo "")"
################################################################################
# Helper Functions
################################################################################
log_info() {
echo -e "${BLUE}[INFO]${NC} $1"
}
log_success() {
echo -e "${GREEN}[SUCCESS]${NC} $1"
}
log_warning() {
echo -e "${YELLOW}[WARNING]${NC} $1"
}
log_error() {
echo -e "${RED}[ERROR]${NC} $1"
}
print_header() {
echo ""
echo -e "${BLUE}========================================${NC}"
echo -e "${BLUE}$1${NC}"
echo -e "${BLUE}========================================${NC}"
echo ""
}
check_command() {
if command -v "$1" &> /dev/null; then
return 0
else
return 1
fi
}
# Auto-confirm when stdin is not a terminal (e.g. curl | bash)
confirm_or_default_yes() {
local prompt="$1"
if [ -t 0 ]; then
read -p "$prompt (y/N) " -n 1 -r
echo
[[ $REPLY =~ ^[Yy]$ ]]
else
log_info "Non-interactive mode: auto-confirming"
return 0
fi
}
################################################################################
# Pre-flight Checks
################################################################################
preflight_checks() {
print_header "Pre-flight System Checks"
log_info "Checking system requirements..."
# Check if running on ARM64
ARCH=$(uname -m)
if [[ "$ARCH" != "aarch64" ]] && [[ "$ARCH" != "arm64" ]]; then
log_warning "This script is designed for ARM64 architecture (DGX Spark)"
log_warning "Detected architecture: $ARCH"
fi
# Check for NVIDIA GPU
if ! check_command nvidia-smi; then
log_error "nvidia-smi not found. NVIDIA drivers required."
exit 1
fi
# Check GPU type
GPU_NAME=$(nvidia-smi --query-gpu=name --format=csv,noheader | head -1)
log_info "Detected GPU: $GPU_NAME"
if [[ ! "$GPU_NAME" =~ "GB10" ]]; then
log_warning "This script is optimized for NVIDIA GB10 (Blackwell)"
log_warning "Your GPU: $GPU_NAME"
if ! confirm_or_default_yes "Continue anyway?"; then
exit 1
fi
fi
# Check CUDA
if ! check_command nvcc; then
# Check common CUDA install locations
if [ -x "/usr/local/cuda/bin/nvcc" ]; then
export PATH="/usr/local/cuda/bin:$PATH"
log_info "Found CUDA at /usr/local/cuda, added to PATH"
else
log_error "CUDA toolkit not found. Please install CUDA 13.0+"
exit 1
fi
fi
CUDA_VERSION=$(nvcc --version | grep "release" | awk '{print $6}' | cut -d',' -f1)
log_info "CUDA version: $CUDA_VERSION"
# Check for Python development headers (required for Triton build)
PYTHON_INCLUDE="/usr/include/python${PYTHON_VERSION}/patchlevel.h"
if [ ! -f "$PYTHON_INCLUDE" ]; then
log_warning "Python ${PYTHON_VERSION} development headers not found"
log_info "Installing python${PYTHON_VERSION}-dev (requires sudo)..."
if sudo apt-get install -y "python${PYTHON_VERSION}-dev"; then
log_success "python${PYTHON_VERSION}-dev installed"
else
log_error "Failed to install python${PYTHON_VERSION}-dev"
log_error "Please install manually: sudo apt install python${PYTHON_VERSION}-dev"
exit 1
fi
else
log_info "Python ${PYTHON_VERSION} development headers found"
fi
# Check disk space (need ~50GB) on the filesystem that will hold the installation
AVAILABLE_SPACE=$(df -BG "$(dirname "$INSTALL_DIR")" 2>/dev/null | tail -1 | awk '{print $4}' | sed 's/G//')
if [[ "$AVAILABLE_SPACE" -lt 50 ]]; then
log_error "Insufficient disk space. Need at least 50GB, have ${AVAILABLE_SPACE}GB"
exit 1
fi
log_success "Pre-flight checks passed!"
}
################################################################################
# Install uv Package Manager
################################################################################
install_uv() {
print_header "Step 1/8: Installing uv Package Manager"
if check_command uv; then
UV_VERSION=$(uv --version | awk '{print $2}')
log_info "uv already installed: v$UV_VERSION"
else
log_info "Installing uv..."
curl -LsSf https://astral.sh/uv/install.sh | sh
export PATH="$HOME/.local/bin:$PATH"
log_success "uv installed successfully"
fi
# Verify installation
if ! check_command uv; then
log_error "uv installation failed"
exit 1
fi
}
################################################################################
# Create Python Virtual Environment
################################################################################
create_venv() {
print_header "Step 2/8: Creating Python Virtual Environment"
VENV_DIR="$INSTALL_DIR/.vllm"
if [ -d "$VENV_DIR" ]; then
log_warning "Virtual environment already exists at $VENV_DIR"
if confirm_or_default_yes "Remove and recreate?"; then
rm -rf "$VENV_DIR"
else
log_info "Using existing virtual environment"
return
fi
fi
log_info "Creating Python $PYTHON_VERSION virtual environment..."
mkdir -p "$INSTALL_DIR"
cd "$INSTALL_DIR"
uv venv .vllm --python "$PYTHON_VERSION"
# Upgrade setuptools to 77+ so PEP 639 license fields are supported
# (fixes flashinfer-python build failure)
log_info "Upgrading setuptools in venv for PEP 639 license support..."
uv pip install --python "$VENV_DIR/bin/python" --upgrade setuptools
log_success "Virtual environment created at $VENV_DIR"
}
################################################################################
# Install PyTorch
################################################################################
install_pytorch() {
print_header "Step 3/8: Installing PyTorch with CUDA 13.0"
source "$INSTALL_DIR/.vllm/bin/activate"
log_info "Installing latest PyTorch for cu130..."
uv pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu130
# Verify PyTorch installation
log_info "Verifying PyTorch installation..."
python -c "import torch; print('PyTorch version:', torch.__version__); print('CUDA available:', torch.cuda.is_available())"
log_success "PyTorch installed successfully"
}
################################################################################
# Clone and Build Triton
################################################################################
install_triton() {
print_header "Step 4/8: Installing Triton from Main Branch"
TRITON_DIR="$INSTALL_DIR/triton"
if [ -d "$TRITON_DIR" ]; then
log_info "Triton directory exists, updating..."
cd "$TRITON_DIR"
git fetch
else
log_info "Cloning Triton repository..."
cd "$INSTALL_DIR"
git clone https://github.com/triton-lang/triton.git
cd triton
fi
log_info "Checking out Triton commit $TRITON_VERSION (tested with Blackwell)..."
git checkout "$TRITON_VERSION"
git submodule update --init --recursive
log_info "Installing Triton build dependencies..."
source "$INSTALL_DIR/.vllm/bin/activate"
uv pip install pip cmake ninja pybind11
log_info "Building Triton (this takes ~5 minutes)..."
export TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas
export CMAKE_BUILD_PARALLEL_LEVEL=$(nproc)
set +e # handle the build result manually so the log hint below is shown despite 'set -e'
python -m pip install --no-build-isolation -v . 2>&1 | tee "$INSTALL_DIR/triton-build.log"
TRITON_BUILD_STATUS=${PIPESTATUS[0]}
set -e
if [ $TRITON_BUILD_STATUS -ne 0 ]; then
log_error "Triton build failed. See $INSTALL_DIR/triton-build.log for details"
exit 1
fi
# Record the installed triton version so we can protect it later
TRITON_INSTALLED_VERSION=$(python -c "import triton; print(triton.__version__)" 2>/dev/null || echo "unknown")
log_info "Triton version installed: $TRITON_INSTALLED_VERSION"
log_success "Triton installed successfully"
}
################################################################################
# Install Additional Dependencies
################################################################################
install_dependencies() {
print_header "Step 5/8: Installing Additional Dependencies"
source "$INSTALL_DIR/.vllm/bin/activate"
log_info "Installing xgrammar, setuptools-scm, and apache-tvm-ffi..."
uv pip install xgrammar setuptools-scm apache-tvm-ffi==0.1.0b15 --prerelease=allow
log_success "Dependencies installed successfully"
}
################################################################################
# Clone vLLM
################################################################################
clone_vllm() {
print_header "Step 6/8: Cloning vLLM Repository"
VLLM_DIR="$INSTALL_DIR/vllm"
if [ -d "$VLLM_DIR" ]; then
log_warning "vLLM directory already exists at $VLLM_DIR"
if confirm_or_default_yes "Remove and re-clone?"; then
rm -rf "$VLLM_DIR"
else
log_info "Using existing vLLM directory"
cd "$VLLM_DIR"
return
fi
fi
log_info "Cloning vLLM $VLLM_VERSION..."
cd "$INSTALL_DIR"
git clone --recursive https://github.com/vllm-project/vllm.git
cd vllm
git checkout "$VLLM_VERSION"
git submodule update --init --recursive
log_success "vLLM repository cloned"
}
################################################################################
# Apply Critical Fixes
################################################################################
apply_fixes() {
print_header "Step 7/8: Applying Critical Fixes"
cd "$INSTALL_DIR/vllm"
# Fix 1: pyproject.toml license field
log_info "Fixing pyproject.toml license field..."
sed -i 's/^license = "Apache-2.0"$/license = {text = "Apache-2.0"}/' pyproject.toml
sed -i '/^license-files = /d' pyproject.toml
# Fix 2: CMakeLists.txt SM100/SM120 MOE kernels (check if already applied)
if grep -q 'cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f"' CMakeLists.txt; then
log_info "CMakeLists.txt SM100/SM120 fix already applied"
else
log_info "Applying CMakeLists.txt SM100/SM120 fix..."
sed -i 's/cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f"/cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f"/' CMakeLists.txt
sed -i 's/cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a"/cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;12.1a"/' CMakeLists.txt
fi
# Fix 3: flashinfer-python license field (pre-emptive fix)
log_info "Pre-fixing flashinfer-python license issue..."
rm -rf "$HOME/.cache/uv/sdists-v9/pypi/flashinfer-python" 2>/dev/null || true
# Fix 4: GPT-OSS Triton MOE kernels for Qwen3/gpt-oss support
# Try local repo patches/ first, then download from GitHub
PATCH_FILE=""
if [ -f "$SCRIPT_DIR/patches/gpt_oss_triton_moe.patch" ]; then
PATCH_FILE="$SCRIPT_DIR/patches/gpt_oss_triton_moe.patch"
else
log_info "Downloading GPT-OSS Triton MOE patch from repository..."
PATCH_FILE="$INSTALL_DIR/gpt_oss_triton_moe.patch"
if curl -fsSL "$REPO_RAW_URL/patches/gpt_oss_triton_moe.patch" -o "$PATCH_FILE" 2>/dev/null; then
log_info "Patch downloaded successfully"
else
PATCH_FILE=""
log_warning "Could not download GPT-OSS Triton MOE patch (skipping)"
fi
fi
if [ -n "$PATCH_FILE" ] && [ -f "$PATCH_FILE" ]; then
log_info "Applying GPT-OSS Triton MOE kernel patch for Qwen3/gpt-oss support..."
if patch --dry-run -p1 < "$PATCH_FILE" > /dev/null 2>&1; then
patch -p1 < "$PATCH_FILE"
log_success "GPT-OSS Triton MOE kernel patch applied"
else
log_warning "GPT-OSS Triton MOE kernel patch already applied or conflicts"
fi
fi
# Configure use_existing_torch
log_info "Configuring vLLM to use existing PyTorch..."
python3 use_existing_torch.py
log_success "All fixes applied successfully"
}
################################################################################
# Build and Install vLLM
################################################################################
build_vllm() {
print_header "Step 8/8: Building vLLM (15-20 minutes)"
cd "$INSTALL_DIR/vllm"
source "$INSTALL_DIR/.vllm/bin/activate"
# Set environment variables
export TORCH_CUDA_ARCH_LIST=12.1a
export VLLM_USE_FLASHINFER_MXFP4_MOE=1
export TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas
# Create a constraints file to prevent uv from replacing our
# custom-built Triton with a PyPI version
log_info "Creating constraints to protect pinned Triton build..."
TRITON_CONSTRAINT="$INSTALL_DIR/constraints.txt"
TRITON_INSTALLED=$(python -c "import importlib.metadata; print(importlib.metadata.version('triton'))" 2>/dev/null || echo "")
if [ -n "$TRITON_INSTALLED" ]; then
echo "triton==${TRITON_INSTALLED}" > "$TRITON_CONSTRAINT"
log_info "Pinning triton==${TRITON_INSTALLED} during vLLM build"
else
echo "" > "$TRITON_CONSTRAINT"
log_warning "Could not detect installed Triton version"
fi
log_info "Starting vLLM build..."
log_warning "This will take 15-20 minutes. Go grab a coffee!"
set +e # Don't exit on error, we'll handle it
UV_CONSTRAINT="$TRITON_CONSTRAINT" uv pip install \
--no-build-isolation --prerelease=allow -e . \
2>&1 | tee "$INSTALL_DIR/vllm-build.log"
BUILD_STATUS=${PIPESTATUS[0]}
set -e
if [ $BUILD_STATUS -ne 0 ]; then
if grep -q "flashinfer.*license.*must be valid" "$INSTALL_DIR/vllm-build.log"; then
log_warning "Build failed due to flashinfer-python license issue"
log_info "Upgrading setuptools and retrying..."
# Ensure setuptools is new enough
uv pip install --upgrade setuptools
# Also patch the cached flashinfer pyproject.toml as a belt-and-suspenders fix
find "$HOME/.cache/uv/sdists-v9/pypi/flashinfer-python" -name "pyproject.toml" 2>/dev/null | while read f; do
sed -i 's/^license = "Apache-2.0"$/license = {text = "Apache-2.0"}/' "$f"
sed -i '/^license-files = /d' "$f"
done
log_info "Retrying vLLM build..."
UV_CONSTRAINT="$TRITON_CONSTRAINT" uv pip install \
--no-build-isolation --prerelease=allow -e .
else
log_error "vLLM build failed. See $INSTALL_DIR/vllm-build.log for details"
exit 1
fi
fi
# Verify Triton wasn't replaced
TRITON_AFTER=$(python -c "import importlib.metadata; print(importlib.metadata.version('triton'))" 2>/dev/null || echo "unknown")
if [ -n "$TRITON_INSTALLED" ] && [ "$TRITON_AFTER" != "$TRITON_INSTALLED" ]; then
log_warning "Triton was changed during vLLM install: $TRITON_INSTALLED -> $TRITON_AFTER"
log_warning "Rebuilding pinned Triton from source..."
cd "$INSTALL_DIR/triton"
git checkout "$TRITON_VERSION"
export CMAKE_BUILD_PARALLEL_LEVEL=$(nproc)
python -m pip install --no-build-isolation --force-reinstall -v .
cd "$INSTALL_DIR/vllm"
fi
log_success "vLLM built successfully!"
}
################################################################################
# Create Helper Scripts
################################################################################
create_helper_scripts() {
print_header "Creating Helper Scripts"
# Create environment activation script
log_info "Creating vllm_env.sh..."
cat > "$INSTALL_DIR/vllm_env.sh" << 'ENVEOF'
#!/bin/bash
# vLLM Environment Configuration for DGX Spark
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "$SCRIPT_DIR/.vllm/bin/activate"
export TORCH_CUDA_ARCH_LIST=12.1a
export VLLM_USE_FLASHINFER_MXFP4_MOE=1
CUDA_PATH=$(ls -d /usr/local/cuda* 2>/dev/null | head -1)
export TRITON_PTXAS_PATH="$CUDA_PATH/bin/ptxas"
export PATH="$CUDA_PATH/bin:$PATH"
export LD_LIBRARY_PATH="$CUDA_PATH/lib64:$LD_LIBRARY_PATH"
# Cache tiktoken encodings to avoid re-downloading
export TIKTOKEN_CACHE_DIR="$SCRIPT_DIR/.tiktoken_cache"
mkdir -p "$TIKTOKEN_CACHE_DIR"
echo "=== vLLM Environment Active ==="
echo "Virtual env: $VIRTUAL_ENV"
echo "CUDA arch: $TORCH_CUDA_ARCH_LIST"
echo "Python: $(which python)"
echo "==============================="
ENVEOF
chmod +x "$INSTALL_DIR/vllm_env.sh"
# Create vllm-serve.sh (embedded so it works with curl|bash)
log_info "Creating vllm-serve.sh..."
cat > "$INSTALL_DIR/vllm-serve.sh" << 'SERVEEOF'
#!/bin/bash
# vLLM Server Startup Script for DGX Spark
# Usage: ./vllm-serve.sh <model_name> [port]
set -e
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
MODEL="${1:-Qwen/Qwen2.5-0.5B-Instruct}"
PORT="${2:-8000}"
VLLM_DIR="$SCRIPT_DIR/vllm"
ENV_SCRIPT="$SCRIPT_DIR/vllm_env.sh"
PID_FILE="$SCRIPT_DIR/.vllm-server.pid"
LOG_FILE="$SCRIPT_DIR/vllm-server.log"
# Check if server is already running
if [ -f "$PID_FILE" ]; then
PID=$(cat "$PID_FILE")
if ps -p $PID > /dev/null 2>&1; then
echo "ERROR: vLLM server is already running (PID: $PID)"
echo "Use ./vllm-stop.sh to stop it first"
exit 1
fi
fi
# Source environment
source "$ENV_SCRIPT"
echo "----------------------------------------------------------------------"
echo "Starting vLLM Server on DGX Spark"
echo "----------------------------------------------------------------------"
echo "Model: $MODEL"
echo "Port: $PORT"
echo "Log file: $LOG_FILE"
echo "PID file: $PID_FILE"
echo "----------------------------------------------------------------------"
# Start server in background
cd "$VLLM_DIR"
nohup python -m vllm.entrypoints.openai.api_server \
--model "$MODEL" \
--trust-remote-code \
--host 0.0.0.0 \
--port "$PORT" \
--gpu-memory-utilization 0.9 \
> "$LOG_FILE" 2>&1 &
echo $! > "$PID_FILE"
echo "OK: Server started with PID: $(cat $PID_FILE)"
echo "OK: Waiting for server to be ready..."
sleep 5
if ps -p $(cat "$PID_FILE") > /dev/null 2>&1; then
echo "OK: Server is running!"
echo ""
echo "Test with: curl http://localhost:$PORT/v1/models"
echo "View logs: tail -f $LOG_FILE"
echo "Stop server: ./vllm-stop.sh"
else
echo "ERROR: Server failed to start. Check logs: $LOG_FILE"
rm -f "$PID_FILE"
exit 1
fi
SERVEEOF
chmod +x "$INSTALL_DIR/vllm-serve.sh"
# Create vllm-stop.sh
log_info "Creating vllm-stop.sh..."
cat > "$INSTALL_DIR/vllm-stop.sh" << 'STOPEOF'
#!/bin/bash
# vLLM Server Stop Script for DGX Spark
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PID_FILE="$SCRIPT_DIR/.vllm-server.pid"
if [ ! -f "$PID_FILE" ]; then
echo "No vLLM server PID file found. Server may not be running."
exit 0
fi
PID=$(cat "$PID_FILE")
if ! ps -p $PID > /dev/null 2>&1; then
echo "vLLM server (PID: $PID) is not running. Cleaning up PID file."
rm -f "$PID_FILE"
exit 0
fi
echo "Stopping vLLM server (PID: $PID)..."
kill $PID
for i in {1..10}; do
if ! ps -p $PID > /dev/null 2>&1; then
echo "OK: Server stopped successfully"
rm -f "$PID_FILE"
exit 0
fi
sleep 1
done
if ps -p $PID > /dev/null 2>&1; then
echo "Server did not stop gracefully. Force killing..."
kill -9 $PID
sleep 1
if ! ps -p $PID > /dev/null 2>&1; then
echo "OK: Server force stopped"
rm -f "$PID_FILE"
else
echo "ERROR: Failed to stop server"
exit 1
fi
fi
STOPEOF
chmod +x "$INSTALL_DIR/vllm-stop.sh"
# Create vllm-status.sh
log_info "Creating vllm-status.sh..."
cat > "$INSTALL_DIR/vllm-status.sh" << 'STATUSEOF'
#!/bin/bash
# vLLM Server Status Script for DGX Spark
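# Usage: ./vllm-status.sh
# (no arguments; reports process info and the last 10 log lines)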
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PID_FILE="$SCRIPT_DIR/.vllm-server.pid"
LOG_FILE="$SCRIPT_DIR/vllm-server.log"
echo "----------------------------------------------------------------------"
echo "vLLM Server Status on DGX Spark"
echo "----------------------------------------------------------------------"
if [ ! -f "$PID_FILE" ]; then
echo "Status: NOT RUNNING (no PID file found)"
exit 0
fi
PID=$(cat "$PID_FILE")
if ! ps -p $PID > /dev/null 2>&1; then
echo "Status: NOT RUNNING (stale PID file)"
echo "Cleaning up PID file..."
rm -f "$PID_FILE"
exit 0
fi
echo "Status: RUNNING"
echo "PID: $PID"
echo "Started: $(ps -p $PID -o lstart= 2>/dev/null || echo 'Unknown')"
echo "CPU: $(ps -p $PID -o %cpu= 2>/dev/null || echo 'N/A')%"
echo "Memory: $(ps -p $PID -o %mem= 2>/dev/null || echo 'N/A')%"
echo ""
if [ -f "$LOG_FILE" ]; then
echo "Recent log entries (last 10 lines):"
echo "----------------------------------------------------------------------"
tail -n 10 "$LOG_FILE"
else
echo "Log file not found: $LOG_FILE"
fi
echo ""
echo "----------------------------------------------------------------------"
STATUSEOF
chmod +x "$INSTALL_DIR/vllm-status.sh"
log_success "Helper scripts created in $INSTALL_DIR"
}
################################################################################
# Post-Installation Tests
################################################################################
run_tests() {
if [ "$SKIP_TESTS" = true ]; then
log_info "Skipping post-installation tests"
return
fi
print_header "Post-Installation Tests"
source "$INSTALL_DIR/vllm_env.sh"
log_info "Test 1: Import vLLM..."
python -c "import vllm; print('vLLM version:', vllm.__version__)"
log_info "Test 2: Check CUDA availability..."
python -c "import torch; assert torch.cuda.is_available(), 'CUDA not available'; print('CUDA available')"
log_info "Test 3: Check GPU detection..."
python -c "import torch; print('GPU count:', torch.cuda.device_count()); print('GPU name:', torch.cuda.get_device_name(0))"
log_success "All tests passed!"
}
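# The same sanity checks can be re-run by hand later, for example:
#   source "$INSTALL_DIR/vllm_env.sh"
#   python -c "import vllm, torch; print(vllm.__version__, torch.cuda.is_available())"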
################################################################################
# Parse Command Line Arguments
################################################################################
parse_args() {
while [[ $# -gt 0 ]]; do
case $1 in
--install-dir)
INSTALL_DIR="$2"
shift 2
;;
--vllm-version)
VLLM_VERSION="$2"
shift 2
;;
--python-version)
PYTHON_VERSION="$2"
shift 2
;;
--skip-tests)
SKIP_TESTS=true
shift
;;
--help)
head -20 "$0" | grep "^#" | sed 's/^# //'
exit 0
;;
*)
log_error "Unknown option: $1"
log_info "Use --help for usage information"
exit 1
;;
esac
done
}
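# Example invocations (illustrative):
#   ./install.sh
#   ./install.sh --install-dir ~/vllm-install --skip-tests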
################################################################################
# Main Installation Flow
################################################################################
main() {
parse_args "$@"
print_header "vLLM Installation for DGX Spark (Blackwell GB10)"
log_info "Installation directory: $INSTALL_DIR"
log_info "vLLM version: $VLLM_VERSION"
log_info "Python version: $PYTHON_VERSION"
echo ""
preflight_checks
install_uv
create_venv
install_pytorch
install_triton
install_dependencies
clone_vllm
apply_fixes
build_vllm
create_helper_scripts
run_tests
print_header "Installation Complete!"
echo ""
log_success "vLLM has been successfully installed!"
echo ""
echo -e "${GREEN}Next steps:${NC}"
echo "1. Activate the environment:"
echo " ${BLUE}source $INSTALL_DIR/vllm_env.sh${NC}"
echo ""
echo "2. Start vLLM server:"
echo " ${BLUE}cd $INSTALL_DIR${NC}"
echo " ${BLUE}./vllm-serve.sh${NC}"
echo ""
echo "3. Test the API:"
echo " ${BLUE}curl http://localhost:8000/v1/models${NC}"
echo ""
echo "For more information, see README.md"
echo ""
}
# Run main function
main "$@"

View File

@@ -0,0 +1,77 @@
diff --git a/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py b/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py
index badedfc54..e05c0eea4 100644
--- a/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py
+++ b/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py
@@ -20,9 +20,16 @@ logger = init_logger(__name__)
if has_triton_kernels():
try:
import triton_kernels.swiglu
- from triton_kernels.matmul_ogs import FnSpecs, FusedActivation, matmul_ogs
- from triton_kernels.routing import RoutingData, routing, routing_from_bitmatrix
- from triton_kernels.tensor import Bitmatrix
+ from triton_kernels.matmul_ogs import (
+ FnSpecs,
+ FusedActivation,
+ GatherIndx,
+ RoutingData,
+ ScatterIndx,
+ matmul_ogs,
+ )
+ from triton_kernels.tensor import BIT, Bitmatrix, SparseMatrix, make_ragged_tensor_metadata
+ from triton_kernels.topk import topk as triton_topk
except (AttributeError, ImportError) as e:
logger.error(
"Failed to import Triton kernels. Please make sure your triton "
@@ -84,8 +91,17 @@ def triton_kernel_moe_forward(
global_num_experts: int = -1,
expert_map: torch.Tensor | None = None,
) -> torch.Tensor:
- routing_data, gather_idx, scatter_idx = routing(
- gating_output, topk, sm_first=not renormalize
+ # Use new topk API instead of deprecated routing
+ sm_first = not renormalize
+ if sm_first:
+ gating_output = torch.softmax(gating_output, dim=-1)
+ sparse_logits = triton_topk(
+ gating_output, topk, apply_softmax=not sm_first, y_indx=None, n_rows=None
+ )
+
+ # Convert to routing data using the existing make_routing_data function
+ routing_data, gather_idx, scatter_idx = make_routing_data(
+ sparse_logits.indx, sparse_logits.vals, gating_output.shape[-1]
)
return triton_kernel_fused_experts(
@@ -202,14 +218,29 @@ def make_routing_data(
bitmatrix_shape = [n_rows, bm_cols * 32]
bitmatrix_shape_max = [n_rows, None]
bitmatrix = Bitmatrix(
- bitmatrix, shape=bitmatrix_shape, shape_max=bitmatrix_shape_max, scratchpad=None
+ bitmatrix, dtype=BIT, shape=bitmatrix_shape, shape_max=bitmatrix_shape_max
)
# matmul_ogs expects invalid topk_weights to be -1s
topk_weights = torch.where(topk_ids == -1, -1.0, topk_weights)
- routing_data, gather_indx, scatter_indx = routing_from_bitmatrix(
- bitmatrix, topk_weights, topk_ids, num_local_experts, num_topk
+
+ # Use new SparseMatrix API instead of deprecated routing_from_bitmatrix
+ sparse_logits = SparseMatrix(indx=topk_ids, vals=topk_weights, mask=bitmatrix)
+ dispatch_indx = sparse_logits.mask_metadata.col_sorted_indx
+ combine_indx = sparse_logits.mask_metadata.row_sorted_indx
+ ragged_batch_metadata = make_ragged_tensor_metadata(
+ sparse_logits.mask_metadata.col_sum, dispatch_indx.shape[0]
+ )
+ gate_scal = sparse_logits.vals.flatten()[combine_indx]
+ routing_data = RoutingData(
+ gate_scal,
+ ragged_batch_metadata.block_sizes,
+ num_local_experts,
+ num_topk,
+ ragged_batch_metadata,
)
+ gather_indx = GatherIndx(combine_indx, dispatch_indx)
+ scatter_indx = ScatterIndx(dispatch_indx, combine_indx)
return routing_data, gather_indx, scatter_indx

28
requirements.txt Normal file
View File

@@ -0,0 +1,28 @@
# Core Dependencies for vLLM on DGX Spark (Blackwell GB10)
# Note: This file is for reference only. The install.sh script handles
# all dependency installation with proper ordering and build flags.
# PyTorch with CUDA 13.0 support (installs latest available on cu130 index)
--index-url https://download.pytorch.org/whl/cu130
torch
torchvision
torchaudio
# Triton (must be built from source - see install.sh)
# Pinned to commit 4caa0328bf8df64896dd5f6fb9df41b0eb2e750a
# triton @ git+https://github.com/triton-lang/triton.git@4caa0328
# vLLM dependencies
xgrammar>=0.1.26
setuptools-scm>=9.2.2
setuptools>=77.0.0 # Required for PEP 639 license field support
apache-tvm-ffi==0.1.0b15 # Pre-release required
# Build dependencies
pybind11>=3.0.0
ninja>=1.13.0
# Optional but recommended
flashinfer-python>=0.4.1
transformers>=4.57.0
huggingface-hub>=0.36.0

67
scripts/vllm-serve.sh Normal file
View File

@@ -0,0 +1,67 @@
#!/bin/bash
# vLLM Server Startup Script for DGX Spark
# Usage: ./vllm-serve.sh <model_name> [port]
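# Example (illustrative; the model below is simply the script's default):
#   ./vllm-serve.sh Qwen/Qwen2.5-0.5B-Instruct 8001
#   curl http://localhost:8001/v1/models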
set -e
# Determine installation directory (where this script is located)
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Configuration
MODEL="${1:-Qwen/Qwen2.5-0.5B-Instruct}"
PORT="${2:-8000}"
VLLM_DIR="$SCRIPT_DIR/vllm"
ENV_SCRIPT="$SCRIPT_DIR/vllm_env.sh"
PID_FILE="$SCRIPT_DIR/.vllm-server.pid"
LOG_FILE="$SCRIPT_DIR/vllm-server.log"
# Check if server is already running
if [ -f "$PID_FILE" ]; then
PID=$(cat "$PID_FILE")
if ps -p $PID > /dev/null 2>&1; then
echo "ERROR: vLLM server is already running (PID: $PID)"
echo "Use ./vllm-stop.sh to stop it first"
exit 1
fi
fi
# Source environment
source "$ENV_SCRIPT"
echo "=" | tr '=' '-' | head -c 70 && echo
echo "Starting vLLM Server on DGX Spark"
echo "=" | tr '=' '-' | head -c 70 && echo
echo "Model: $MODEL"
echo "Port: $PORT"
echo "Log file: $LOG_FILE"
echo "PID file: $PID_FILE"
echo "=" | tr '=' '-' | head -c 70 && echo
# Start server in background
cd "$VLLM_DIR"
nohup python -m vllm.entrypoints.openai.api_server \
--model "$MODEL" \
--trust-remote-code \
--host 0.0.0.0 \
--port "$PORT" \
--gpu-memory-utilization 0.9 \
> "$LOG_FILE" 2>&1 &
# Save PID
echo $! > "$PID_FILE"
echo "OK: Server started with PID: $(cat $PID_FILE)"
echo "OK: Waiting for server to be ready..."
# Wait for server to be ready
sleep 5
if ps -p $(cat "$PID_FILE") > /dev/null 2>&1; then
echo "OK: Server is running!"
echo ""
echo "Test with: curl http://localhost:$PORT/v1/models"
echo "View logs: tail -f $LOG_FILE"
echo "Stop server: ./vllm-stop.sh"
else
echo "ERROR: Server failed to start. Check logs: $LOG_FILE"
rm -f "$PID_FILE"
exit 1
fi

45
scripts/vllm-status.sh Normal file
View File

@@ -0,0 +1,45 @@
#!/bin/bash
# vLLM Server Status Script for DGX Spark
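# Usage: ./vllm-status.sh
# (no arguments; reports process info and the last 10 log lines)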
# Determine installation directory (where this script is located)
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PID_FILE="$SCRIPT_DIR/.vllm-server.pid"
LOG_FILE="$SCRIPT_DIR/vllm-server.log"
echo "=" | tr '=' '-' | head -c 70 && echo
echo "vLLM Server Status on DGX Spark"
echo "=" | tr '=' '-' | head -c 70 && echo
if [ ! -f "$PID_FILE" ]; then
echo "Status: NOT RUNNING (no PID file found)"
exit 0
fi
PID=$(cat "$PID_FILE")
if ! ps -p $PID > /dev/null 2>&1; then
echo "Status: NOT RUNNING (stale PID file)"
echo "Cleaning up PID file..."
rm -f "$PID_FILE"
exit 0
fi
echo "Status: RUNNING"
echo "PID: $PID"
echo "Started: $(ps -p $PID -o lstart= 2>/dev/null || echo 'Unknown')"
echo "CPU: $(ps -p $PID -o %cpu= 2>/dev/null || echo 'N/A')%"
echo "Memory: $(ps -p $PID -o %mem= 2>/dev/null || echo 'N/A')%"
echo ""
# Check if log file exists and show last few lines
if [ -f "$LOG_FILE" ]; then
echo "Recent log entries (last 10 lines):"
echo "-" | tr '-' '-' | head -c 70 && echo
tail -n 10 "$LOG_FILE"
else
echo "Log file not found: $LOG_FILE"
fi
echo ""
echo "=" | tr '=' '-' | head -c 70 && echo

47
scripts/vllm-stop.sh Normal file
View File

@@ -0,0 +1,47 @@
#!/bin/bash
# vLLM Server Stop Script for DGX Spark
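# Usage: ./vllm-stop.sh
# (no arguments; stops the server using the PID file written by vllm-serve.sh)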
# Determine installation directory (where this script is located)
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PID_FILE="$SCRIPT_DIR/.vllm-server.pid"
if [ ! -f "$PID_FILE" ]; then
echo "No vLLM server PID file found. Server may not be running."
exit 0
fi
PID=$(cat "$PID_FILE")
if ! ps -p $PID > /dev/null 2>&1; then
echo "vLLM server (PID: $PID) is not running. Cleaning up PID file."
rm -f "$PID_FILE"
exit 0
fi
echo "Stopping vLLM server (PID: $PID)..."
kill $PID
# Wait for process to terminate
for i in {1..10}; do
if ! ps -p $PID > /dev/null 2>&1; then
echo "OK: Server stopped successfully"
rm -f "$PID_FILE"
exit 0
fi
sleep 1
done
# Force kill if still running
if ps -p $PID > /dev/null 2>&1; then
echo "Server did not stop gracefully. Force killing..."
kill -9 $PID
sleep 1
if ! ps -p $PID > /dev/null 2>&1; then
echo "OK: Server force stopped"
rm -f "$PID_FILE"
else
echo "ERROR: Failed to stop server"
exit 1
fi
fi