commit c05cb718168df70b52a74fb6d7e6085d344235ce Author: Thomas Nilles Date: Sun Mar 22 17:26:26 2026 -0400 first commit diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..c4e3104 --- /dev/null +++ b/.gitignore @@ -0,0 +1,77 @@ +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# Virtual Environments +.venv +.vllm/ +venv/ +ENV/ +env/ + +# IDEs +.vscode/ +.idea/ +*.swp +*.swo +*~ + +# OS +.DS_Store +Thumbs.db + +# Logs +*.log +vllm-server.log +*.out +*.err + +# Build artifacts +*.o +*.a +*.so +*.dylib +*.dll +CMakeCache.txt +CMakeFiles/ +cmake_install.cmake +Makefile + +# CUDA +*.ptx +*.cubin + +# Local installation directories +triton/ +vllm/ +.cache/ + +# Temporary files +tmp/ +temp/ +*.tmp +*.bak + +# Model downloads +models/ +*.safetensors +*.bin +*.gguf diff --git a/CLUSTER.md b/CLUSTER.md new file mode 100644 index 0000000..cc74a3f --- /dev/null +++ b/CLUSTER.md @@ -0,0 +1,380 @@ +# vLLM Cluster Mode Setup for DGX Spark + +This guide covers setting up multi-node vLLM deployment on DGX Spark systems using distributed inference. + +## Prerequisites + +- Multiple DGX Spark systems with vLLM installed (use `install.sh` on each node) +- All nodes on the same network with direct connectivity +- SSH access between nodes (passwordless SSH recommended) +- Same CUDA and vLLM versions across all nodes + +## Architecture + +``` +┌─────────────────────┐ +│ spark-alpha │ +│ (Master/Head) │ +│ - API Server │ +│ - Request Router │ +│ - Model Weights │ +└──────────┬──────────┘ + │ + ├─────────────────────┐ + │ │ +┌──────────▼──────────┐ ┌──────▼──────────┐ +│ spark-omega │ │ spark-gamma │ +│ (Worker 1) │ │ (Worker 2) │ +│ - Inference │ │ - Inference │ +│ - GPU Compute │ │ - GPU Compute │ +└─────────────────────┘ └─────────────────┘ +``` + +## Step 1: Install vLLM on All Nodes + +Run the installer on each node: + +```bash +# On spark-alpha (master) +curl -fsSL https://raw.githubusercontent.com/eelbaz/dgx-spark-vllm-setup/main/install.sh | bash + +# On spark-omega (worker 1) +ssh spark-omega.local +curl -fsSL https://raw.githubusercontent.com/eelbaz/dgx-spark-vllm-setup/main/install.sh | bash + +# On spark-gamma (worker 2) +ssh spark-gamma.local +curl -fsSL https://raw.githubusercontent.com/eelbaz/dgx-spark-vllm-setup/main/install.sh | bash +``` + +## Step 2: Configure Network Settings + +Ensure all nodes can communicate on the required ports: + +- **8000**: vLLM API server (master only) +- **29500**: PyTorch distributed backend (all nodes) +- **Random ports**: Ray cluster communication + +Open firewall if needed: + +```bash +# On all nodes +sudo ufw allow 8000/tcp +sudo ufw allow 29500/tcp +sudo ufw allow 6379/tcp # Ray GCS +sudo ufw allow 8265/tcp # Ray Dashboard +``` + +## Step 3: Set Up Passwordless SSH (Optional but Recommended) + +```bash +# On master node +ssh-keygen -t rsa -b 4096 -f ~/.ssh/id_rsa -N "" + +# Copy to worker nodes +ssh-copy-id spark-omega.local +ssh-copy-id spark-gamma.local + +# Verify +ssh spark-omega.local "echo 'Connection successful'" +ssh spark-gamma.local "echo 'Connection successful'" +``` + +## Step 4: Start Ray Cluster + +### On Master Node (spark-alpha) + +```bash +# Assuming vllm-install is in your home directory +source ~/vllm-install/vllm_env.sh + +# Start Ray head node +ray start --head \ + --port=6379 \ + --dashboard-host=0.0.0.0 \ + --dashboard-port=8265 \ + --num-gpus=1 + +# Note the output: 
"To connect to this Ray cluster, use: ray start --address='MASTER_IP:6379'" +``` + +### On Worker Nodes (spark-omega, spark-gamma) + +```bash +source ~/vllm-install/vllm_env.sh + +# Replace MASTER_IP with spark-alpha's IP address +ray start --address='MASTER_IP:6379' --num-gpus=1 +``` + +Verify cluster status: + +```bash +ray status +``` + +You should see all nodes listed. + +## Step 5: Start vLLM with Tensor Parallelism + +### Method 1: Tensor Parallelism (Recommended for Large Models) + +Tensor parallelism splits model layers across multiple GPUs. + +```bash +# On master node +source ~/vllm-install/vllm_env.sh + +vllm serve \ + --model "meta-llama/Llama-3.1-70B-Instruct" \ + --tensor-parallel-size 2 \ + --trust-remote-code \ + --host 0.0.0.0 \ + --port 8000 +``` + +This will automatically distribute the model across 2 GPUs in the Ray cluster. + +### Method 2: Pipeline Parallelism + +Pipeline parallelism splits model stages across GPUs. + +```bash +vllm serve \ + --model "meta-llama/Llama-3.1-70B-Instruct" \ + --pipeline-parallel-size 2 \ + --trust-remote-code \ + --host 0.0.0.0 \ + --port 8000 +``` + +### Method 3: Combined Parallelism + +For very large models, combine tensor and pipeline parallelism: + +```bash +vllm serve \ + --model "meta-llama/Llama-3.1-405B-Instruct" \ + --tensor-parallel-size 4 \ + --pipeline-parallel-size 2 \ + --trust-remote-code \ + --host 0.0.0.0 \ + --port 8000 +``` + +## Step 6: Test Cluster Inference + +```bash +# Test from master node +curl http://localhost:8000/v1/models + +# Test from external machine +curl http://spark-alpha.local:8000/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "meta-llama/Llama-3.1-70B-Instruct", + "prompt": "Explain distributed inference in 3 sentences.", + "max_tokens": 100, + "temperature": 0.7 + }' +``` + +## Step 7: Monitor Cluster + +### Ray Dashboard + +Access at: http://spark-alpha.local:8265 + +Shows: +- Node status and resources +- Task execution +- GPU utilization +- Memory usage + +### vLLM Metrics + +```bash +# On master node +tail -f ~/vllm-install/vllm-server.log + +# Check GPU usage across cluster +ray exec 'nvidia-smi' +``` + +### System Monitoring + +```bash +# Check Ray cluster status +ray status + +# Monitor GPU usage on specific node +ssh spark-omega.local nvidia-smi -l 1 +``` + +## Troubleshooting + +### Workers Not Connecting + +**Problem**: Workers can't connect to Ray head node + +**Solutions**: +1. Check firewall: `sudo ufw status` +2. Verify head node IP: `ray status` on master +3. Check network connectivity: `ping spark-alpha.local` +4. Ensure same Ray version on all nodes: `ray --version` + +### OOM Errors with Large Models + +**Problem**: Out of memory when loading large models + +**Solutions**: +1. Increase tensor parallelism: `--tensor-parallel-size 4` +2. Reduce memory utilization: `--gpu-memory-utilization 0.8` +3. Enable CPU offloading: `--cpu-offload-gb 8` +4. Use quantization: `--quantization awq` or `--quantization gptq` + +### Model Loading Hangs + +**Problem**: Model download/loading takes forever + +**Solutions**: +1. Pre-download model on all nodes: + ```bash + # On each node + python -c "from transformers import AutoModel; AutoModel.from_pretrained('meta-llama/Llama-3.1-70B-Instruct')" + ``` +2. Use shared storage (NFS) for model cache +3. Check network bandwidth between nodes + +### Uneven GPU Utilization + +**Problem**: Some GPUs idle while others maxed out + +**Solutions**: +1. Verify tensor parallel configuration +2. 
Check Ray resource allocation: `ray status` +3. Ensure balanced request distribution +4. Monitor with: `ray exec 'nvidia-smi'` + +## Advanced Configuration + +### Custom Ray Resources + +Assign custom resources to nodes for fine-grained control: + +```bash +# On worker with high memory +ray start --address='MASTER_IP:6379' \ + --num-gpus=1 \ + --resources='{"highmem": 1}' + +# Use in vLLM +vllm serve --model "..." --placement-group-resources='{"highmem": 1}' +``` + +### Distributed Model Cache + +Share model weights via NFS to avoid redundant downloads: + +```bash +# On NFS server (e.g., master) +sudo apt install nfs-kernel-server +echo "$HOME/.cache/huggingface *(rw,sync,no_subtree_check)" | sudo tee -a /etc/exports +sudo exportfs -a + +# On workers +sudo apt install nfs-common +sudo mkdir -p $HOME/.cache/huggingface +sudo mount spark-alpha.local:$HOME/.cache/huggingface $HOME/.cache/huggingface +``` + +### Load Balancing with nginx + +For production deployments, use nginx to load balance across multiple vLLM instances: + +```nginx +upstream vllm_cluster { + least_conn; + server spark-alpha.local:8000; + server spark-omega.local:8000; + server spark-gamma.local:8000; +} + +server { + listen 80; + location / { + proxy_pass http://vllm_cluster; + proxy_set_header Host $host; + } +} +``` + +## Cluster Management Scripts + +### Start Cluster + +Create `start-cluster.sh`: + +```bash +#!/bin/bash +# Start Ray cluster on all nodes + +ssh spark-alpha.local "source ~/vllm-install/vllm_env.sh && ray start --head --port=6379" +sleep 5 + +MASTER_IP=$(ssh spark-alpha.local "hostname -I | awk '{print \$1}'") + +ssh spark-omega.local "source ~/vllm-install/vllm_env.sh && ray start --address='${MASTER_IP}:6379'" +ssh spark-gamma.local "source ~/vllm-install/vllm_env.sh && ray start --address='${MASTER_IP}:6379'" + +echo "Cluster started. Check status with: ray status" +``` + +### Stop Cluster + +Create `stop-cluster.sh`: + +```bash +#!/bin/bash +# Stop Ray cluster on all nodes + +for node in spark-alpha.local spark-omega.local spark-gamma.local; do + echo "Stopping Ray on $node..." + ssh $node "ray stop --force" +done + +echo "Cluster stopped." +``` + +## Performance Tuning + +### For Maximum Throughput + +```bash +vllm serve \ + --model "meta-llama/Llama-3.1-70B-Instruct" \ + --tensor-parallel-size 2 \ + --max-num-seqs 256 \ + --max-num-batched-tokens 8192 \ + --gpu-memory-utilization 0.95 +``` + +### For Low Latency + +```bash +vllm serve \ + --model "meta-llama/Llama-3.1-70B-Instruct" \ + --tensor-parallel-size 2 \ + --max-num-seqs 32 \ + --disable-log-requests +``` + +## References + +- [vLLM Distributed Inference](https://docs.vllm.ai/en/latest/serving/distributed_serving.html) +- [Ray Cluster Setup](https://docs.ray.io/en/latest/cluster/getting-started.html) +- [PyTorch Distributed](https://pytorch.org/tutorials/beginner/dist_overview.html) + +## Support + +For issues specific to DGX Spark cluster setup, please open an issue on GitHub. diff --git a/CRITICAL_FIX_ANALYSIS.md b/CRITICAL_FIX_ANALYSIS.md new file mode 100644 index 0000000..fc3d0d7 --- /dev/null +++ b/CRITICAL_FIX_ANALYSIS.md @@ -0,0 +1,134 @@ +# Critical Blackwell GB10 Fixes for vLLM + +## Overview + +Three critical fixes are required for vLLM on Blackwell GB10 (sm_121a) GPUs with CUDA 13.0+: + +1. **CMakeLists.txt SM120 Support** - Add missing architecture +2. **vLLM Commit Version** - Use commit with Blackwell/Triton fixes +3. 
**Triton Version Pinning** - Use tested working commit + +## Fix 1: CMakeLists.txt SM120 Support + +### Root Cause + +vLLM v0.11.1rc3 CMakeLists.txt has **incomplete architecture support** for Blackwell GB10 (sm_121a) MOE kernels when using CUDA 13.0+. + +## The Problem + +For CUDA 13.0+, the code uses these branches: +- **Line 490**: Regular MOE kernels +- **Line 671**: Grouped MM MOE kernels + +Original v0.11.1rc3: +```cmake +# Line 490 +cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}") + +# Line 671 +cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}") +``` + +**BOTH lines are missing `12.0f` (SM120) support!** + +## The Fix + +Both lines need `12.0f` added: +```cmake +# Line 490 +cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}") + +# Line 671 +cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}") +``` + +## Error Symptoms + +Without this fix: +``` +ImportError: undefined symbol: _Z20cutlass_moe_mm_sm100RN2at6TensorERKS0_S3_S3_S3_S3_S3_S3_S3_S3_bb +``` + +The MOE kernels for SM100/SM120 aren't compiled, causing import failures. + +## Why install.sh Works + +The sed command on line 323: +```bash +sed -i 's/cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f"/cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f"/' CMakeLists.txt +``` + +This replaces **ALL** occurrences, fixing both lines 490 and 671 in one command. + +## Verified Solution + +Tested on NVIDIA DGX Spark with Blackwell GB10, CUDA 13.0: +- [OK] Line 490 fixed: `"10.0f;11.0f;12.0f"` +- [OK] Line 671 fixed: `"10.0f;11.0f;12.0f"` +- [OK] vLLM imports successfully +- [OK] No cutlass_moe_mm_sm100 symbol errors +- [OK] Build time: ~19 minutes + +## Fix 2: vLLM Commit Version + +### Issue + +vLLM tag `v0.11.1rc3` lacks critical Triton/PyTorch Inductor fixes for Blackwell. + +### Solution + +Use commit `66a168a197ba214a5b70a74fa2e713c9eeb3251a` (6 commits ahead of v0.11.1rc3): +- Contains Triton JIT compilation fixes +- Includes PyTorch Inductor optimizations for Blackwell +- Adds proper backend registration handling + +### Installation + +```bash +cd vllm +git checkout 66a168a197ba214a5b70a74fa2e713c9eeb3251a +git submodule update --init --recursive +``` + +## Fix 3: Triton Version Pinning + +### Issue + +Latest Triton main branch (as of late October 2025) has intermittent JITFunction compilation issues with PyTorch Inductor on Blackwell. + +### Solution + +Pin to tested working commit: `4caa0328bf8df64896dd5f6fb9df41b0eb2e750a` (October 25, 2025) +- Verified stable with Blackwell GB10 +- Passes all compilation tests +- No JITFunction.constexprs errors + +### Installation + +```bash +cd triton +git checkout 4caa0328bf8df64896dd5f6fb9df41b0eb2e750a +git submodule update --init --recursive +python -m pip install --no-build-isolation -v . 
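+# Optional sanity check (a minimal sketch, not part of the documented fix):
+# confirm the pinned commit imports cleanly before rebuilding vLLM against it.
+python -c "import triton; print('Triton version:', triton.__version__)"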
+``` + +## Complete Verified Configuration + +| Component | Version/Commit | Notes | +|-----------|---------------|-------| +| **vLLM** | `66a168a197ba214a5b70a74fa2e713c9eeb3251a` | 6 commits ahead of v0.11.1rc3 | +| **Triton** | `4caa0328bf8df64896dd5f6fb9df41b0eb2e750a` | October 25, 2025 | +| **PyTorch** | `2.9.0+cu130` | From vLLM requirements | +| **CUDA** | `13.0` (V13.0.88) | System CUDA | +| **Python** | `3.12.3` | | + +## Testing + +Verified working with: +```bash +python -c "from vllm import LLM, SamplingParams; \ +llm = LLM(model='Qwen/Qwen2.5-0.5B-Instruct', max_model_len=512); \ +print(llm.generate(['Hello'], SamplingParams(max_tokens=20)))" +``` + +**All tests pass**: Import, compilation, CUDA graphs, and text generation all work correctly. diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..a967cbb --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2025 DGX Spark Community + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..e9a47b4 --- /dev/null +++ b/README.md @@ -0,0 +1,312 @@ +# vLLM Setup for NVIDIA DGX Spark (Blackwell GB10) + +**One-command installation** of vLLM for NVIDIA DGX Spark systems with GB10 GPUs (Blackwell architecture, sm_121). 
+ +This repository provides a dgx-spark tested, ready setup script that handles all the complexities of building vLLM on the DGX Spark platform, including: +- CUDA 13.0 support with Blackwell-specific optimizations +- Critical fixes for SM100/SM120 MOE kernel compilation +- Triton 3.5.0 from main branch (required for sm_121a support) +- PyTorch 2.9.0 with CUDA 13.0 bindings +- All necessary build fixes and workarounds + +## Quick Start + +**One-command installation** - installs to `./vllm-install` in your current directory: + +```bash +curl -fsSL https://raw.githubusercontent.com/eelbaz/dgx-spark-vllm-setup/main/install.sh | bash +``` + +Or specify a custom directory: + +```bash +curl -fsSL https://raw.githubusercontent.com/eelbaz/dgx-spark-vllm-setup/main/install.sh | bash -s -- --install-dir ~/my/custom/path +``` + +**Installation time:** ~20-30 minutes (mostly compilation) + +### Alternative: Clone and Install + +```bash +git clone https://github.com/eelbaz/dgx-spark-vllm-setup.git +cd dgx-spark-vllm-setup +./install.sh +``` + +### Installation Options + +```bash +./install.sh [OPTIONS] + +Options: + --install-dir DIR Installation directory (default: ./vllm-install) + --vllm-version TAG vLLM git tag/branch (default: v0.11.1rc3) + --python-version VER Python version (default: 3.12) + --skip-tests Skip post-installation tests + --help Show help message +``` + +## System Requirements + +- **Hardware:** NVIDIA DGX Spark with GB10 GPU (Blackwell sm_121) +- **OS:** Ubuntu 22.04+ (tested on Linux 6.11.0 ARM64) +- **CUDA:** 13.0 or later (driver 580.95.05+) +- **Disk Space:** ~50GB free +- **RAM:** 8GB+ recommended during build + +## What Gets Installed + +Installed to `./vllm-install` (or your custom directory): + +- **Python 3.12** virtual environment at `.vllm/` +- **PyTorch 2.9.0+cu130** with full CUDA 13.0 support +- **Triton 3.5.0+git** from main branch (pre-release with Blackwell support) +- **vLLM 0.11.1rc3+** with all Blackwell-specific patches +- **Helper scripts** for managing vLLM server +- **Environment activation** script (`vllm_env.sh`) + +## Usage + +All examples assume you're in the installation directory (default: `./vllm-install`). + +### Activate Environment + +```bash +cd vllm-install +source vllm_env.sh +``` + +### Start vLLM Server + +```bash +./vllm-serve.sh # Default: Qwen2.5-0.5B on port 8000 +./vllm-serve.sh "facebook/opt-125m" 8001 # Custom model and port +``` + +### Check Server Status + +```bash +./vllm-status.sh +``` + +### Stop Server + +```bash +./vllm-stop.sh +``` + +### Test API + +```bash +# List models +curl http://localhost:8000/v1/models + +# Generate completion +curl http://localhost:8000/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "Qwen/Qwen2.5-0.5B-Instruct", + "prompt": "Hello, how are you?", + "max_tokens": 50 + }' +``` + +### Python API + +```python +from vllm import LLM, SamplingParams + +llm = LLM( + model="Qwen/Qwen2.5-0.5B-Instruct", + trust_remote_code=True, + gpu_memory_utilization=0.9 +) + +prompts = ["Tell me about DGX Spark"] +sampling_params = SamplingParams(temperature=0.7, max_tokens=100) +outputs = llm.generate(prompts, sampling_params) + +print(outputs[0].outputs[0].text) +``` + +## Critical Fixes Applied + +This installer automatically applies the following critical fixes: + +### 1. 
CMakeLists.txt SM100/SM120 MOE Kernel Fix + +**Issue:** vLLM's MOE kernels for SM100/SM120 Blackwell architectures were incomplete +**Fix:** Added `12.0f` and `12.1a` to SCALED_MM_ARCHS in CMakeLists.txt + +```cmake +# CUDA 13.0+ path (line ~671) +# Before +cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}") +# After +cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}") + +# Older CUDA path (line ~673) +# Before +cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}") +# After +cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;12.1a" "${CUDA_ARCHS}") +``` + +### 2. pyproject.toml License Field Format + +**Issue:** Newer setuptools requires structured license format +**Fix:** Convert license string to dict format in both vLLM and flashinfer-python + +```toml +# Before +license = "Apache-2.0" +license-files = ["LICENSE"] + +# After +license = {text = "Apache-2.0"} +``` + +**Applied to:** +- vLLM's pyproject.toml +- flashinfer-python's pyproject.toml (patched during build) + +### 3. GPT-OSS Triton MOE Kernels for Qwen3/gpt-oss Support + +**Issue:** vLLM's GPT-OSS MOE kernel implementation uses deprecated Triton routing API +**Fix:** Update to new Triton kernel API (topk and SparseMatrix) + +**Changes:** +- Replace deprecated `routing()` with `triton_topk()` +- Replace deprecated `routing_from_bitmatrix()` with `SparseMatrix()` +- Add support for `GatherIndx`, `ScatterIndx`, and new ragged tensor metadata + +**Enables support for:** +- Qwen3 models with MOE architecture +- gpt-oss models using Triton kernels +- Latest Triton kernel optimizations for Blackwell + +### 4. Triton Main Branch Requirement + +**Issue:** Official Triton 3.5.0 release has bugs with sm_121a +**Fix:** Build Triton from main branch with latest Blackwell fixes + +## Architecture-Specific Configuration + +The installer sets these critical environment variables: + +```bash +TORCH_CUDA_ARCH_LIST=12.1a # Blackwell sm_121 +VLLM_USE_FLASHINFER_MXFP4_MOE=1 # Enable FlashInfer MOE optimization +TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas # CUDA PTX assembler +TIKTOKEN_CACHE_DIR=$INSTALL_DIR/.tiktoken_cache # Cache tiktoken encodings locally +``` + +## Cluster Mode Setup + +To set up multi-node vLLM cluster: + +1. Run this installer on all nodes +2. Follow [CLUSTER.md](./CLUSTER.md) for configuration + +## Troubleshooting + +### Build Fails with "TypeError: can only concatenate str (not 'NoneType') to str" + +This is a known Triton editable-mode build issue. 
The installer works around this by: +- Building Triton in non-editable mode +- Or copying pre-built Triton from another node + +### Symbol Error: cutlass_moe_mm_sm100 + +**Symptom:** `ImportError: undefined symbol: _Z20cutlass_moe_mm_sm100` +**Solution:** Ensure CMakeLists.txt fix is applied (done automatically by installer) + +### PyTorch CUDA Capability Warning + +**Symptom:** Warning about GPU capability 12.1 vs PyTorch max 12.0 +**Status:** Harmless warning - PyTorch 2.9.0+cu130 works correctly with GB10 + +### ImportError: No module named 'vllm' + +**Solution:** +```bash +source vllm-install/vllm_env.sh +python -c "import vllm; print(vllm.__version__)" +``` + +## File Structure + +``` +vllm-install/ +├── .vllm/ # Python virtual environment +├── vllm/ # vLLM source (editable install) +├── triton/ # Triton source +├── vllm_env.sh # Environment activation script +├── vllm-serve.sh # Start server +├── vllm-stop.sh # Stop server +├── vllm-status.sh # Check status +└── vllm-server.log # Server logs +``` + +## Manual Installation + +If you prefer to understand each step: + +```bash +# 1. Install uv package manager +curl -LsSf https://astral.sh/uv/install.sh | sh +export PATH="$HOME/.local/bin:$PATH" + +# 2. Create installation directory and Python virtual environment +mkdir -p vllm-install && cd vllm-install +uv venv .vllm --python 3.12 +source .vllm/bin/activate + +# 3. Install PyTorch with CUDA 13.0 +uv pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu130 + +# 4. Clone and build Triton from main +git clone https://github.com/triton-lang/triton.git +cd triton +uv pip install pip cmake ninja pybind11 +TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas python -m pip install --no-build-isolation . + +# 5. Install additional dependencies +uv pip install xgrammar setuptools-scm apache-tvm-ffi==0.1.0b15 --prerelease=allow + +# 6. Clone vLLM +cd .. +git clone --recursive https://github.com/vllm-project/vllm.git +cd vllm +git checkout v0.11.1rc3 + +# 7. Apply fixes (see scripts/apply-fixes.sh) +# 8. Build vLLM (see install.sh for full process) +``` + +## Version Information + +- **vLLM:** 0.11.1rc4.dev6+g66a168a19.d20251026 +- **PyTorch:** 2.9.0+cu130 +- **Triton:** 3.5.0+git4caa0328 +- **CUDA:** 13.0 +- **Python:** 3.12.3 +- **Target Architecture:** sm_121 (Blackwell GB10) + +## Contributing + +Issues and pull requests welcome! This installer is maintained by the DGX Spark community. + +## References + +- [NVIDIA Forum Discussion](https://forums.developer.nvidia.com/t/run-vllm-in-spark/348862) +- [vLLM GitHub](https://github.com/vllm-project/vllm) +- [Triton GitHub](https://github.com/triton-lang/triton) + +## License + +MIT License - See [LICENSE](./LICENSE) + +## Acknowledgments + +Developed and tested on NVIDIA DGX Spark systems. Special thanks to the vLLM and Triton communities. diff --git a/SUMMARY.md b/SUMMARY.md new file mode 100644 index 0000000..af8c19d --- /dev/null +++ b/SUMMARY.md @@ -0,0 +1,246 @@ +# Repository Summary + +## Overview + +This repository provides a **production-ready, one-command installation** of vLLM for NVIDIA DGX Spark systems with Blackwell GB10 GPUs (sm_121 architecture). + +## What's Included + +### Core Files + +1. **install.sh** (500+ lines) + - Fully automated installation script + - Pre-flight system checks + - 8-step installation pipeline + - Post-installation testing + - Command-line argument support + +2. 
**README.md** (300+ lines) + - Quick start guide + - System requirements + - Usage examples + - Critical fixes documentation + - Troubleshooting guide + +3. **CLUSTER.md** (400+ lines) + - Multi-node setup instructions + - Ray cluster configuration + - Tensor/pipeline parallelism + - Performance tuning + - Load balancing examples + +4. **requirements.txt** + - Complete dependency list + - PyTorch 2.9.0+cu130 + - All required packages + +### Helper Scripts (scripts/) + +- **vllm-serve.sh** - Start vLLM server with configurable model/port +- **vllm-stop.sh** - Gracefully stop server +- **vllm-status.sh** - Check server status and logs + +### Examples (examples/) + +- **basic_inference.py** - Simple Python API usage +- **api_client.py** - OpenAI-compatible REST API client +- **README.md** - Usage instructions and API examples + +### Configuration + +- **.gitignore** - Excludes build artifacts, venvs, logs +- **LICENSE** - MIT license + +## Technical Specifications + +### Target Platform +- **Hardware:** NVIDIA DGX Spark with GB10 GPU +- **Architecture:** Blackwell sm_121 (compute capability 12.1) +- **OS:** Ubuntu 22.04+ ARM64 +- **CUDA:** 13.0+ (driver 580.95.05+) + +### Software Stack +- **Python:** 3.12.3 +- **PyTorch:** 2.9.0+cu130 +- **Triton:** 3.5.0+git (from main branch) +- **vLLM:** 0.11.1rc4+ +- **Package Manager:** uv (fast Python package installer) + +### Critical Fixes Applied + +1. **CMakeLists.txt (line 671)** + - Added `12.0f` to SCALED_MM_ARCHS for SM100 MOE kernels + - Enables Blackwell GPU compilation + +2. **pyproject.toml** + - Changed `license = "Apache-2.0"` to `license = {text = "Apache-2.0"}` + - Removed deprecated `license-files` field + - Compatible with setuptools 77.0+ + +3. **Triton Build** + - Must use main branch (not release 3.5.0) + - Non-editable install to avoid setuptools bug + - Custom PTXAS path for CUDA integration + +### Environment Variables + +```bash +TORCH_CUDA_ARCH_LIST=12.1a # Blackwell architecture +VLLM_USE_FLASHINFER_MXFP4_MOE=1 # Enable FlashInfer optimization +TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas # CUDA PTX assembler +``` + +## Installation Overview + +The `install.sh` script performs these steps: + +1. **Pre-flight Checks** + - Verify ARM64 architecture + - Check NVIDIA GPU (GB10) + - Validate CUDA 13.0+ + - Ensure 50GB+ disk space + +2. **Install uv Package Manager** + - Fast Python package installer + - Required for efficient dependency resolution + +3. **Create Virtual Environment** + - Python 3.12 virtual environment + - Isolated from system packages + +4. **Install PyTorch** + - PyTorch 2.9.0 with CUDA 13.0 bindings + - Verify CUDA availability + +5. **Build Triton** + - Clone from GitHub main branch + - Build with Blackwell support + - Non-editable install + +6. **Install Dependencies** + - xgrammar, setuptools-scm + - apache-tvm-ffi (prerelease) + - Build tools + +7. **Clone and Fix vLLM** + - Clone v0.11.1rc3 + - Apply CMakeLists.txt fix + - Apply pyproject.toml fix + - Configure use_existing_torch + +8. **Build vLLM** + - 15-20 minute compilation + - All CUDA kernels for Blackwell + - Editable install for development + +9. **Create Helper Scripts** + - Environment activation script + - Server management scripts + - Logging configuration + +10. 
**Post-Installation Tests** + - Import vLLM + - Check CUDA availability + - Verify GPU detection + +## Quick Start + +```bash +# One-command installation +curl -fsSL https://raw.githubusercontent.com/eelbaz/dgx-spark-vllm-setup/main/install.sh | bash + +# Or clone and run +git clone https://github.com/eelbaz/dgx-spark-vllm-setup.git +cd dgx-spark-vllm-setup +./install.sh + +# Activate environment (assuming installation in current directory) +cd vllm-install +source vllm_env.sh + +# Start server +./vllm-serve.sh + +# Test API +curl http://localhost:8000/v1/models +``` + +## Repository Structure + +``` +dgx-spark-vllm-setup/ +├── README.md # Main documentation +├── CLUSTER.md # Multi-node setup guide +├── SUMMARY.md # This file +├── LICENSE # MIT license +├── .gitignore # Git ignore rules +├── install.sh # Main installation script +├── requirements.txt # Python dependencies +├── scripts/ +│ ├── vllm-serve.sh # Start vLLM server +│ ├── vllm-stop.sh # Stop server +│ └── vllm-status.sh # Check status +└── examples/ + ├── README.md # Examples documentation + ├── basic_inference.py # Python API example + └── api_client.py # REST API example +``` + +## Known Issues & Workarounds + +### Triton Editable Build Fails +**Error:** `TypeError: can only concatenate str (not 'NoneType') to str` +**Workaround:** Use non-editable install (`uv pip install --no-build-isolation .`) + +### PyTorch CUDA Capability Warning +**Warning:** GPU capability 12.1 vs PyTorch max 12.0 +**Status:** Harmless - PyTorch 2.9.0+cu130 works correctly with GB10 + +### apache-tvm-ffi Prerelease +**Error:** `No solution found when resolving dependencies` +**Fix:** Use `--prerelease=allow` flag with uv pip install + +## Testing Status + +- [OK] Single-node installation on spark-alpha.local +- [OK] Single-node installation on spark-omega.local +- [OK] vLLM server startup and API functionality +- [OK] Model inference (Qwen/Qwen2.5-0.5B-Instruct) +- [IN PROGRESS] Multi-node cluster mode (documented, not yet tested) + +## Future Enhancements + +- [ ] Add cluster mode testing results +- [ ] Include performance benchmarks +- [ ] Add Dockerfile for containerized deployment +- [ ] Create Ansible playbook for multi-node automation +- [ ] Add monitoring and logging setup (Prometheus/Grafana) +- [ ] Include model quantization examples (AWQ, GPTQ) + +## Contributing + +Contributions welcome! Please open issues or pull requests on GitHub. + +## Community & Support + +- **GitHub Issues:** Report bugs and feature requests +- **NVIDIA Forum:** [DGX Spark vLLM Discussion](https://forums.developer.nvidia.com/t/run-vllm-in-spark/348862) +- **vLLM Docs:** [Official Documentation](https://docs.vllm.ai/) + +## License + +MIT License - See LICENSE file for details. + +## Acknowledgments + +Developed and tested on NVIDIA DGX Spark systems. Special thanks to: +- vLLM project team +- Triton compiler team +- NVIDIA DGX Spark community +- Claude Code (AI assistant) for documentation automation + +--- + +**Version:** 1.0.0 +**Last Updated:** 2025-10-26 +**Tested On:** DGX Spark with GB10, CUDA 13.0, Ubuntu 22.04 ARM64 diff --git a/examples/README.md b/examples/README.md new file mode 100644 index 0000000..332ad6b --- /dev/null +++ b/examples/README.md @@ -0,0 +1,225 @@ +# vLLM Examples for DGX Spark + +This directory contains example scripts demonstrating various ways to use vLLM on DGX Spark systems. 
+ +## Prerequisites + +Ensure vLLM is installed and the environment is activated: + +```bash +# Assuming vllm-install is in your home directory +source ~/vllm-install/vllm_env.sh +``` + +## Examples + +### 1. Basic Inference (`basic_inference.py`) + +Simple text generation using the vLLM Python API. + +**Usage:** +```bash +python basic_inference.py +``` + +**What it demonstrates:** +- Loading a model with vLLM +- Configuring sampling parameters +- Generating multiple completions +- Batch processing + +### 2. API Client (`api_client.py`) + +Using vLLM's OpenAI-compatible REST API. + +**Prerequisites:** +Start the vLLM server first: +```bash +cd ~/vllm-install +./vllm-serve.sh +``` + +**Usage:** +```bash +python api_client.py +``` + +**What it demonstrates:** +- Listing available models +- Simple text completion +- Chat completion +- Streaming responses +- HTTP API interaction + +### 3. Batch Processing (`batch_processing.py`) + +Efficient processing of large batches of prompts. + +**Usage:** +```bash +python batch_processing.py +``` + +**What it demonstrates:** +- High-throughput batch inference +- Dynamic batching +- Memory-efficient processing +- Performance monitoring + +## Customization + +### Change Model + +Edit the model name in any example: + +```python +llm = LLM( + model="meta-llama/Llama-3.1-8B-Instruct", # Change this + trust_remote_code=True, + gpu_memory_utilization=0.9 +) +``` + +### Adjust Sampling Parameters + +Modify `SamplingParams` for different generation behavior: + +```python +sampling_params = SamplingParams( + temperature=0.7, # Lower = more deterministic (0.0-1.0) + top_p=0.95, # Nucleus sampling threshold + max_tokens=100, # Maximum tokens to generate + top_k=50, # Top-k sampling + repetition_penalty=1.1 # Penalize repetition +) +``` + +### GPU Memory Management + +Adjust memory utilization: + +```python +llm = LLM( + model="...", + gpu_memory_utilization=0.9, # Use 90% of GPU memory (0.0-1.0) + max_model_len=2048 # Maximum sequence length +) +``` + +## API Server Examples + +### cURL Examples + +**List models:** +```bash +curl http://localhost:8000/v1/models +``` + +**Simple completion:** +```bash +curl http://localhost:8000/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "Qwen/Qwen2.5-0.5B-Instruct", + "prompt": "The meaning of life is", + "max_tokens": 50, + "temperature": 0.7 + }' +``` + +**Chat completion:** +```bash +curl http://localhost:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "Qwen/Qwen2.5-0.5B-Instruct", + "messages": [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "What is DGX Spark?"} + ], + "max_tokens": 100, + "temperature": 0.7 + }' +``` + +**Streaming completion:** +```bash +curl http://localhost:8000/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "Qwen/Qwen2.5-0.5B-Instruct", + "prompt": "Write a story about", + "max_tokens": 100, + "stream": true + }' +``` + +## Tested Models + +These models work well on DGX Spark GB10: + +- `Qwen/Qwen2.5-0.5B-Instruct` (small, fast) +- `Qwen/Qwen2.5-7B-Instruct` (balanced) +- `meta-llama/Llama-3.1-8B-Instruct` (high quality) +- `meta-llama/Llama-3.1-70B-Instruct` (requires tensor parallelism) + +## Performance Tips + +1. **Use GPU memory efficiently:** + - Set `gpu_memory_utilization=0.95` for maximum throughput + - Lower for models close to GPU memory limit + +2. 
**Batch processing:** + - Process multiple prompts together + - vLLM automatically optimizes batch sizes + +3. **Quantization:** + - For larger models, use quantization: + ```python + llm = LLM(model="...", quantization="awq") + ``` + +4. **Tensor parallelism:** + - For models > 20GB, use multiple GPUs: + ```python + llm = LLM(model="...", tensor_parallel_size=2) + ``` + +## Troubleshooting + +### Out of Memory + +Reduce `max_model_len` or `gpu_memory_utilization`: + +```python +llm = LLM( + model="...", + gpu_memory_utilization=0.8, + max_model_len=2048 +) +``` + +### Slow Generation + +Check if model is loaded correctly: + +```python +python -c "import vllm; print(vllm.__version__)" +nvidia-smi # Check GPU utilization +``` + +### Connection Refused (API) + +Ensure server is running: + +```bash +cd ~/vllm-install +./vllm-status.sh +``` + +## More Resources + +- [vLLM Documentation](https://docs.vllm.ai/) +- [OpenAI API Compatibility](https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html) +- [Main README](../README.md) +- [Cluster Setup](../CLUSTER.md) diff --git a/examples/api_client.py b/examples/api_client.py new file mode 100644 index 0000000..d25dd0b --- /dev/null +++ b/examples/api_client.py @@ -0,0 +1,160 @@ +#!/usr/bin/env python3 +""" +vLLM OpenAI-Compatible API Client Example +Demonstrates using vLLM's OpenAI-compatible API endpoints +""" + +import requests +import json +from typing import Dict, List + +class VLLMClient: + """Simple client for vLLM OpenAI-compatible API""" + + def __init__(self, base_url: str = "http://localhost:8000"): + self.base_url = base_url.rstrip('/') + + def list_models(self) -> List[Dict]: + """List available models""" + response = requests.get(f"{self.base_url}/v1/models") + response.raise_for_status() + return response.json() + + def complete( + self, + prompt: str, + model: str = None, + max_tokens: int = 100, + temperature: float = 0.7, + stream: bool = False + ) -> Dict: + """Generate completion""" + + # Get model name if not specified + if model is None: + models = self.list_models() + model = models['data'][0]['id'] + + payload = { + "model": model, + "prompt": prompt, + "max_tokens": max_tokens, + "temperature": temperature, + "stream": stream + } + + response = requests.post( + f"{self.base_url}/v1/completions", + json=payload, + headers={"Content-Type": "application/json"}, + stream=stream + ) + response.raise_for_status() + + if stream: + return response.iter_lines() + else: + return response.json() + + def chat( + self, + messages: List[Dict[str, str]], + model: str = None, + max_tokens: int = 100, + temperature: float = 0.7, + stream: bool = False + ) -> Dict: + """Generate chat completion""" + + # Get model name if not specified + if model is None: + models = self.list_models() + model = models['data'][0]['id'] + + payload = { + "model": model, + "messages": messages, + "max_tokens": max_tokens, + "temperature": temperature, + "stream": stream + } + + response = requests.post( + f"{self.base_url}/v1/chat/completions", + json=payload, + headers={"Content-Type": "application/json"}, + stream=stream + ) + response.raise_for_status() + + if stream: + return response.iter_lines() + else: + return response.json() + + +def main(): + # Initialize client + client = VLLMClient("http://localhost:8000") + + print("="*60) + print("vLLM API Client Examples") + print("="*60) + + # Example 1: List models + print("\n1. 
Listing available models...") + models = client.list_models() + for model in models['data']: + print(f" - {model['id']}") + + # Example 2: Simple completion + print("\n2. Simple completion...") + result = client.complete( + prompt="The capital of France is", + max_tokens=10, + temperature=0.0 + ) + print(f" Prompt: The capital of France is") + print(f" Response: {result['choices'][0]['text']}") + + # Example 3: Chat completion + print("\n3. Chat completion...") + messages = [ + {"role": "system", "content": "You are a helpful AI assistant."}, + {"role": "user", "content": "What is the Blackwell GPU architecture?"} + ] + result = client.chat( + messages=messages, + max_tokens=100, + temperature=0.7 + ) + print(f" User: {messages[1]['content']}") + print(f" Assistant: {result['choices'][0]['message']['content']}") + + # Example 4: Streaming completion + print("\n4. Streaming completion...") + print(" Prompt: Write a short poem about AI") + print(" Response: ", end="", flush=True) + + stream = client.complete( + prompt="Write a short poem about AI", + max_tokens=50, + temperature=0.8, + stream=True + ) + + for line in stream: + if line: + try: + data = json.loads(line.decode('utf-8').removeprefix('data: ')) + if 'choices' in data and len(data['choices']) > 0: + token = data['choices'][0].get('text', '') + print(token, end="", flush=True) + except (json.JSONDecodeError, AttributeError): + pass + + print("\n") + print("="*60) + +if __name__ == "__main__": + main() diff --git a/examples/basic_inference.py b/examples/basic_inference.py new file mode 100644 index 0000000..4ffc1d4 --- /dev/null +++ b/examples/basic_inference.py @@ -0,0 +1,48 @@ +#!/usr/bin/env python3 +""" +Basic vLLM Inference Example for DGX Spark +Demonstrates simple text generation using the vLLM Python API +""" + +from vllm import LLM, SamplingParams + +def main(): + # Initialize the model + # Use a smaller model for testing, replace with your preferred model + print("Loading model...") + llm = LLM( + model="Qwen/Qwen2.5-0.5B-Instruct", + trust_remote_code=True, + gpu_memory_utilization=0.9, + max_model_len=2048 + ) + + # Define prompts + prompts = [ + "What is the NVIDIA DGX Spark?", + "Explain the Blackwell GPU architecture in simple terms.", + "Write a haiku about artificial intelligence." + ] + + # Configure sampling parameters + sampling_params = SamplingParams( + temperature=0.7, + top_p=0.95, + max_tokens=100, + stop=["", "\n\n\n"] + ) + + # Generate responses + print("\nGenerating responses...\n") + outputs = llm.generate(prompts, sampling_params) + + # Print results + for i, output in enumerate(outputs): + print(f"{'='*60}") + print(f"Prompt {i+1}: {prompts[i]}") + print(f"{'-'*60}") + print(f"Response: {output.outputs[0].text}") + print(f"{'='*60}\n") + +if __name__ == "__main__": + main() diff --git a/install.sh b/install.sh new file mode 100644 index 0000000..cfe8f9c --- /dev/null +++ b/install.sh @@ -0,0 +1,777 @@ +#!/bin/bash +################################################################################ +# vLLM Installation Script for NVIDIA DGX Spark (Blackwell GB10) +# Version: 1.1.0 +# Author: DGX Spark Community +# License: MIT +# +# This script automates the complete installation of vLLM on DGX Spark systems +# with Blackwell GB10 GPUs, including all necessary fixes and optimizations. 
+# +# Usage: ./install.sh [OPTIONS] +# Can also be run via: curl -fsSL /install.sh | bash +# +# Options: +# --install-dir DIR Installation directory (default: $PWD/vllm-install) +# --vllm-version HASH vLLM git commit (default: 66a168a19 - tested with Blackwell) +# --python-version VER Python version (default: 3.12) +# --skip-tests Skip post-installation tests +# --help Show this help message +################################################################################ + +set -e # Exit on error +set -o pipefail # Catch errors in pipes + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# Default configuration +INSTALL_DIR="$PWD/vllm-install" +VLLM_VERSION="66a168a197ba214a5b70a74fa2e713c9eeb3251a" # vLLM commit with Blackwell fixes +TRITON_VERSION="4caa0328bf8df64896dd5f6fb9df41b0eb2e750a" # Triton commit that works with Blackwell +PYTHON_VERSION="3.12" +SKIP_TESTS=false + +# GitHub raw URL for downloading repo assets when run outside the repo +REPO_RAW_URL="https://raw.githubusercontent.com/eelbaz/dgx-spark-vllm-setup/main" + +# Script directory (only meaningful when run from a local clone) +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" 2>/dev/null && pwd || echo "")" + +################################################################################ +# Helper Functions +################################################################################ + +log_info() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +log_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" +} + +log_warning() { + echo -e "${YELLOW}[WARNING]${NC} $1" +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +print_header() { + echo "" + echo -e "${BLUE}========================================${NC}" + echo -e "${BLUE}$1${NC}" + echo -e "${BLUE}========================================${NC}" + echo "" +} + +check_command() { + if command -v "$1" &> /dev/null; then + return 0 + else + return 1 + fi +} + +# Auto-confirm when stdin is not a terminal (e.g. curl | bash) +confirm_or_default_yes() { + local prompt="$1" + if [ -t 0 ]; then + read -p "$prompt (y/N) " -n 1 -r + echo + [[ $REPLY =~ ^[Yy]$ ]] + else + log_info "Non-interactive mode: auto-confirming" + return 0 + fi +} + +################################################################################ +# Pre-flight Checks +################################################################################ + +preflight_checks() { + print_header "Pre-flight System Checks" + + log_info "Checking system requirements..." + + # Check if running on ARM64 + ARCH=$(uname -m) + if [[ "$ARCH" != "aarch64" ]] && [[ "$ARCH" != "arm64" ]]; then + log_warning "This script is designed for ARM64 architecture (DGX Spark)" + log_warning "Detected architecture: $ARCH" + fi + + # Check for NVIDIA GPU + if ! check_command nvidia-smi; then + log_error "nvidia-smi not found. NVIDIA drivers required." + exit 1 + fi + + # Check GPU type + GPU_NAME=$(nvidia-smi --query-gpu=name --format=csv,noheader | head -1) + log_info "Detected GPU: $GPU_NAME" + + if [[ ! "$GPU_NAME" =~ "GB10" ]]; then + log_warning "This script is optimized for NVIDIA GB10 (Blackwell)" + log_warning "Your GPU: $GPU_NAME" + if ! confirm_or_default_yes "Continue anyway?"; then + exit 1 + fi + fi + + # Check CUDA + if ! 
check_command nvcc; then + # Check common CUDA install locations + if [ -x "/usr/local/cuda/bin/nvcc" ]; then + export PATH="/usr/local/cuda/bin:$PATH" + log_info "Found CUDA at /usr/local/cuda, added to PATH" + else + log_error "CUDA toolkit not found. Please install CUDA 13.0+" + exit 1 + fi + fi + + CUDA_VERSION=$(nvcc --version | grep "release" | awk '{print $6}' | cut -d',' -f1) + log_info "CUDA version: $CUDA_VERSION" + + # Check for Python development headers (required for Triton build) + PYTHON_INCLUDE="/usr/include/python${PYTHON_VERSION}/patchlevel.h" + if [ ! -f "$PYTHON_INCLUDE" ]; then + log_warning "Python ${PYTHON_VERSION} development headers not found" + log_info "Installing python${PYTHON_VERSION}-dev (requires sudo)..." + if sudo apt-get install -y "python${PYTHON_VERSION}-dev"; then + log_success "python${PYTHON_VERSION}-dev installed" + else + log_error "Failed to install python${PYTHON_VERSION}-dev" + log_error "Please install manually: sudo apt install python${PYTHON_VERSION}-dev" + exit 1 + fi + else + log_info "Python ${PYTHON_VERSION} development headers found" + fi + + # Check disk space (need ~50GB) + AVAILABLE_SPACE=$(df -BG "$HOME" | tail -1 | awk '{print $4}' | sed 's/G//') + if [[ "$AVAILABLE_SPACE" -lt 50 ]]; then + log_error "Insufficient disk space. Need at least 50GB, have ${AVAILABLE_SPACE}GB" + exit 1 + fi + + log_success "Pre-flight checks passed!" +} + +################################################################################ +# Install uv Package Manager +################################################################################ + +install_uv() { + print_header "Step 1/8: Installing uv Package Manager" + + if check_command uv; then + UV_VERSION=$(uv --version | awk '{print $2}') + log_info "uv already installed: v$UV_VERSION" + else + log_info "Installing uv..." + curl -LsSf https://astral.sh/uv/install.sh | sh + export PATH="$HOME/.local/bin:$PATH" + log_success "uv installed successfully" + fi + + # Verify installation + if ! check_command uv; then + log_error "uv installation failed" + exit 1 + fi +} + +################################################################################ +# Create Python Virtual Environment +################################################################################ + +create_venv() { + print_header "Step 2/8: Creating Python Virtual Environment" + + VENV_DIR="$INSTALL_DIR/.vllm" + + if [ -d "$VENV_DIR" ]; then + log_warning "Virtual environment already exists at $VENV_DIR" + if confirm_or_default_yes "Remove and recreate?"; then + rm -rf "$VENV_DIR" + else + log_info "Using existing virtual environment" + return + fi + fi + + log_info "Creating Python $PYTHON_VERSION virtual environment..." + mkdir -p "$INSTALL_DIR" + cd "$INSTALL_DIR" + uv venv .vllm --python "$PYTHON_VERSION" + + # Upgrade setuptools to 77+ so PEP 639 license fields are supported + # (fixes flashinfer-python build failure) + log_info "Upgrading setuptools in venv for PEP 639 license support..." + uv pip install --python "$VENV_DIR/bin/python" --upgrade setuptools + + log_success "Virtual environment created at $VENV_DIR" +} + +################################################################################ +# Install PyTorch +################################################################################ + +install_pytorch() { + print_header "Step 3/8: Installing PyTorch with CUDA 13.0" + + source "$INSTALL_DIR/.vllm/bin/activate" + + log_info "Installing latest PyTorch for cu130..." 
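+    # The cu130 index below serves PyTorch wheels built against CUDA 13.0; on
+    # success, the verification step that follows reports a +cu130 build with
+    # CUDA available.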
+ uv pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu130 + + # Verify PyTorch installation + log_info "Verifying PyTorch installation..." + python -c "import torch; print('PyTorch version:', torch.__version__); print('CUDA available:', torch.cuda.is_available())" + + log_success "PyTorch installed successfully" +} + +################################################################################ +# Clone and Build Triton +################################################################################ + +install_triton() { + print_header "Step 4/8: Installing Triton from Main Branch" + + TRITON_DIR="$INSTALL_DIR/triton" + + if [ -d "$TRITON_DIR" ]; then + log_info "Triton directory exists, updating..." + cd "$TRITON_DIR" + git fetch + else + log_info "Cloning Triton repository..." + cd "$INSTALL_DIR" + git clone https://github.com/triton-lang/triton.git + cd triton + fi + + log_info "Checking out Triton commit $TRITON_VERSION (tested with Blackwell)..." + git checkout "$TRITON_VERSION" + git submodule update --init --recursive + + log_info "Installing Triton build dependencies..." + source "$INSTALL_DIR/.vllm/bin/activate" + uv pip install pip cmake ninja pybind11 + + log_info "Building Triton (this takes ~5 minutes)..." + export TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas + export CMAKE_BUILD_PARALLEL_LEVEL=$(nproc) + python -m pip install --no-build-isolation -v . 2>&1 | tee "$INSTALL_DIR/triton-build.log" + + if [ ${PIPESTATUS[0]} -ne 0 ]; then + log_error "Triton build failed. See $INSTALL_DIR/triton-build.log for details" + exit 1 + fi + + # Record the installed triton version so we can protect it later + TRITON_INSTALLED_VERSION=$(python -c "import triton; print(triton.__version__)" 2>/dev/null || echo "unknown") + log_info "Triton version installed: $TRITON_INSTALLED_VERSION" + + log_success "Triton installed successfully" +} + +################################################################################ +# Install Additional Dependencies +################################################################################ + +install_dependencies() { + print_header "Step 5/8: Installing Additional Dependencies" + + source "$INSTALL_DIR/.vllm/bin/activate" + + log_info "Installing xgrammar, setuptools-scm, and apache-tvm-ffi..." + uv pip install xgrammar setuptools-scm apache-tvm-ffi==0.1.0b15 --prerelease=allow + + log_success "Dependencies installed successfully" +} + +################################################################################ +# Clone vLLM +################################################################################ + +clone_vllm() { + print_header "Step 6/8: Cloning vLLM Repository" + + VLLM_DIR="$INSTALL_DIR/vllm" + + if [ -d "$VLLM_DIR" ]; then + log_warning "vLLM directory already exists at $VLLM_DIR" + if confirm_or_default_yes "Remove and re-clone?"; then + rm -rf "$VLLM_DIR" + else + log_info "Using existing vLLM directory" + cd "$VLLM_DIR" + return + fi + fi + + log_info "Cloning vLLM $VLLM_VERSION..." 
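+    # Clone the full history plus submodules so the pinned commit hash in
+    # VLLM_VERSION can be checked out directly below.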
+ cd "$INSTALL_DIR" + git clone --recursive https://github.com/vllm-project/vllm.git + cd vllm + git checkout "$VLLM_VERSION" + git submodule update --init --recursive + + log_success "vLLM repository cloned" +} + +################################################################################ +# Apply Critical Fixes +################################################################################ + +apply_fixes() { + print_header "Step 7/8: Applying Critical Fixes" + + cd "$INSTALL_DIR/vllm" + + # Fix 1: pyproject.toml license field + log_info "Fixing pyproject.toml license field..." + sed -i 's/^license = "Apache-2.0"$/license = {text = "Apache-2.0"}/' pyproject.toml + sed -i '/^license-files = /d' pyproject.toml + + # Fix 2: CMakeLists.txt SM100/SM120 MOE kernels (check if already applied) + if grep -q 'cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f"' CMakeLists.txt; then + log_info "CMakeLists.txt SM100/SM120 fix already applied" + else + log_info "Applying CMakeLists.txt SM100/SM120 fix..." + sed -i 's/cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f"/cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f"/' CMakeLists.txt + sed -i 's/cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a"/cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;12.1a"/' CMakeLists.txt + fi + + # Fix 3: flashinfer-python license field (pre-emptive fix) + log_info "Pre-fixing flashinfer-python license issue..." + rm -rf "$HOME/.cache/uv/sdists-v9/pypi/flashinfer-python" 2>/dev/null || true + + # Fix 4: GPT-OSS Triton MOE kernels for Qwen3/gpt-oss support + # Try local repo patches/ first, then download from GitHub + PATCH_FILE="" + if [ -f "$SCRIPT_DIR/patches/gpt_oss_triton_moe.patch" ]; then + PATCH_FILE="$SCRIPT_DIR/patches/gpt_oss_triton_moe.patch" + else + log_info "Downloading GPT-OSS Triton MOE patch from repository..." + PATCH_FILE="$INSTALL_DIR/gpt_oss_triton_moe.patch" + if curl -fsSL "$REPO_RAW_URL/patches/gpt_oss_triton_moe.patch" -o "$PATCH_FILE" 2>/dev/null; then + log_info "Patch downloaded successfully" + else + PATCH_FILE="" + log_warning "Could not download GPT-OSS Triton MOE patch (skipping)" + fi + fi + + if [ -n "$PATCH_FILE" ] && [ -f "$PATCH_FILE" ]; then + log_info "Applying GPT-OSS Triton MOE kernel patch for Qwen3/gpt-oss support..." + if patch --dry-run -p1 < "$PATCH_FILE" > /dev/null 2>&1; then + patch -p1 < "$PATCH_FILE" + log_success "GPT-OSS Triton MOE kernel patch applied" + else + log_warning "GPT-OSS Triton MOE kernel patch already applied or conflicts" + fi + fi + + # Configure use_existing_torch + log_info "Configuring vLLM to use existing PyTorch..." + python3 use_existing_torch.py + + log_success "All fixes applied successfully" +} + +################################################################################ +# Build and Install vLLM +################################################################################ + +build_vllm() { + print_header "Step 8/8: Building vLLM (15-20 minutes)" + + cd "$INSTALL_DIR/vllm" + source "$INSTALL_DIR/.vllm/bin/activate" + + # Set environment variables + export TORCH_CUDA_ARCH_LIST=12.1a + export VLLM_USE_FLASHINFER_MXFP4_MOE=1 + export TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas + + # Create a constraints file to prevent uv from replacing our + # custom-built Triton with a PyPI version + log_info "Creating constraints to protect pinned Triton build..." 
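+    # The constraints file is passed to the build step below via UV_CONSTRAINT,
+    # which uv honors like a pip constraints file: any triton requirement pulled
+    # in by vLLM's dependencies is held to the version pinned here, so the
+    # source-built Triton is not silently replaced by a PyPI wheel.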
+ TRITON_CONSTRAINT="$INSTALL_DIR/constraints.txt" + TRITON_INSTALLED=$(python -c "import importlib.metadata; print(importlib.metadata.version('triton'))" 2>/dev/null || echo "") + if [ -n "$TRITON_INSTALLED" ]; then + echo "triton==${TRITON_INSTALLED}" > "$TRITON_CONSTRAINT" + log_info "Pinning triton==${TRITON_INSTALLED} during vLLM build" + else + echo "" > "$TRITON_CONSTRAINT" + log_warning "Could not detect installed Triton version" + fi + + log_info "Starting vLLM build..." + log_warning "This will take 15-20 minutes. Go grab a coffee!" + + set +e # Don't exit on error, we'll handle it + UV_CONSTRAINT="$TRITON_CONSTRAINT" uv pip install \ + --no-build-isolation --prerelease=allow -e . \ + 2>&1 | tee "$INSTALL_DIR/vllm-build.log" + BUILD_STATUS=${PIPESTATUS[0]} + set -e + + if [ $BUILD_STATUS -ne 0 ]; then + if grep -q "flashinfer.*license.*must be valid" "$INSTALL_DIR/vllm-build.log"; then + log_warning "Build failed due to flashinfer-python license issue" + log_info "Upgrading setuptools and retrying..." + + # Ensure setuptools is new enough + uv pip install --upgrade setuptools + + # Also patch the cached flashinfer pyproject.toml as a belt-and-suspenders fix + find "$HOME/.cache/uv/sdists-v9/pypi/flashinfer-python" -name "pyproject.toml" 2>/dev/null | while read f; do + sed -i 's/^license = "Apache-2.0"$/license = {text = "Apache-2.0"}/' "$f" + sed -i '/^license-files = /d' "$f" + done + + log_info "Retrying vLLM build..." + UV_CONSTRAINT="$TRITON_CONSTRAINT" uv pip install \ + --no-build-isolation --prerelease=allow -e . + else + log_error "vLLM build failed. See $INSTALL_DIR/vllm-build.log for details" + exit 1 + fi + fi + + # Verify Triton wasn't replaced + TRITON_AFTER=$(python -c "import importlib.metadata; print(importlib.metadata.version('triton'))" 2>/dev/null || echo "unknown") + if [ -n "$TRITON_INSTALLED" ] && [ "$TRITON_AFTER" != "$TRITON_INSTALLED" ]; then + log_warning "Triton was changed during vLLM install: $TRITON_INSTALLED -> $TRITON_AFTER" + log_warning "Rebuilding pinned Triton from source..." + cd "$INSTALL_DIR/triton" + git checkout "$TRITON_VERSION" + export CMAKE_BUILD_PARALLEL_LEVEL=$(nproc) + python -m pip install --no-build-isolation --force-reinstall -v . + cd "$INSTALL_DIR/vllm" + fi + + log_success "vLLM built successfully!" +} + +################################################################################ +# Create Helper Scripts +################################################################################ + +create_helper_scripts() { + print_header "Creating Helper Scripts" + + # Create environment activation script + log_info "Creating vllm_env.sh..." 
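+    # The heredoc delimiter is quoted ('ENVEOF'), so $SCRIPT_DIR, $PATH, and
+    # related variables are written literally and only expand when vllm_env.sh
+    # is sourced.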
+ cat > "$INSTALL_DIR/vllm_env.sh" << 'ENVEOF' +#!/bin/bash +# vLLM Environment Configuration for DGX Spark +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "$SCRIPT_DIR/.vllm/bin/activate" +export TORCH_CUDA_ARCH_LIST=12.1a +export VLLM_USE_FLASHINFER_MXFP4_MOE=1 +CUDA_PATH=$(ls -d /usr/local/cuda* 2>/dev/null | head -1) +export TRITON_PTXAS_PATH="$CUDA_PATH/bin/ptxas" +export PATH="$CUDA_PATH/bin:$PATH" +export LD_LIBRARY_PATH="$CUDA_PATH/lib64:$LD_LIBRARY_PATH" +# Cache tiktoken encodings to avoid re-downloading +export TIKTOKEN_CACHE_DIR="$SCRIPT_DIR/.tiktoken_cache" +mkdir -p "$TIKTOKEN_CACHE_DIR" +echo "=== vLLM Environment Active ===" +echo "Virtual env: $VIRTUAL_ENV" +echo "CUDA arch: $TORCH_CUDA_ARCH_LIST" +echo "Python: $(which python)" +echo "===============================" +ENVEOF + chmod +x "$INSTALL_DIR/vllm_env.sh" + + # Create vllm-serve.sh (embedded so it works with curl|bash) + log_info "Creating vllm-serve.sh..." + cat > "$INSTALL_DIR/vllm-serve.sh" << 'SERVEEOF' +#!/bin/bash +# vLLM Server Startup Script for DGX Spark +# Usage: ./vllm-serve.sh [port] + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +MODEL="${1:-Qwen/Qwen2.5-0.5B-Instruct}" +PORT="${2:-8000}" +VLLM_DIR="$SCRIPT_DIR/vllm" +ENV_SCRIPT="$SCRIPT_DIR/vllm_env.sh" +PID_FILE="$SCRIPT_DIR/.vllm-server.pid" +LOG_FILE="$SCRIPT_DIR/vllm-server.log" + +# Check if server is already running +if [ -f "$PID_FILE" ]; then + PID=$(cat "$PID_FILE") + if ps -p $PID > /dev/null 2>&1; then + echo "ERROR: vLLM server is already running (PID: $PID)" + echo "Use ./vllm-stop.sh to stop it first" + exit 1 + fi +fi + +# Source environment +source "$ENV_SCRIPT" + +echo "----------------------------------------------------------------------" +echo "Starting vLLM Server on DGX Spark" +echo "----------------------------------------------------------------------" +echo "Model: $MODEL" +echo "Port: $PORT" +echo "Log file: $LOG_FILE" +echo "PID file: $PID_FILE" +echo "----------------------------------------------------------------------" + +# Start server in background +cd "$VLLM_DIR" +nohup python -m vllm.entrypoints.openai.api_server \ + --model "$MODEL" \ + --trust-remote-code \ + --host 0.0.0.0 \ + --port "$PORT" \ + --gpu-memory-utilization 0.9 \ + > "$LOG_FILE" 2>&1 & + +echo $! > "$PID_FILE" +echo "OK: Server started with PID: $(cat $PID_FILE)" +echo "OK: Waiting for server to be ready..." + +sleep 5 +if ps -p $(cat "$PID_FILE") > /dev/null 2>&1; then + echo "OK: Server is running!" + echo "" + echo "Test with: curl http://localhost:$PORT/v1/models" + echo "View logs: tail -f $LOG_FILE" + echo "Stop server: ./vllm-stop.sh" +else + echo "ERROR: Server failed to start. Check logs: $LOG_FILE" + rm -f "$PID_FILE" + exit 1 +fi +SERVEEOF + chmod +x "$INSTALL_DIR/vllm-serve.sh" + + # Create vllm-stop.sh + log_info "Creating vllm-stop.sh..." + cat > "$INSTALL_DIR/vllm-stop.sh" << 'STOPEOF' +#!/bin/bash +# vLLM Server Stop Script for DGX Spark + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PID_FILE="$SCRIPT_DIR/.vllm-server.pid" + +if [ ! -f "$PID_FILE" ]; then + echo "No vLLM server PID file found. Server may not be running." + exit 0 +fi + +PID=$(cat "$PID_FILE") + +if ! ps -p $PID > /dev/null 2>&1; then + echo "vLLM server (PID: $PID) is not running. Cleaning up PID file." + rm -f "$PID_FILE" + exit 0 +fi + +echo "Stopping vLLM server (PID: $PID)..." +kill $PID + +for i in {1..10}; do + if ! 
ps -p $PID > /dev/null 2>&1; then + echo "OK: Server stopped successfully" + rm -f "$PID_FILE" + exit 0 + fi + sleep 1 +done + +if ps -p $PID > /dev/null 2>&1; then + echo "Server did not stop gracefully. Force killing..." + kill -9 $PID + sleep 1 + if ! ps -p $PID > /dev/null 2>&1; then + echo "OK: Server force stopped" + rm -f "$PID_FILE" + else + echo "ERROR: Failed to stop server" + exit 1 + fi +fi +STOPEOF + chmod +x "$INSTALL_DIR/vllm-stop.sh" + + # Create vllm-status.sh + log_info "Creating vllm-status.sh..." + cat > "$INSTALL_DIR/vllm-status.sh" << 'STATUSEOF' +#!/bin/bash +# vLLM Server Status Script for DGX Spark + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PID_FILE="$SCRIPT_DIR/.vllm-server.pid" +LOG_FILE="$SCRIPT_DIR/vllm-server.log" + +echo "----------------------------------------------------------------------" +echo "vLLM Server Status on DGX Spark" +echo "----------------------------------------------------------------------" + +if [ ! -f "$PID_FILE" ]; then + echo "Status: NOT RUNNING (no PID file found)" + exit 0 +fi + +PID=$(cat "$PID_FILE") + +if ! ps -p $PID > /dev/null 2>&1; then + echo "Status: NOT RUNNING (stale PID file)" + echo "Cleaning up PID file..." + rm -f "$PID_FILE" + exit 0 +fi + +echo "Status: RUNNING" +echo "PID: $PID" +echo "Started: $(ps -p $PID -o lstart= 2>/dev/null || echo 'Unknown')" +echo "CPU: $(ps -p $PID -o %cpu= 2>/dev/null || echo 'N/A')%" +echo "Memory: $(ps -p $PID -o %mem= 2>/dev/null || echo 'N/A')%" +echo "" + +if [ -f "$LOG_FILE" ]; then + echo "Recent log entries (last 10 lines):" + echo "----------------------------------------------------------------------" + tail -n 10 "$LOG_FILE" +else + echo "Log file not found: $LOG_FILE" +fi + +echo "" +echo "----------------------------------------------------------------------" +STATUSEOF + chmod +x "$INSTALL_DIR/vllm-status.sh" + + log_success "Helper scripts created in $INSTALL_DIR" +} + +################################################################################ +# Post-Installation Tests +################################################################################ + +run_tests() { + if [ "$SKIP_TESTS" = true ]; then + log_info "Skipping post-installation tests" + return + fi + + print_header "Post-Installation Tests" + + source "$INSTALL_DIR/vllm_env.sh" + + log_info "Test 1: Import vLLM..." + python -c "import vllm; print('vLLM version:', vllm.__version__)" + + log_info "Test 2: Check CUDA availability..." + python -c "import torch; assert torch.cuda.is_available(), 'CUDA not available'; print('CUDA available')" + + log_info "Test 3: Check GPU detection..." + python -c "import torch; print('GPU count:', torch.cuda.device_count()); print('GPU name:', torch.cuda.get_device_name(0))" + + log_success "All tests passed!" 
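+
+    # Optional deeper smoke test (not run automatically; even the small default model
+    # takes a minute or two to load). Something like this should work offline:
+    #   python -c "from vllm import LLM; print(LLM('Qwen/Qwen2.5-0.5B-Instruct').generate('Hello')[0].outputs[0].text)"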
+} + +################################################################################ +# Parse Command Line Arguments +################################################################################ + +parse_args() { + while [[ $# -gt 0 ]]; do + case $1 in + --install-dir) + INSTALL_DIR="$2" + shift 2 + ;; + --vllm-version) + VLLM_VERSION="$2" + shift 2 + ;; + --python-version) + PYTHON_VERSION="$2" + shift 2 + ;; + --skip-tests) + SKIP_TESTS=true + shift + ;; + --help) + head -20 "$0" | grep "^#" | sed 's/^# //' + exit 0 + ;; + *) + log_error "Unknown option: $1" + log_info "Use --help for usage information" + exit 1 + ;; + esac + done +} + +################################################################################ +# Main Installation Flow +################################################################################ + +main() { + parse_args "$@" + + print_header "vLLM Installation for DGX Spark (Blackwell GB10)" + log_info "Installation directory: $INSTALL_DIR" + log_info "vLLM version: $VLLM_VERSION" + log_info "Python version: $PYTHON_VERSION" + echo "" + + preflight_checks + install_uv + create_venv + install_pytorch + install_triton + install_dependencies + clone_vllm + apply_fixes + build_vllm + create_helper_scripts + run_tests + + print_header "Installation Complete!" + echo "" + log_success "vLLM has been successfully installed!" + echo "" + echo -e "${GREEN}Next steps:${NC}" + echo "1. Activate the environment:" + echo " ${BLUE}source $INSTALL_DIR/vllm_env.sh${NC}" + echo "" + echo "2. Start vLLM server:" + echo " ${BLUE}cd $INSTALL_DIR${NC}" + echo " ${BLUE}./vllm-serve.sh${NC}" + echo "" + echo "3. Test the API:" + echo " ${BLUE}curl http://localhost:8000/v1/models${NC}" + echo "" + echo "For more information, see README.md" + echo "" +} + +# Run main function +main "$@" diff --git a/patches/gpt_oss_triton_moe.patch b/patches/gpt_oss_triton_moe.patch new file mode 100644 index 0000000..353539f --- /dev/null +++ b/patches/gpt_oss_triton_moe.patch @@ -0,0 +1,77 @@ +diff --git a/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py b/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py +index badedfc54..e05c0eea4 100644 +--- a/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py ++++ b/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py +@@ -20,9 +20,16 @@ logger = init_logger(__name__) + if has_triton_kernels(): + try: + import triton_kernels.swiglu +- from triton_kernels.matmul_ogs import FnSpecs, FusedActivation, matmul_ogs +- from triton_kernels.routing import RoutingData, routing, routing_from_bitmatrix +- from triton_kernels.tensor import Bitmatrix ++ from triton_kernels.matmul_ogs import ( ++ FnSpecs, ++ FusedActivation, ++ GatherIndx, ++ RoutingData, ++ ScatterIndx, ++ matmul_ogs, ++ ) ++ from triton_kernels.tensor import BIT, Bitmatrix, SparseMatrix, make_ragged_tensor_metadata ++ from triton_kernels.topk import topk as triton_topk + except (AttributeError, ImportError) as e: + logger.error( + "Failed to import Triton kernels. 
Please make sure your triton " +@@ -84,8 +91,17 @@ def triton_kernel_moe_forward( + global_num_experts: int = -1, + expert_map: torch.Tensor | None = None, + ) -> torch.Tensor: +- routing_data, gather_idx, scatter_idx = routing( +- gating_output, topk, sm_first=not renormalize ++ # Use new topk API instead of deprecated routing ++ sm_first = not renormalize ++ if sm_first: ++ gating_output = torch.softmax(gating_output, dim=-1) ++ sparse_logits = triton_topk( ++ gating_output, topk, apply_softmax=not sm_first, y_indx=None, n_rows=None ++ ) ++ ++ # Convert to routing data using the existing make_routing_data function ++ routing_data, gather_idx, scatter_idx = make_routing_data( ++ sparse_logits.indx, sparse_logits.vals, gating_output.shape[-1] + ) + + return triton_kernel_fused_experts( +@@ -202,14 +218,29 @@ def make_routing_data( + bitmatrix_shape = [n_rows, bm_cols * 32] + bitmatrix_shape_max = [n_rows, None] + bitmatrix = Bitmatrix( +- bitmatrix, shape=bitmatrix_shape, shape_max=bitmatrix_shape_max, scratchpad=None ++ bitmatrix, dtype=BIT, shape=bitmatrix_shape, shape_max=bitmatrix_shape_max + ) + + # matmul_ogs expects invalid topk_weights to be -1s + topk_weights = torch.where(topk_ids == -1, -1.0, topk_weights) +- routing_data, gather_indx, scatter_indx = routing_from_bitmatrix( +- bitmatrix, topk_weights, topk_ids, num_local_experts, num_topk ++ ++ # Use new SparseMatrix API instead of deprecated routing_from_bitmatrix ++ sparse_logits = SparseMatrix(indx=topk_ids, vals=topk_weights, mask=bitmatrix) ++ dispatch_indx = sparse_logits.mask_metadata.col_sorted_indx ++ combine_indx = sparse_logits.mask_metadata.row_sorted_indx ++ ragged_batch_metadata = make_ragged_tensor_metadata( ++ sparse_logits.mask_metadata.col_sum, dispatch_indx.shape[0] ++ ) ++ gate_scal = sparse_logits.vals.flatten()[combine_indx] ++ routing_data = RoutingData( ++ gate_scal, ++ ragged_batch_metadata.block_sizes, ++ num_local_experts, ++ num_topk, ++ ragged_batch_metadata, + ) ++ gather_indx = GatherIndx(combine_indx, dispatch_indx) ++ scatter_indx = ScatterIndx(dispatch_indx, combine_indx) + + return routing_data, gather_indx, scatter_indx + diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..1f770ba --- /dev/null +++ b/requirements.txt @@ -0,0 +1,28 @@ +# Core Dependencies for vLLM on DGX Spark (Blackwell GB10) +# Note: This file is for reference only. The install.sh script handles +# all dependency installation with proper ordering and build flags. 
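+#
+# If you do install from this file directly (not the supported path), a rough
+# approximation of what install.sh does, minus the source builds, would be:
+#   uv pip install --prerelease=allow -r requirements.txt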
+
+# PyTorch with CUDA 13.0 support (installs latest available on cu130 index)
+--index-url https://download.pytorch.org/whl/cu130
+torch
+torchvision
+torchaudio
+
+# Triton (must be built from source - see install.sh)
+# Pinned to commit 4caa0328bf8df64896dd5f6fb9df41b0eb2e750a
+# triton @ git+https://github.com/triton-lang/triton.git@4caa0328
+
+# vLLM dependencies
+xgrammar>=0.1.26
+setuptools-scm>=9.2.2
+setuptools>=77.0.0 # Required for PEP 639 license field support
+apache-tvm-ffi==0.1.0b15 # Pre-release required
+
+# Build dependencies
+pybind11>=3.0.0
+ninja>=1.13.0
+
+# Optional but recommended
+flashinfer-python>=0.4.1
+transformers>=4.57.0
+huggingface-hub>=0.36.0
diff --git a/scripts/vllm-serve.sh b/scripts/vllm-serve.sh
new file mode 100644
index 0000000..622a6b6
--- /dev/null
+++ b/scripts/vllm-serve.sh
@@ -0,0 +1,67 @@
+#!/bin/bash
+# vLLM Server Startup Script for DGX Spark
+# Usage: ./vllm-serve.sh [model] [port]
+
+set -e
+
+# Determine installation directory (where this script is located)
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+# Configuration
+MODEL="${1:-Qwen/Qwen2.5-0.5B-Instruct}"
+PORT="${2:-8000}"
+VLLM_DIR="$SCRIPT_DIR/vllm"
+ENV_SCRIPT="$SCRIPT_DIR/vllm_env.sh"
+PID_FILE="$SCRIPT_DIR/.vllm-server.pid"
+LOG_FILE="$SCRIPT_DIR/vllm-server.log"
+
+# Check if server is already running
+if [ -f "$PID_FILE" ]; then
+    PID=$(cat "$PID_FILE")
+    if ps -p $PID > /dev/null 2>&1; then
+        echo "ERROR: vLLM server is already running (PID: $PID)"
+        echo "Use ./vllm-stop.sh to stop it first"
+        exit 1
+    fi
+fi
+
+# Source environment
+source "$ENV_SCRIPT"
+
+echo "----------------------------------------------------------------------"
+echo "Starting vLLM Server on DGX Spark"
+echo "----------------------------------------------------------------------"
+echo "Model: $MODEL"
+echo "Port: $PORT"
+echo "Log file: $LOG_FILE"
+echo "PID file: $PID_FILE"
+echo "----------------------------------------------------------------------"
+
+# Start server in background
+cd "$VLLM_DIR"
+nohup python -m vllm.entrypoints.openai.api_server \
+    --model "$MODEL" \
+    --trust-remote-code \
+    --host 0.0.0.0 \
+    --port "$PORT" \
+    --gpu-memory-utilization 0.9 \
+    > "$LOG_FILE" 2>&1 &
+
+# Save PID
+echo $! > "$PID_FILE"
+echo "OK: Server started with PID: $(cat $PID_FILE)"
+echo "OK: Waiting for server to be ready..."
+
+# Wait for server to be ready
+sleep 5
+if ps -p $(cat "$PID_FILE") > /dev/null 2>&1; then
+    echo "OK: Server is running!"
+    echo ""
+    echo "Test with: curl http://localhost:$PORT/v1/models"
+    echo "View logs: tail -f $LOG_FILE"
+    echo "Stop server: ./vllm-stop.sh"
+else
+    echo "ERROR: Server failed to start. Check logs: $LOG_FILE"
+    rm -f "$PID_FILE"
+    exit 1
+fi
diff --git a/scripts/vllm-status.sh b/scripts/vllm-status.sh
new file mode 100644
index 0000000..8ddc249
--- /dev/null
+++ b/scripts/vllm-status.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+# vLLM Server Status Script for DGX Spark
+
+# Determine installation directory (where this script is located)
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+PID_FILE="$SCRIPT_DIR/.vllm-server.pid"
+LOG_FILE="$SCRIPT_DIR/vllm-server.log"
+
+echo "----------------------------------------------------------------------"
+echo "vLLM Server Status on DGX Spark"
+echo "----------------------------------------------------------------------"
+
+if [ ! -f "$PID_FILE" ]; then
+    echo "Status: NOT RUNNING (no PID file found)"
+    exit 0
+fi
+
+PID=$(cat "$PID_FILE")
+
+if ! ps -p $PID > /dev/null 2>&1; then
+    echo "Status: NOT RUNNING (stale PID file)"
+    echo "Cleaning up PID file..."
+    rm -f "$PID_FILE"
+    exit 0
+fi
+
+echo "Status: RUNNING"
+echo "PID: $PID"
+echo "Started: $(ps -p $PID -o lstart= 2>/dev/null || echo 'Unknown')"
+echo "CPU: $(ps -p $PID -o %cpu= 2>/dev/null || echo 'N/A')%"
+echo "Memory: $(ps -p $PID -o %mem= 2>/dev/null || echo 'N/A')%"
+echo ""
+
+# Check if log file exists and show last few lines
+if [ -f "$LOG_FILE" ]; then
+    echo "Recent log entries (last 10 lines):"
+    echo "----------------------------------------------------------------------"
+    tail -n 10 "$LOG_FILE"
+else
+    echo "Log file not found: $LOG_FILE"
+fi
+
+echo ""
+echo "----------------------------------------------------------------------"
diff --git a/scripts/vllm-stop.sh b/scripts/vllm-stop.sh
new file mode 100644
index 0000000..20bf1f3
--- /dev/null
+++ b/scripts/vllm-stop.sh
@@ -0,0 +1,47 @@
+#!/bin/bash
+# vLLM Server Stop Script for DGX Spark
+
+# Determine installation directory (where this script is located)
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+PID_FILE="$SCRIPT_DIR/.vllm-server.pid"
+
+if [ ! -f "$PID_FILE" ]; then
+    echo "No vLLM server PID file found. Server may not be running."
+    exit 0
+fi
+
+PID=$(cat "$PID_FILE")
+
+if ! ps -p $PID > /dev/null 2>&1; then
+    echo "vLLM server (PID: $PID) is not running. Cleaning up PID file."
+    rm -f "$PID_FILE"
+    exit 0
+fi
+
+echo "Stopping vLLM server (PID: $PID)..."
+kill $PID
+
+# Wait for process to terminate
+for i in {1..10}; do
+    if ! ps -p $PID > /dev/null 2>&1; then
+        echo "OK: Server stopped successfully"
+        rm -f "$PID_FILE"
+        exit 0
+    fi
+    sleep 1
+done
+
+# Force kill if still running
+if ps -p $PID > /dev/null 2>&1; then
+    echo "Server did not stop gracefully. Force killing..."
+    kill -9 $PID
+    sleep 1
+    if ! ps -p $PID > /dev/null 2>&1; then
+        echo "OK: Server force stopped"
+        rm -f "$PID_FILE"
+    else
+        echo "ERROR: Failed to stop server"
+        exit 1
+    fi
+fi
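+
+# Typical lifecycle with the companion scripts (assuming they sit next to
+# vllm_env.sh in the directory created by install.sh):
+#   ./vllm-serve.sh Qwen/Qwen2.5-0.5B-Instruct 8000
+#   ./vllm-status.sh
+#   ./vllm-stop.sh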