first commit

.gitignore (vendored, new file, 77 lines)
@@ -0,0 +1,77 @@
# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# Virtual Environments
.venv
.vllm/
venv/
ENV/
env/

# IDEs
.vscode/
.idea/
*.swp
*.swo
*~

# OS
.DS_Store
Thumbs.db

# Logs
*.log
vllm-server.log
*.out
*.err

# Build artifacts
*.o
*.a
*.so
*.dylib
*.dll
CMakeCache.txt
CMakeFiles/
cmake_install.cmake
Makefile

# CUDA
*.ptx
*.cubin

# Local installation directories
triton/
vllm/
.cache/

# Temporary files
tmp/
temp/
*.tmp
*.bak

# Model downloads
models/
*.safetensors
*.bin
*.gguf

CLUSTER.md (new file, 380 lines)
@@ -0,0 +1,380 @@
# vLLM Cluster Mode Setup for DGX Spark

This guide covers setting up a multi-node vLLM deployment on DGX Spark systems using distributed inference.

## Prerequisites

- Multiple DGX Spark systems with vLLM installed (use `install.sh` on each node)
- All nodes on the same network with direct connectivity
- SSH access between nodes (passwordless SSH recommended)
- The same CUDA and vLLM versions on all nodes

## Architecture

```
┌─────────────────────┐
│     spark-alpha     │
│    (Master/Head)    │
│  - API Server       │
│  - Request Router   │
│  - Model Weights    │
└──────────┬──────────┘
           │
           ├──────────────────────────┐
           │                          │
┌──────────▼──────────┐    ┌──────────▼──────────┐
│     spark-omega     │    │     spark-gamma     │
│     (Worker 1)      │    │     (Worker 2)      │
│  - Inference        │    │  - Inference        │
│  - GPU Compute      │    │  - GPU Compute      │
└─────────────────────┘    └─────────────────────┘
```

## Step 1: Install vLLM on All Nodes

Run the installer on each node:

```bash
# On spark-alpha (master)
curl -fsSL https://raw.githubusercontent.com/eelbaz/dgx-spark-vllm-setup/main/install.sh | bash

# On spark-omega (worker 1)
ssh spark-omega.local
curl -fsSL https://raw.githubusercontent.com/eelbaz/dgx-spark-vllm-setup/main/install.sh | bash

# On spark-gamma (worker 2)
ssh spark-gamma.local
curl -fsSL https://raw.githubusercontent.com/eelbaz/dgx-spark-vllm-setup/main/install.sh | bash
```

## Step 2: Configure Network Settings

Ensure all nodes can communicate on the required ports:

- **8000**: vLLM API server (master only)
- **29500**: PyTorch distributed backend (all nodes)
- **Random ports**: Ray cluster communication

Open the firewall if needed:

```bash
# On all nodes
sudo ufw allow 8000/tcp
sudo ufw allow 29500/tcp
sudo ufw allow 6379/tcp   # Ray GCS
sudo ufw allow 8265/tcp   # Ray Dashboard
```

## Step 3: Set Up Passwordless SSH (Optional but Recommended)

```bash
# On master node
ssh-keygen -t rsa -b 4096 -f ~/.ssh/id_rsa -N ""

# Copy to worker nodes
ssh-copy-id spark-omega.local
ssh-copy-id spark-gamma.local

# Verify
ssh spark-omega.local "echo 'Connection successful'"
ssh spark-gamma.local "echo 'Connection successful'"
```

## Step 4: Start Ray Cluster

### On Master Node (spark-alpha)

```bash
# Assuming vllm-install is in your home directory
source ~/vllm-install/vllm_env.sh

# Start Ray head node
ray start --head \
    --port=6379 \
    --dashboard-host=0.0.0.0 \
    --dashboard-port=8265 \
    --num-gpus=1

# Note the output: "To connect to this Ray cluster, use: ray start --address='MASTER_IP:6379'"
```

### On Worker Nodes (spark-omega, spark-gamma)

```bash
source ~/vllm-install/vllm_env.sh

# Replace MASTER_IP with spark-alpha's IP address
ray start --address='MASTER_IP:6379' --num-gpus=1
```

Verify cluster status:

```bash
ray status
```

You should see all nodes listed.
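
If you prefer a programmatic check, the short script below queries the running cluster from the master node and prints per-node GPU availability. It is a minimal sketch using Ray's public Python API (`ray.init`, `ray.nodes`, `ray.cluster_resources`); the expected GPU count of 3 matches the example three-node topology above and should be adjusted for your cluster.

```python
#!/usr/bin/env python3
"""Sanity check that all Ray nodes and GPUs have joined the cluster."""
import ray

# Attach to the cluster started with `ray start --head` on this node
ray.init(address="auto")

alive = [n for n in ray.nodes() if n["Alive"]]
print(f"Alive nodes: {len(alive)}")
for node in alive:
    res = node["Resources"]
    print(f"  {node['NodeManagerAddress']}: GPUs={res.get('GPU', 0)}, CPUs={res.get('CPU', 0)}")

total_gpus = ray.cluster_resources().get("GPU", 0)
print(f"Cluster total GPUs: {total_gpus}")

# One GPU per DGX Spark node; 3 nodes in the example topology above
assert total_gpus >= 3, "Fewer GPUs than expected have joined the cluster"
```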

## Step 5: Start vLLM with Tensor/Pipeline Parallelism

### Method 1: Tensor Parallelism (Recommended for Large Models)

Tensor parallelism splits each layer's weights across multiple GPUs.

```bash
# On master node
source ~/vllm-install/vllm_env.sh

vllm serve \
    --model "meta-llama/Llama-3.1-70B-Instruct" \
    --tensor-parallel-size 2 \
    --trust-remote-code \
    --host 0.0.0.0 \
    --port 8000
```

This automatically distributes the model across 2 GPUs in the Ray cluster.

### Method 2: Pipeline Parallelism

Pipeline parallelism splits the model into sequential stages, one per GPU.

```bash
vllm serve \
    --model "meta-llama/Llama-3.1-70B-Instruct" \
    --pipeline-parallel-size 2 \
    --trust-remote-code \
    --host 0.0.0.0 \
    --port 8000
```

### Method 3: Combined Parallelism

For very large models, combine tensor and pipeline parallelism. The total number of GPUs required is tensor-parallel-size × pipeline-parallel-size (8 in this example):

```bash
vllm serve \
    --model "meta-llama/Llama-3.1-405B-Instruct" \
    --tensor-parallel-size 4 \
    --pipeline-parallel-size 2 \
    --trust-remote-code \
    --host 0.0.0.0 \
    --port 8000
```

## Step 6: Test Cluster Inference

```bash
# Test from master node
curl http://localhost:8000/v1/models

# Test from external machine
curl http://spark-alpha.local:8000/v1/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "meta-llama/Llama-3.1-70B-Instruct",
    "prompt": "Explain distributed inference in 3 sentences.",
    "max_tokens": 100,
    "temperature": 0.7
  }'
```

## Step 7: Monitor Cluster

### Ray Dashboard

Access it at http://spark-alpha.local:8265.

It shows:
- Node status and resources
- Task execution
- GPU utilization
- Memory usage

### vLLM Metrics

```bash
# On master node
tail -f ~/vllm-install/vllm-server.log

# Check GPU usage on each node over SSH
for node in spark-omega.local spark-gamma.local; do ssh "$node" nvidia-smi; done
```

### System Monitoring

```bash
# Check Ray cluster status
ray status

# Monitor GPU usage on a specific node
ssh spark-omega.local nvidia-smi -l 1
```

## Troubleshooting

### Workers Not Connecting

**Problem**: Workers can't connect to the Ray head node

**Solutions**:
1. Check the firewall: `sudo ufw status`
2. Verify the head node IP: `ray status` on the master
3. Check network connectivity: `ping spark-alpha.local`
4. Ensure the same Ray version on all nodes: `ray --version`

### OOM Errors with Large Models

**Problem**: Out of memory when loading large models

**Solutions**:
1. Increase tensor parallelism: `--tensor-parallel-size 4`
2. Reduce memory utilization: `--gpu-memory-utilization 0.8`
3. Enable CPU offloading: `--cpu-offload-gb 8`
4. Use quantization: `--quantization awq` or `--quantization gptq`

### Model Loading Hangs

**Problem**: Model download/loading takes forever

**Solutions**:
1. Pre-download the model on all nodes:
   ```bash
   # On each node
   python -c "from transformers import AutoModel; AutoModel.from_pretrained('meta-llama/Llama-3.1-70B-Instruct')"
   ```
2. Use shared storage (NFS) for the model cache
3. Check network bandwidth between nodes

### Uneven GPU Utilization

**Problem**: Some GPUs idle while others are maxed out

**Solutions**:
1. Verify the tensor parallel configuration
2. Check Ray resource allocation: `ray status`
3. Ensure balanced request distribution
4. Monitor per-node GPU usage, e.g. `ssh spark-omega.local nvidia-smi`

## Advanced Configuration

### Custom Ray Resources

Assign custom resources to nodes for fine-grained control:

```bash
# On a worker with high memory
ray start --address='MASTER_IP:6379' \
    --num-gpus=1 \
    --resources='{"highmem": 1}'

# Use in vLLM
vllm serve --model "..." --placement-group-resources='{"highmem": 1}'
```

### Distributed Model Cache

Share model weights via NFS to avoid redundant downloads:

```bash
# On the NFS server (e.g., the master)
sudo apt install nfs-kernel-server
echo "$HOME/.cache/huggingface *(rw,sync,no_subtree_check)" | sudo tee -a /etc/exports
sudo exportfs -a

# On the workers
sudo apt install nfs-common
sudo mkdir -p $HOME/.cache/huggingface
sudo mount spark-alpha.local:$HOME/.cache/huggingface $HOME/.cache/huggingface
```

### Load Balancing with nginx

For production deployments, use nginx to load balance across multiple vLLM instances:

```nginx
upstream vllm_cluster {
    least_conn;
    server spark-alpha.local:8000;
    server spark-omega.local:8000;
    server spark-gamma.local:8000;
}

server {
    listen 80;
    location / {
        proxy_pass http://vllm_cluster;
        proxy_set_header Host $host;
    }
}
```

## Cluster Management Scripts

### Start Cluster

Create `start-cluster.sh`:

```bash
#!/bin/bash
# Start the Ray cluster on all nodes

ssh spark-alpha.local "source ~/vllm-install/vllm_env.sh && ray start --head --port=6379"
sleep 5

MASTER_IP=$(ssh spark-alpha.local "hostname -I | awk '{print \$1}'")

ssh spark-omega.local "source ~/vllm-install/vllm_env.sh && ray start --address='${MASTER_IP}:6379'"
ssh spark-gamma.local "source ~/vllm-install/vllm_env.sh && ray start --address='${MASTER_IP}:6379'"

echo "Cluster started. Check status with: ray status"
```

### Stop Cluster

Create `stop-cluster.sh`:

```bash
#!/bin/bash
# Stop the Ray cluster on all nodes

for node in spark-alpha.local spark-omega.local spark-gamma.local; do
    echo "Stopping Ray on $node..."
    ssh $node "ray stop --force"
done

echo "Cluster stopped."
```

## Performance Tuning

### For Maximum Throughput

```bash
vllm serve \
    --model "meta-llama/Llama-3.1-70B-Instruct" \
    --tensor-parallel-size 2 \
    --max-num-seqs 256 \
    --max-num-batched-tokens 8192 \
    --gpu-memory-utilization 0.95
```

### For Low Latency

```bash
vllm serve \
    --model "meta-llama/Llama-3.1-70B-Instruct" \
    --tensor-parallel-size 2 \
    --max-num-seqs 32 \
    --disable-log-requests
```

## References

- [vLLM Distributed Inference](https://docs.vllm.ai/en/latest/serving/distributed_serving.html)
- [Ray Cluster Setup](https://docs.ray.io/en/latest/cluster/getting-started.html)
- [PyTorch Distributed](https://pytorch.org/tutorials/beginner/dist_overview.html)

## Support

For issues specific to the DGX Spark cluster setup, please open an issue on GitHub.

CRITICAL_FIX_ANALYSIS.md (new file, 134 lines)
@@ -0,0 +1,134 @@
# Critical Blackwell GB10 Fixes for vLLM

## Overview

Three critical fixes are required for vLLM on Blackwell GB10 (sm_121a) GPUs with CUDA 13.0+:

1. **CMakeLists.txt SM120 Support** - Add the missing architecture
2. **vLLM Commit Version** - Use a commit with the Blackwell/Triton fixes
3. **Triton Version Pinning** - Use a tested working commit

## Fix 1: CMakeLists.txt SM120 Support

### Root Cause

The vLLM v0.11.1rc3 CMakeLists.txt has **incomplete architecture support** for Blackwell GB10 (sm_121a) MOE kernels when building with CUDA 13.0+.

### The Problem

For CUDA 13.0+, the build uses these branches:
- **Line 490**: Regular MOE kernels
- **Line 671**: Grouped MM MOE kernels

Original v0.11.1rc3:
```cmake
# Line 490
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")

# Line 671
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
```

**BOTH lines are missing `12.0f` (SM120) support.**

### The Fix

Both lines need `12.0f` added:
```cmake
# Line 490
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")

# Line 671
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
```

### Error Symptoms

Without this fix:
```
ImportError: undefined symbol: _Z20cutlass_moe_mm_sm100RN2at6TensorERKS0_S3_S3_S3_S3_S3_S3_S3_S3_bb
```

The MOE kernels for SM100/SM120 aren't compiled, causing import failures.

### Why install.sh Works

The sed command on line 323:
```bash
sed -i 's/cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f"/cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f"/' CMakeLists.txt
```

This replaces **ALL** occurrences, fixing both lines 490 and 671 in one command.
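
After patching (by hand or via the installer's sed command), a quick check confirms that both CUDA 13.0+ branches now include `12.0f` and that no un-patched list remains. The snippet below is a minimal sketch to run from the vLLM source tree; the expected count of 2 corresponds to lines 490 and 671 in v0.11.1rc3.

```python
# Run from the vLLM source tree after applying the fix
from pathlib import Path

text = Path("CMakeLists.txt").read_text()

# The trailing quote matters: it distinguishes un-patched lists from patched ones
patched = text.count('SCALED_MM_ARCHS "10.0f;11.0f;12.0f"')
unpatched = text.count('SCALED_MM_ARCHS "10.0f;11.0f"')

print(f"patched arch lists:   {patched}")    # expect 2 (lines 490 and 671)
print(f"unpatched arch lists: {unpatched}")  # expect 0

assert patched >= 2 and unpatched == 0, "CMakeLists.txt fix not fully applied"
```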

### Verified Solution

Tested on an NVIDIA DGX Spark with Blackwell GB10, CUDA 13.0:
- [OK] Line 490 fixed: `"10.0f;11.0f;12.0f"`
- [OK] Line 671 fixed: `"10.0f;11.0f;12.0f"`
- [OK] vLLM imports successfully
- [OK] No cutlass_moe_mm_sm100 symbol errors
- [OK] Build time: ~19 minutes

## Fix 2: vLLM Commit Version

### Issue

The vLLM tag `v0.11.1rc3` lacks critical Triton/PyTorch Inductor fixes for Blackwell.

### Solution

Use commit `66a168a197ba214a5b70a74fa2e713c9eeb3251a` (6 commits ahead of v0.11.1rc3), which:
- Contains Triton JIT compilation fixes
- Includes PyTorch Inductor optimizations for Blackwell
- Adds proper backend registration handling

### Installation

```bash
cd vllm
git checkout 66a168a197ba214a5b70a74fa2e713c9eeb3251a
git submodule update --init --recursive
```

## Fix 3: Triton Version Pinning

### Issue

The latest Triton main branch (as of late October 2025) has intermittent JITFunction compilation issues with PyTorch Inductor on Blackwell.

### Solution

Pin to the tested working commit `4caa0328bf8df64896dd5f6fb9df41b0eb2e750a` (October 25, 2025), which:
- Is verified stable with Blackwell GB10
- Passes all compilation tests
- Produces no JITFunction.constexprs errors

### Installation

```bash
cd triton
git checkout 4caa0328bf8df64896dd5f6fb9df41b0eb2e750a
git submodule update --init --recursive
python -m pip install --no-build-isolation -v .
```

## Complete Verified Configuration

| Component | Version/Commit | Notes |
|-----------|----------------|-------|
| **vLLM** | `66a168a197ba214a5b70a74fa2e713c9eeb3251a` | 6 commits ahead of v0.11.1rc3 |
| **Triton** | `4caa0328bf8df64896dd5f6fb9df41b0eb2e750a` | October 25, 2025 |
| **PyTorch** | `2.9.0+cu130` | From vLLM requirements |
| **CUDA** | `13.0` (V13.0.88) | System CUDA |
| **Python** | `3.12.3` | |

## Testing

Verified working with:
```bash
python -c "from vllm import LLM, SamplingParams; \
  llm = LLM(model='Qwen/Qwen2.5-0.5B-Instruct', max_model_len=512); \
  print(llm.generate(['Hello'], SamplingParams(max_tokens=20)))"
```

**All tests pass**: import, compilation, CUDA graphs, and text generation all work correctly.

LICENSE (new file, 21 lines)
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2025 DGX Spark Community

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

README.md (new file, 312 lines)
@@ -0,0 +1,312 @@
# vLLM Setup for NVIDIA DGX Spark (Blackwell GB10)

**One-command installation** of vLLM for NVIDIA DGX Spark systems with GB10 GPUs (Blackwell architecture, sm_121).

This repository provides a DGX Spark-tested, ready-to-run setup script that handles all the complexities of building vLLM on the DGX Spark platform, including:
- CUDA 13.0 support with Blackwell-specific optimizations
- Critical fixes for SM100/SM120 MOE kernel compilation
- Triton 3.5.0 from the main branch (required for sm_121a support)
- PyTorch 2.9.0 with CUDA 13.0 bindings
- All necessary build fixes and workarounds

## Quick Start

**One-command installation** - installs to `./vllm-install` in your current directory:

```bash
curl -fsSL https://raw.githubusercontent.com/eelbaz/dgx-spark-vllm-setup/main/install.sh | bash
```

Or specify a custom directory:

```bash
curl -fsSL https://raw.githubusercontent.com/eelbaz/dgx-spark-vllm-setup/main/install.sh | bash -s -- --install-dir ~/my/custom/path
```

**Installation time:** ~20-30 minutes (mostly compilation)

### Alternative: Clone and Install

```bash
git clone https://github.com/eelbaz/dgx-spark-vllm-setup.git
cd dgx-spark-vllm-setup
./install.sh
```

### Installation Options

```bash
./install.sh [OPTIONS]

Options:
  --install-dir DIR       Installation directory (default: ./vllm-install)
  --vllm-version HASH     vLLM git commit (default: 66a168a19 - tested with Blackwell)
  --python-version VER    Python version (default: 3.12)
  --skip-tests            Skip post-installation tests
  --help                  Show help message
```

## System Requirements

- **Hardware:** NVIDIA DGX Spark with GB10 GPU (Blackwell sm_121)
- **OS:** Ubuntu 22.04+ (tested on Linux 6.11.0 ARM64)
- **CUDA:** 13.0 or later (driver 580.95.05+)
- **Disk Space:** ~50GB free
- **RAM:** 8GB+ recommended during the build

## What Gets Installed

Installed to `./vllm-install` (or your custom directory):

- **Python 3.12** virtual environment at `.vllm/`
- **PyTorch 2.9.0+cu130** with full CUDA 13.0 support
- **Triton 3.5.0+git** from the main branch (pre-release with Blackwell support)
- **vLLM 0.11.1rc3+** with all Blackwell-specific patches
- **Helper scripts** for managing the vLLM server
- **Environment activation** script (`vllm_env.sh`)

## Usage

All examples assume you're in the installation directory (default: `./vllm-install`).

### Activate Environment

```bash
cd vllm-install
source vllm_env.sh
```

### Start vLLM Server

```bash
./vllm-serve.sh                            # Default: Qwen2.5-0.5B on port 8000
./vllm-serve.sh "facebook/opt-125m" 8001   # Custom model and port
```

### Check Server Status

```bash
./vllm-status.sh
```

### Stop Server

```bash
./vllm-stop.sh
```

### Test API

```bash
# List models
curl http://localhost:8000/v1/models

# Generate completion
curl http://localhost:8000/v1/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "Qwen/Qwen2.5-0.5B-Instruct",
    "prompt": "Hello, how are you?",
    "max_tokens": 50
  }'
```

### Python API

```python
from vllm import LLM, SamplingParams

llm = LLM(
    model="Qwen/Qwen2.5-0.5B-Instruct",
    trust_remote_code=True,
    gpu_memory_utilization=0.9
)

prompts = ["Tell me about DGX Spark"]
sampling_params = SamplingParams(temperature=0.7, max_tokens=100)
outputs = llm.generate(prompts, sampling_params)

print(outputs[0].outputs[0].text)
```
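
Because the server exposes an OpenAI-compatible API, the official `openai` Python client can also be pointed at it. The snippet below is a small sketch, not one of the installed scripts: it assumes the server started by `./vllm-serve.sh` is listening on localhost:8000 with the default model, and that the `openai` package is available in the environment (`pip install openai` if it is not).

```python
from openai import OpenAI

# vLLM does not check the API key, but the client requires one to be set
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="Qwen/Qwen2.5-0.5B-Instruct",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What is the DGX Spark?"},
    ],
    max_tokens=100,
    temperature=0.7,
)

print(response.choices[0].message.content)
```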

## Critical Fixes Applied

This installer automatically applies the following critical fixes:

### 1. CMakeLists.txt SM100/SM120 MOE Kernel Fix

**Issue:** vLLM's MOE kernel architecture lists were incomplete for SM100/SM120 Blackwell builds
**Fix:** Added `12.0f` and `12.1a` to SCALED_MM_ARCHS in CMakeLists.txt

```cmake
# CUDA 13.0+ path (line ~671)
# Before
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
# After
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")

# Older CUDA path (line ~673)
# Before
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
# After
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;12.1a" "${CUDA_ARCHS}")
```

### 2. pyproject.toml License Field Format

**Issue:** Newer setuptools requires the structured license format
**Fix:** Convert the license string to dict format in both vLLM and flashinfer-python

```toml
# Before
license = "Apache-2.0"
license-files = ["LICENSE"]

# After
license = {text = "Apache-2.0"}
```

**Applied to:**
- vLLM's pyproject.toml
- flashinfer-python's pyproject.toml (patched during the build)

### 3. GPT-OSS Triton MOE Kernels for Qwen3/gpt-oss Support

**Issue:** vLLM's GPT-OSS MOE kernel implementation uses a deprecated Triton routing API
**Fix:** Update to the new Triton kernel API (topk and SparseMatrix)

**Changes:**
- Replace the deprecated `routing()` with `triton_topk()`
- Replace the deprecated `routing_from_bitmatrix()` with `SparseMatrix()`
- Add support for `GatherIndx`, `ScatterIndx`, and the new ragged tensor metadata

**Enables support for:**
- Qwen3 models with an MOE architecture
- gpt-oss models using Triton kernels
- The latest Triton kernel optimizations for Blackwell

### 4. Triton Main Branch Requirement

**Issue:** The official Triton 3.5.0 release has bugs with sm_121a
**Fix:** Build Triton from the main branch with the latest Blackwell fixes

## Architecture-Specific Configuration

The installer sets these critical environment variables:

```bash
TORCH_CUDA_ARCH_LIST=12.1a                        # Blackwell sm_121
VLLM_USE_FLASHINFER_MXFP4_MOE=1                   # Enable FlashInfer MOE optimization
TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas       # CUDA PTX assembler
TIKTOKEN_CACHE_DIR=$INSTALL_DIR/.tiktoken_cache   # Cache tiktoken encodings locally
```

## Cluster Mode Setup

To set up a multi-node vLLM cluster:

1. Run this installer on all nodes
2. Follow [CLUSTER.md](./CLUSTER.md) for configuration

## Troubleshooting

### Build Fails with "TypeError: can only concatenate str (not 'NoneType') to str"

This is a known Triton editable-mode build issue. The installer works around it by:
- Building Triton in non-editable mode
- Or copying a pre-built Triton from another node

### Symbol Error: cutlass_moe_mm_sm100

**Symptom:** `ImportError: undefined symbol: _Z20cutlass_moe_mm_sm100`
**Solution:** Ensure the CMakeLists.txt fix is applied (done automatically by the installer)

### PyTorch CUDA Capability Warning

**Symptom:** Warning about GPU capability 12.1 vs PyTorch max 12.0
**Status:** Harmless warning - PyTorch 2.9.0+cu130 works correctly with GB10

### ImportError: No module named 'vllm'

**Solution:**
```bash
source vllm-install/vllm_env.sh
python -c "import vllm; print(vllm.__version__)"
```

## File Structure

```
vllm-install/
├── .vllm/               # Python virtual environment
├── vllm/                # vLLM source (editable install)
├── triton/              # Triton source
├── vllm_env.sh          # Environment activation script
├── vllm-serve.sh        # Start server
├── vllm-stop.sh         # Stop server
├── vllm-status.sh       # Check status
└── vllm-server.log      # Server logs
```

## Manual Installation

If you prefer to understand each step:

```bash
# 1. Install the uv package manager
curl -LsSf https://astral.sh/uv/install.sh | sh
export PATH="$HOME/.local/bin:$PATH"

# 2. Create the installation directory and Python virtual environment
mkdir -p vllm-install && cd vllm-install
uv venv .vllm --python 3.12
source .vllm/bin/activate

# 3. Install PyTorch with CUDA 13.0
uv pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu130

# 4. Clone and build Triton from main
git clone https://github.com/triton-lang/triton.git
cd triton
uv pip install pip cmake ninja pybind11
TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas python -m pip install --no-build-isolation .

# 5. Install additional dependencies
uv pip install xgrammar setuptools-scm apache-tvm-ffi==0.1.0b15 --prerelease=allow

# 6. Clone vLLM
cd ..
git clone --recursive https://github.com/vllm-project/vllm.git
cd vllm
git checkout v0.11.1rc3

# 7. Apply fixes (see scripts/apply-fixes.sh)
# 8. Build vLLM (see install.sh for the full process)
```

## Version Information

- **vLLM:** 0.11.1rc4.dev6+g66a168a19.d20251026
- **PyTorch:** 2.9.0+cu130
- **Triton:** 3.5.0+git4caa0328
- **CUDA:** 13.0
- **Python:** 3.12.3
- **Target Architecture:** sm_121 (Blackwell GB10)

## Contributing

Issues and pull requests are welcome! This installer is maintained by the DGX Spark community.

## References

- [NVIDIA Forum Discussion](https://forums.developer.nvidia.com/t/run-vllm-in-spark/348862)
- [vLLM GitHub](https://github.com/vllm-project/vllm)
- [Triton GitHub](https://github.com/triton-lang/triton)

## License

MIT License - see [LICENSE](./LICENSE)

## Acknowledgments

Developed and tested on NVIDIA DGX Spark systems. Special thanks to the vLLM and Triton communities.

SUMMARY.md (new file, 246 lines)
@@ -0,0 +1,246 @@
# Repository Summary

## Overview

This repository provides a **production-ready, one-command installation** of vLLM for NVIDIA DGX Spark systems with Blackwell GB10 GPUs (sm_121 architecture).

## What's Included

### Core Files

1. **install.sh** (500+ lines)
   - Fully automated installation script
   - Pre-flight system checks
   - 8-step installation pipeline
   - Post-installation testing
   - Command-line argument support

2. **README.md** (300+ lines)
   - Quick start guide
   - System requirements
   - Usage examples
   - Critical fixes documentation
   - Troubleshooting guide

3. **CLUSTER.md** (~380 lines)
   - Multi-node setup instructions
   - Ray cluster configuration
   - Tensor/pipeline parallelism
   - Performance tuning
   - Load balancing examples

4. **requirements.txt**
   - Complete dependency list
   - PyTorch 2.9.0+cu130
   - All required packages

### Helper Scripts (scripts/)

- **vllm-serve.sh** - Start the vLLM server with a configurable model/port
- **vllm-stop.sh** - Gracefully stop the server
- **vllm-status.sh** - Check server status and logs

### Examples (examples/)

- **basic_inference.py** - Simple Python API usage
- **api_client.py** - OpenAI-compatible REST API client
- **README.md** - Usage instructions and API examples

### Configuration

- **.gitignore** - Excludes build artifacts, venvs, logs
- **LICENSE** - MIT license

## Technical Specifications

### Target Platform
- **Hardware:** NVIDIA DGX Spark with GB10 GPU
- **Architecture:** Blackwell sm_121 (compute capability 12.1)
- **OS:** Ubuntu 22.04+ ARM64
- **CUDA:** 13.0+ (driver 580.95.05+)

### Software Stack
- **Python:** 3.12.3
- **PyTorch:** 2.9.0+cu130
- **Triton:** 3.5.0+git (from the main branch)
- **vLLM:** 0.11.1rc4+
- **Package Manager:** uv (fast Python package installer)

### Critical Fixes Applied

1. **CMakeLists.txt (line 671)**
   - Added `12.0f` to SCALED_MM_ARCHS for SM100 MOE kernels
   - Enables Blackwell GPU compilation

2. **pyproject.toml**
   - Changed `license = "Apache-2.0"` to `license = {text = "Apache-2.0"}`
   - Removed the deprecated `license-files` field
   - Compatible with setuptools 77.0+

3. **Triton Build**
   - Must use the main branch (not release 3.5.0)
   - Non-editable install to avoid a setuptools bug
   - Custom PTXAS path for CUDA integration

### Environment Variables

```bash
TORCH_CUDA_ARCH_LIST=12.1a                   # Blackwell architecture
VLLM_USE_FLASHINFER_MXFP4_MOE=1              # Enable FlashInfer optimization
TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas  # CUDA PTX assembler
```

## Installation Overview

The `install.sh` script performs these steps:

1. **Pre-flight Checks**
   - Verify ARM64 architecture
   - Check for an NVIDIA GPU (GB10)
   - Validate CUDA 13.0+
   - Ensure 50GB+ disk space

2. **Install uv Package Manager**
   - Fast Python package installer
   - Required for efficient dependency resolution

3. **Create Virtual Environment**
   - Python 3.12 virtual environment
   - Isolated from system packages

4. **Install PyTorch**
   - PyTorch 2.9.0 with CUDA 13.0 bindings
   - Verify CUDA availability

5. **Build Triton**
   - Clone from the GitHub main branch
   - Build with Blackwell support
   - Non-editable install

6. **Install Dependencies**
   - xgrammar, setuptools-scm
   - apache-tvm-ffi (prerelease)
   - Build tools

7. **Clone and Fix vLLM**
   - Clone v0.11.1rc3
   - Apply the CMakeLists.txt fix
   - Apply the pyproject.toml fix
   - Configure use_existing_torch

8. **Build vLLM**
   - 15-20 minute compilation
   - All CUDA kernels for Blackwell
   - Editable install for development

9. **Create Helper Scripts**
   - Environment activation script
   - Server management scripts
   - Logging configuration

10. **Post-Installation Tests** (a minimal version is sketched below)
    - Import vLLM
    - Check CUDA availability
    - Verify GPU detection
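
The post-installation tests come down to a few import and device checks. A minimal standalone version is sketched below; it uses only PyTorch and vLLM APIs that the installer already depends on and mirrors what step 10 verifies.

```python
#!/usr/bin/env python3
"""Minimal post-install sanity check (mirrors step 10 above)."""
import torch
import vllm

# 1. Import check
print(f"vLLM version:    {vllm.__version__}")
print(f"PyTorch version: {torch.__version__}")

# 2. CUDA availability
assert torch.cuda.is_available(), "CUDA is not available to PyTorch"

# 3. GPU detection (expect the GB10 on a DGX Spark)
name = torch.cuda.get_device_name(0)
major, minor = torch.cuda.get_device_capability(0)
print(f"GPU: {name}, compute capability {major}.{minor}")
```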

## Quick Start

```bash
# One-command installation
curl -fsSL https://raw.githubusercontent.com/eelbaz/dgx-spark-vllm-setup/main/install.sh | bash

# Or clone and run
git clone https://github.com/eelbaz/dgx-spark-vllm-setup.git
cd dgx-spark-vllm-setup
./install.sh

# Activate environment (assuming installation in current directory)
cd vllm-install
source vllm_env.sh

# Start server
./vllm-serve.sh

# Test API
curl http://localhost:8000/v1/models
```

## Repository Structure

```
dgx-spark-vllm-setup/
├── README.md              # Main documentation
├── CLUSTER.md             # Multi-node setup guide
├── SUMMARY.md             # This file
├── LICENSE                # MIT license
├── .gitignore             # Git ignore rules
├── install.sh             # Main installation script
├── requirements.txt       # Python dependencies
├── scripts/
│   ├── vllm-serve.sh      # Start vLLM server
│   ├── vllm-stop.sh       # Stop server
│   └── vllm-status.sh     # Check status
└── examples/
    ├── README.md          # Examples documentation
    ├── basic_inference.py # Python API example
    └── api_client.py      # REST API example
```

## Known Issues & Workarounds

### Triton Editable Build Fails
**Error:** `TypeError: can only concatenate str (not 'NoneType') to str`
**Workaround:** Use non-editable install (`uv pip install --no-build-isolation .`)

### PyTorch CUDA Capability Warning
**Warning:** GPU capability 12.1 vs PyTorch max 12.0
**Status:** Harmless - PyTorch 2.9.0+cu130 works correctly with GB10

### apache-tvm-ffi Prerelease
**Error:** `No solution found when resolving dependencies`
**Fix:** Use `--prerelease=allow` flag with uv pip install

## Testing Status

- [OK] Single-node installation on spark-alpha.local
- [OK] Single-node installation on spark-omega.local
- [OK] vLLM server startup and API functionality
- [OK] Model inference (Qwen/Qwen2.5-0.5B-Instruct)
- [IN PROGRESS] Multi-node cluster mode (documented, not yet tested)

## Future Enhancements

- [ ] Add cluster mode testing results
- [ ] Include performance benchmarks
- [ ] Add Dockerfile for containerized deployment
- [ ] Create Ansible playbook for multi-node automation
- [ ] Add monitoring and logging setup (Prometheus/Grafana)
- [ ] Include model quantization examples (AWQ, GPTQ)

## Contributing

Contributions welcome! Please open issues or pull requests on GitHub.

## Community & Support

- **GitHub Issues:** Report bugs and feature requests
- **NVIDIA Forum:** [DGX Spark vLLM Discussion](https://forums.developer.nvidia.com/t/run-vllm-in-spark/348862)
- **vLLM Docs:** [Official Documentation](https://docs.vllm.ai/)

## License

MIT License - See LICENSE file for details.

## Acknowledgments

Developed and tested on NVIDIA DGX Spark systems. Special thanks to:
- vLLM project team
- Triton compiler team
- NVIDIA DGX Spark community
- Claude Code (AI assistant) for documentation automation

---

**Version:** 1.0.0
**Last Updated:** 2025-10-26
**Tested On:** DGX Spark with GB10, CUDA 13.0, Ubuntu 22.04 ARM64

examples/README.md (new file, 225 lines)
@@ -0,0 +1,225 @@
# vLLM Examples for DGX Spark

This directory contains example scripts demonstrating various ways to use vLLM on DGX Spark systems.

## Prerequisites

Ensure vLLM is installed and the environment is activated:

```bash
# Assuming vllm-install is in your home directory
source ~/vllm-install/vllm_env.sh
```

## Examples

### 1. Basic Inference (`basic_inference.py`)

Simple text generation using the vLLM Python API.

**Usage:**
```bash
python basic_inference.py
```

**What it demonstrates:**
- Loading a model with vLLM
- Configuring sampling parameters
- Generating multiple completions
- Batch processing

### 2. API Client (`api_client.py`)

Using vLLM's OpenAI-compatible REST API.

**Prerequisites:**
Start the vLLM server first:
```bash
cd ~/vllm-install
./vllm-serve.sh
```

**Usage:**
```bash
python api_client.py
```

**What it demonstrates:**
- Listing available models
- Simple text completion
- Chat completion
- Streaming responses
- HTTP API interaction

### 3. Batch Processing (`batch_processing.py`)

Efficient processing of large batches of prompts (a minimal sketch of such a script is shown below).

**Usage:**
```bash
python batch_processing.py
```

**What it demonstrates:**
- High-throughput batch inference
- Dynamic batching
- Memory-efficient processing
- Performance monitoring
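
Note that only `basic_inference.py` and `api_client.py` ship in this commit, so treat the following as a minimal sketch of what a `batch_processing.py` script could look like rather than the shipped file: it times one batched `llm.generate` call and reports throughput.

```python
#!/usr/bin/env python3
"""Sketch of a batch-processing example: measure throughput over many prompts."""
import time

from vllm import LLM, SamplingParams


def main():
    # Small model so the example runs quickly; swap in your preferred model
    llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct", gpu_memory_utilization=0.9)

    prompts = [f"Write one sentence about topic #{i}." for i in range(64)]
    params = SamplingParams(temperature=0.7, max_tokens=64)

    start = time.perf_counter()
    outputs = llm.generate(prompts, params)  # vLLM batches these internally
    elapsed = time.perf_counter() - start

    generated = sum(len(o.outputs[0].token_ids) for o in outputs)
    print(f"{len(prompts)} prompts in {elapsed:.1f}s "
          f"({generated / elapsed:.1f} generated tokens/s)")


if __name__ == "__main__":
    main()
```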

## Customization

### Change Model

Edit the model name in any example:

```python
llm = LLM(
    model="meta-llama/Llama-3.1-8B-Instruct",  # Change this
    trust_remote_code=True,
    gpu_memory_utilization=0.9
)
```

### Adjust Sampling Parameters

Modify `SamplingParams` for different generation behavior:

```python
sampling_params = SamplingParams(
    temperature=0.7,         # Lower = more deterministic (0.0-1.0)
    top_p=0.95,              # Nucleus sampling threshold
    max_tokens=100,          # Maximum tokens to generate
    top_k=50,                # Top-k sampling
    repetition_penalty=1.1   # Penalize repetition
)
```

### GPU Memory Management

Adjust memory utilization:

```python
llm = LLM(
    model="...",
    gpu_memory_utilization=0.9,  # Use 90% of GPU memory (0.0-1.0)
    max_model_len=2048           # Maximum sequence length
)
```

## API Server Examples

### cURL Examples

**List models:**
```bash
curl http://localhost:8000/v1/models
```

**Simple completion:**
```bash
curl http://localhost:8000/v1/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "Qwen/Qwen2.5-0.5B-Instruct",
    "prompt": "The meaning of life is",
    "max_tokens": 50,
    "temperature": 0.7
  }'
```

**Chat completion:**
```bash
curl http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "Qwen/Qwen2.5-0.5B-Instruct",
    "messages": [
      {"role": "system", "content": "You are a helpful assistant."},
      {"role": "user", "content": "What is DGX Spark?"}
    ],
    "max_tokens": 100,
    "temperature": 0.7
  }'
```

**Streaming completion:**
```bash
curl http://localhost:8000/v1/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "Qwen/Qwen2.5-0.5B-Instruct",
    "prompt": "Write a story about",
    "max_tokens": 100,
    "stream": true
  }'
```

## Tested Models

These models work well on DGX Spark GB10:

- `Qwen/Qwen2.5-0.5B-Instruct` (small, fast)
- `Qwen/Qwen2.5-7B-Instruct` (balanced)
- `meta-llama/Llama-3.1-8B-Instruct` (high quality)
- `meta-llama/Llama-3.1-70B-Instruct` (requires tensor parallelism)

## Performance Tips

1. **Use GPU memory efficiently:**
   - Set `gpu_memory_utilization=0.95` for maximum throughput
   - Lower it for models close to the GPU memory limit

2. **Batch processing:**
   - Process multiple prompts together
   - vLLM automatically optimizes batch sizes

3. **Quantization:**
   - For larger models, use quantization:
   ```python
   llm = LLM(model="...", quantization="awq")
   ```

4. **Tensor parallelism:**
   - For models > 20GB, use multiple GPUs:
   ```python
   llm = LLM(model="...", tensor_parallel_size=2)
   ```

## Troubleshooting

### Out of Memory

Reduce `max_model_len` or `gpu_memory_utilization`:

```python
llm = LLM(
    model="...",
    gpu_memory_utilization=0.8,
    max_model_len=2048
)
```

### Slow Generation

Check that the model is loaded correctly and that the GPU is busy:

```bash
python -c "import vllm; print(vllm.__version__)"
nvidia-smi   # Check GPU utilization
```

### Connection Refused (API)

Ensure the server is running:

```bash
cd ~/vllm-install
./vllm-status.sh
```

## More Resources

- [vLLM Documentation](https://docs.vllm.ai/)
- [OpenAI API Compatibility](https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html)
- [Main README](../README.md)
- [Cluster Setup](../CLUSTER.md)

examples/api_client.py (new file, 160 lines)
@@ -0,0 +1,160 @@
#!/usr/bin/env python3
"""
vLLM OpenAI-Compatible API Client Example
Demonstrates using vLLM's OpenAI-compatible API endpoints
"""

import requests
import json
from typing import Dict, List


class VLLMClient:
    """Simple client for the vLLM OpenAI-compatible API"""

    def __init__(self, base_url: str = "http://localhost:8000"):
        self.base_url = base_url.rstrip('/')

    def list_models(self) -> Dict:
        """List available models"""
        response = requests.get(f"{self.base_url}/v1/models")
        response.raise_for_status()
        return response.json()

    def complete(
        self,
        prompt: str,
        model: str = None,
        max_tokens: int = 100,
        temperature: float = 0.7,
        stream: bool = False
    ) -> Dict:
        """Generate a completion"""

        # Get the model name if not specified
        if model is None:
            models = self.list_models()
            model = models['data'][0]['id']

        payload = {
            "model": model,
            "prompt": prompt,
            "max_tokens": max_tokens,
            "temperature": temperature,
            "stream": stream
        }

        response = requests.post(
            f"{self.base_url}/v1/completions",
            json=payload,
            headers={"Content-Type": "application/json"},
            stream=stream
        )
        response.raise_for_status()

        if stream:
            return response.iter_lines()
        else:
            return response.json()

    def chat(
        self,
        messages: List[Dict[str, str]],
        model: str = None,
        max_tokens: int = 100,
        temperature: float = 0.7,
        stream: bool = False
    ) -> Dict:
        """Generate a chat completion"""

        # Get the model name if not specified
        if model is None:
            models = self.list_models()
            model = models['data'][0]['id']

        payload = {
            "model": model,
            "messages": messages,
            "max_tokens": max_tokens,
            "temperature": temperature,
            "stream": stream
        }

        response = requests.post(
            f"{self.base_url}/v1/chat/completions",
            json=payload,
            headers={"Content-Type": "application/json"},
            stream=stream
        )
        response.raise_for_status()

        if stream:
            return response.iter_lines()
        else:
            return response.json()


def main():
    # Initialize the client
    client = VLLMClient("http://localhost:8000")

    print("=" * 60)
    print("vLLM API Client Examples")
    print("=" * 60)

    # Example 1: List models
    print("\n1. Listing available models...")
    models = client.list_models()
    for model in models['data']:
        print(f"  - {model['id']}")

    # Example 2: Simple completion
    print("\n2. Simple completion...")
    result = client.complete(
        prompt="The capital of France is",
        max_tokens=10,
        temperature=0.0
    )
    print("  Prompt: The capital of France is")
    print(f"  Response: {result['choices'][0]['text']}")

    # Example 3: Chat completion
    print("\n3. Chat completion...")
    messages = [
        {"role": "system", "content": "You are a helpful AI assistant."},
        {"role": "user", "content": "What is the Blackwell GPU architecture?"}
    ]
    result = client.chat(
        messages=messages,
        max_tokens=100,
        temperature=0.7
    )
    print(f"  User: {messages[1]['content']}")
    print(f"  Assistant: {result['choices'][0]['message']['content']}")

    # Example 4: Streaming completion
    print("\n4. Streaming completion...")
    print("  Prompt: Write a short poem about AI")
    print("  Response: ", end="", flush=True)

    stream = client.complete(
        prompt="Write a short poem about AI",
        max_tokens=50,
        temperature=0.8,
        stream=True
    )

    for line in stream:
        if line:
            try:
                data = json.loads(line.decode('utf-8').removeprefix('data: '))
                if 'choices' in data and len(data['choices']) > 0:
                    token = data['choices'][0].get('text', '')
                    print(token, end="", flush=True)
            except (json.JSONDecodeError, AttributeError):
                pass

    print("\n")
    print("=" * 60)


if __name__ == "__main__":
    main()

examples/basic_inference.py (new file, 48 lines)
@@ -0,0 +1,48 @@
#!/usr/bin/env python3
"""
Basic vLLM Inference Example for DGX Spark
Demonstrates simple text generation using the vLLM Python API
"""

from vllm import LLM, SamplingParams


def main():
    # Initialize the model
    # Use a smaller model for testing; replace with your preferred model
    print("Loading model...")
    llm = LLM(
        model="Qwen/Qwen2.5-0.5B-Instruct",
        trust_remote_code=True,
        gpu_memory_utilization=0.9,
        max_model_len=2048
    )

    # Define prompts
    prompts = [
        "What is the NVIDIA DGX Spark?",
        "Explain the Blackwell GPU architecture in simple terms.",
        "Write a haiku about artificial intelligence."
    ]

    # Configure sampling parameters
    sampling_params = SamplingParams(
        temperature=0.7,
        top_p=0.95,
        max_tokens=100,
        stop=["</s>", "\n\n\n"]
    )

    # Generate responses
    print("\nGenerating responses...\n")
    outputs = llm.generate(prompts, sampling_params)

    # Print results
    for i, output in enumerate(outputs):
        print(f"{'='*60}")
        print(f"Prompt {i+1}: {prompts[i]}")
        print(f"{'-'*60}")
        print(f"Response: {output.outputs[0].text}")
        print(f"{'='*60}\n")


if __name__ == "__main__":
    main()

install.sh (new file, 777 lines)
@@ -0,0 +1,777 @@
|
||||
#!/bin/bash
|
||||
################################################################################
|
||||
# vLLM Installation Script for NVIDIA DGX Spark (Blackwell GB10)
|
||||
# Version: 1.1.0
|
||||
# Author: DGX Spark Community
|
||||
# License: MIT
|
||||
#
|
||||
# This script automates the complete installation of vLLM on DGX Spark systems
|
||||
# with Blackwell GB10 GPUs, including all necessary fixes and optimizations.
|
||||
#
|
||||
# Usage: ./install.sh [OPTIONS]
|
||||
# Can also be run via: curl -fsSL <url>/install.sh | bash
|
||||
#
|
||||
# Options:
|
||||
# --install-dir DIR Installation directory (default: $PWD/vllm-install)
|
||||
# --vllm-version HASH vLLM git commit (default: 66a168a19 - tested with Blackwell)
|
||||
# --python-version VER Python version (default: 3.12)
|
||||
# --skip-tests Skip post-installation tests
|
||||
# --help Show this help message
|
||||
################################################################################
|
||||
|
||||
set -e # Exit on error
|
||||
set -o pipefail # Catch errors in pipes
|
||||
|
||||
# Colors for output
|
||||
RED='\033[0;31m'
|
||||
GREEN='\033[0;32m'
|
||||
YELLOW='\033[1;33m'
|
||||
BLUE='\033[0;34m'
|
||||
NC='\033[0m' # No Color
|
||||
|
||||
# Default configuration
|
||||
INSTALL_DIR="$PWD/vllm-install"
|
||||
VLLM_VERSION="66a168a197ba214a5b70a74fa2e713c9eeb3251a" # vLLM commit with Blackwell fixes
|
||||
TRITON_VERSION="4caa0328bf8df64896dd5f6fb9df41b0eb2e750a" # Triton commit that works with Blackwell
|
||||
PYTHON_VERSION="3.12"
|
||||
SKIP_TESTS=false
|
||||
|
||||
# GitHub raw URL for downloading repo assets when run outside the repo
|
||||
REPO_RAW_URL="https://raw.githubusercontent.com/eelbaz/dgx-spark-vllm-setup/main"
|
||||
|
||||
# Script directory (only meaningful when run from a local clone)
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" 2>/dev/null && pwd || echo "")"
|
||||
|
||||
################################################################################
|
||||
# Helper Functions
|
||||
################################################################################
|
||||
|
||||
log_info() {
|
||||
echo -e "${BLUE}[INFO]${NC} $1"
|
||||
}
|
||||
|
||||
log_success() {
|
||||
echo -e "${GREEN}[SUCCESS]${NC} $1"
|
||||
}
|
||||
|
||||
log_warning() {
|
||||
echo -e "${YELLOW}[WARNING]${NC} $1"
|
||||
}
|
||||
|
||||
log_error() {
|
||||
echo -e "${RED}[ERROR]${NC} $1"
|
||||
}
|
||||
|
||||
print_header() {
|
||||
echo ""
|
||||
echo -e "${BLUE}========================================${NC}"
|
||||
echo -e "${BLUE}$1${NC}"
|
||||
echo -e "${BLUE}========================================${NC}"
|
||||
echo ""
|
||||
}
|
||||
|
||||
check_command() {
|
||||
if command -v "$1" &> /dev/null; then
|
||||
return 0
|
||||
else
|
||||
return 1
|
||||
fi
|
||||
}
|
||||
|
||||
# Auto-confirm when stdin is not a terminal (e.g. curl | bash)
|
||||
confirm_or_default_yes() {
|
||||
local prompt="$1"
|
||||
if [ -t 0 ]; then
|
||||
read -p "$prompt (y/N) " -n 1 -r
|
||||
echo
|
||||
[[ $REPLY =~ ^[Yy]$ ]]
|
||||
else
|
||||
log_info "Non-interactive mode: auto-confirming"
|
||||
return 0
|
||||
fi
|
||||
}
|
||||
|
||||
################################################################################
|
||||
# Pre-flight Checks
|
||||
################################################################################
|
||||
|
||||
preflight_checks() {
|
||||
print_header "Pre-flight System Checks"
|
||||
|
||||
log_info "Checking system requirements..."
|
||||
|
||||
# Check if running on ARM64
|
||||
ARCH=$(uname -m)
|
||||
if [[ "$ARCH" != "aarch64" ]] && [[ "$ARCH" != "arm64" ]]; then
|
||||
log_warning "This script is designed for ARM64 architecture (DGX Spark)"
|
||||
log_warning "Detected architecture: $ARCH"
|
||||
fi
|
||||
|
||||
# Check for NVIDIA GPU
|
||||
if ! check_command nvidia-smi; then
|
||||
log_error "nvidia-smi not found. NVIDIA drivers required."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Check GPU type
|
||||
GPU_NAME=$(nvidia-smi --query-gpu=name --format=csv,noheader | head -1)
|
||||
log_info "Detected GPU: $GPU_NAME"
|
||||
|
||||
if [[ ! "$GPU_NAME" =~ "GB10" ]]; then
|
||||
log_warning "This script is optimized for NVIDIA GB10 (Blackwell)"
|
||||
log_warning "Your GPU: $GPU_NAME"
|
||||
if ! confirm_or_default_yes "Continue anyway?"; then
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
|
||||
# Check CUDA
|
||||
if ! check_command nvcc; then
|
||||
# Check common CUDA install locations
|
||||
if [ -x "/usr/local/cuda/bin/nvcc" ]; then
|
||||
export PATH="/usr/local/cuda/bin:$PATH"
|
||||
log_info "Found CUDA at /usr/local/cuda, added to PATH"
|
||||
else
|
||||
log_error "CUDA toolkit not found. Please install CUDA 13.0+"
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
|
||||
CUDA_VERSION=$(nvcc --version | grep "release" | awk '{print $6}' | cut -d',' -f1)
|
||||
log_info "CUDA version: $CUDA_VERSION"
|
||||
|
||||
# Check for Python development headers (required for Triton build)
|
||||
PYTHON_INCLUDE="/usr/include/python${PYTHON_VERSION}/patchlevel.h"
|
||||
if [ ! -f "$PYTHON_INCLUDE" ]; then
|
||||
log_warning "Python ${PYTHON_VERSION} development headers not found"
|
||||
log_info "Installing python${PYTHON_VERSION}-dev (requires sudo)..."
|
||||
if sudo apt-get install -y "python${PYTHON_VERSION}-dev"; then
|
||||
log_success "python${PYTHON_VERSION}-dev installed"
|
||||
else
|
||||
log_error "Failed to install python${PYTHON_VERSION}-dev"
|
||||
log_error "Please install manually: sudo apt install python${PYTHON_VERSION}-dev"
|
||||
exit 1
|
||||
fi
|
||||
else
|
||||
log_info "Python ${PYTHON_VERSION} development headers found"
|
||||
fi
|
||||
|
||||
# Check disk space (need ~50GB)
|
||||
AVAILABLE_SPACE=$(df -BG "$HOME" | tail -1 | awk '{print $4}' | sed 's/G//')
|
||||
if [[ "$AVAILABLE_SPACE" -lt 50 ]]; then
|
||||
log_error "Insufficient disk space. Need at least 50GB, have ${AVAILABLE_SPACE}GB"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
log_success "Pre-flight checks passed!"
|
||||
}
|
||||
|
||||
################################################################################
|
||||
# Install uv Package Manager
|
||||
################################################################################
|
||||
|
||||
install_uv() {
|
||||
print_header "Step 1/8: Installing uv Package Manager"
|
||||
|
||||
if check_command uv; then
|
||||
UV_VERSION=$(uv --version | awk '{print $2}')
|
||||
log_info "uv already installed: v$UV_VERSION"
|
||||
else
|
||||
log_info "Installing uv..."
|
||||
curl -LsSf https://astral.sh/uv/install.sh | sh
|
||||
export PATH="$HOME/.local/bin:$PATH"
|
||||
log_success "uv installed successfully"
|
||||
fi
|
||||
|
||||
# Verify installation
|
||||
if ! check_command uv; then
|
||||
log_error "uv installation failed"
|
||||
exit 1
|
||||
fi
|
||||
}

################################################################################
# Create Python Virtual Environment
################################################################################

create_venv() {
    print_header "Step 2/8: Creating Python Virtual Environment"

    VENV_DIR="$INSTALL_DIR/.vllm"

    if [ -d "$VENV_DIR" ]; then
        log_warning "Virtual environment already exists at $VENV_DIR"
        if confirm_or_default_yes "Remove and recreate?"; then
            rm -rf "$VENV_DIR"
        else
            log_info "Using existing virtual environment"
            return
        fi
    fi

    log_info "Creating Python $PYTHON_VERSION virtual environment..."
    mkdir -p "$INSTALL_DIR"
    cd "$INSTALL_DIR"
    uv venv .vllm --python "$PYTHON_VERSION"

    # Upgrade setuptools to 77+ so PEP 639 license fields are supported
    # (fixes flashinfer-python build failure)
    log_info "Upgrading setuptools in venv for PEP 639 license support..."
    uv pip install --python "$VENV_DIR/bin/python" --upgrade setuptools

    log_success "Virtual environment created at $VENV_DIR"
}

################################################################################
# Install PyTorch
################################################################################

install_pytorch() {
    print_header "Step 3/8: Installing PyTorch with CUDA 13.0"

    source "$INSTALL_DIR/.vllm/bin/activate"

    log_info "Installing latest PyTorch for cu130..."
    uv pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu130

    # Verify PyTorch installation
    log_info "Verifying PyTorch installation..."
    python -c "import torch; print('PyTorch version:', torch.__version__); print('CUDA available:', torch.cuda.is_available())"

    log_success "PyTorch installed successfully"
}
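
# Optional manual check (not run by the installer): confirm the installed wheel
# really is a cu130 build, e.g.
#   python -c "import torch; print(torch.version.cuda)"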

################################################################################
# Clone and Build Triton
################################################################################

install_triton() {
    print_header "Step 4/8: Building Triton from Source (pinned commit)"

    TRITON_DIR="$INSTALL_DIR/triton"

    if [ -d "$TRITON_DIR" ]; then
        log_info "Triton directory exists, updating..."
        cd "$TRITON_DIR"
        git fetch
    else
        log_info "Cloning Triton repository..."
        cd "$INSTALL_DIR"
        git clone https://github.com/triton-lang/triton.git
        cd triton
    fi

    log_info "Checking out Triton commit $TRITON_VERSION (tested with Blackwell)..."
    git checkout "$TRITON_VERSION"
    git submodule update --init --recursive

    log_info "Installing Triton build dependencies..."
    source "$INSTALL_DIR/.vllm/bin/activate"
    uv pip install pip cmake ninja pybind11

    log_info "Building Triton (this takes ~5 minutes)..."
    export TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas
    export CMAKE_BUILD_PARALLEL_LEVEL=$(nproc)
    python -m pip install --no-build-isolation -v . 2>&1 | tee "$INSTALL_DIR/triton-build.log"

    if [ ${PIPESTATUS[0]} -ne 0 ]; then
        log_error "Triton build failed. See $INSTALL_DIR/triton-build.log for details"
        exit 1
    fi

    # Record the installed triton version so we can protect it later
    TRITON_INSTALLED_VERSION=$(python -c "import triton; print(triton.__version__)" 2>/dev/null || echo "unknown")
    log_info "Triton version installed: $TRITON_INSTALLED_VERSION"

    log_success "Triton installed successfully"
}

################################################################################
# Install Additional Dependencies
################################################################################

install_dependencies() {
    print_header "Step 5/8: Installing Additional Dependencies"

    source "$INSTALL_DIR/.vllm/bin/activate"

    log_info "Installing xgrammar, setuptools-scm, and apache-tvm-ffi..."
    uv pip install xgrammar setuptools-scm apache-tvm-ffi==0.1.0b15 --prerelease=allow

    log_success "Dependencies installed successfully"
}

################################################################################
# Clone vLLM
################################################################################

clone_vllm() {
    print_header "Step 6/8: Cloning vLLM Repository"

    VLLM_DIR="$INSTALL_DIR/vllm"

    if [ -d "$VLLM_DIR" ]; then
        log_warning "vLLM directory already exists at $VLLM_DIR"
        if confirm_or_default_yes "Remove and re-clone?"; then
            rm -rf "$VLLM_DIR"
        else
            log_info "Using existing vLLM directory"
            cd "$VLLM_DIR"
            return
        fi
    fi

    log_info "Cloning vLLM $VLLM_VERSION..."
    cd "$INSTALL_DIR"
    git clone --recursive https://github.com/vllm-project/vllm.git
    cd vllm
    git checkout "$VLLM_VERSION"
    git submodule update --init --recursive

    log_success "vLLM repository cloned"
}

################################################################################
# Apply Critical Fixes
################################################################################

apply_fixes() {
    print_header "Step 7/8: Applying Critical Fixes"

    cd "$INSTALL_DIR/vllm"

    # Fix 1: pyproject.toml license field
    log_info "Fixing pyproject.toml license field..."
    sed -i 's/^license = "Apache-2.0"$/license = {text = "Apache-2.0"}/' pyproject.toml
    sed -i '/^license-files = /d' pyproject.toml

    # Fix 2: CMakeLists.txt SM100/SM120 MOE kernels (check if already applied)
    if grep -q 'cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f"' CMakeLists.txt; then
        log_info "CMakeLists.txt SM100/SM120 fix already applied"
    else
        log_info "Applying CMakeLists.txt SM100/SM120 fix..."
        sed -i 's/cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f"/cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f"/' CMakeLists.txt
        sed -i 's/cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a"/cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;12.1a"/' CMakeLists.txt
    fi

    # Fix 3: flashinfer-python license field (pre-emptive fix)
    log_info "Pre-fixing flashinfer-python license issue..."
    rm -rf "$HOME/.cache/uv/sdists-v9/pypi/flashinfer-python" 2>/dev/null || true

    # Fix 4: GPT-OSS Triton MOE kernels for Qwen3/gpt-oss support
    # Try local repo patches/ first, then download from GitHub
    PATCH_FILE=""
    if [ -f "$SCRIPT_DIR/patches/gpt_oss_triton_moe.patch" ]; then
        PATCH_FILE="$SCRIPT_DIR/patches/gpt_oss_triton_moe.patch"
    else
        log_info "Downloading GPT-OSS Triton MOE patch from repository..."
        PATCH_FILE="$INSTALL_DIR/gpt_oss_triton_moe.patch"
        if curl -fsSL "$REPO_RAW_URL/patches/gpt_oss_triton_moe.patch" -o "$PATCH_FILE" 2>/dev/null; then
            log_info "Patch downloaded successfully"
        else
            PATCH_FILE=""
            log_warning "Could not download GPT-OSS Triton MOE patch (skipping)"
        fi
    fi

    if [ -n "$PATCH_FILE" ] && [ -f "$PATCH_FILE" ]; then
        log_info "Applying GPT-OSS Triton MOE kernel patch for Qwen3/gpt-oss support..."
        if patch --dry-run -p1 < "$PATCH_FILE" > /dev/null 2>&1; then
            patch -p1 < "$PATCH_FILE"
            log_success "GPT-OSS Triton MOE kernel patch applied"
        else
            log_warning "GPT-OSS Triton MOE kernel patch already applied or conflicts"
        fi
    fi

    # Configure use_existing_torch
    log_info "Configuring vLLM to use existing PyTorch..."
    python3 use_existing_torch.py

    log_success "All fixes applied successfully"
}
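
# Optional manual spot-check: confirm the CMakeLists.txt arch lists now include
# the 12.x entries added above, e.g.
#   grep -n 'SCALED_MM_ARCHS' CMakeLists.txt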

################################################################################
# Build and Install vLLM
################################################################################

build_vllm() {
    print_header "Step 8/8: Building vLLM (15-20 minutes)"

    cd "$INSTALL_DIR/vllm"
    source "$INSTALL_DIR/.vllm/bin/activate"

    # Set build environment variables for the Blackwell GB10 target (compute capability 12.1)
    export TORCH_CUDA_ARCH_LIST=12.1a
    export VLLM_USE_FLASHINFER_MXFP4_MOE=1
    export TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas

    # Create a constraints file to prevent uv from replacing our
    # custom-built Triton with a PyPI version
    log_info "Creating constraints to protect pinned Triton build..."
    TRITON_CONSTRAINT="$INSTALL_DIR/constraints.txt"
    TRITON_INSTALLED=$(python -c "import importlib.metadata; print(importlib.metadata.version('triton'))" 2>/dev/null || echo "")
    if [ -n "$TRITON_INSTALLED" ]; then
        echo "triton==${TRITON_INSTALLED}" > "$TRITON_CONSTRAINT"
        log_info "Pinning triton==${TRITON_INSTALLED} during vLLM build"
    else
        echo "" > "$TRITON_CONSTRAINT"
        log_warning "Could not detect installed Triton version"
    fi

    log_info "Starting vLLM build..."
    log_warning "This will take 15-20 minutes. Go grab a coffee!"

    set +e # Don't exit on error, we'll handle it
    UV_CONSTRAINT="$TRITON_CONSTRAINT" uv pip install \
        --no-build-isolation --prerelease=allow -e . \
        2>&1 | tee "$INSTALL_DIR/vllm-build.log"
    BUILD_STATUS=${PIPESTATUS[0]}
    set -e

    if [ $BUILD_STATUS -ne 0 ]; then
        if grep -q "flashinfer.*license.*must be valid" "$INSTALL_DIR/vllm-build.log"; then
            log_warning "Build failed due to flashinfer-python license issue"
            log_info "Upgrading setuptools and retrying..."

            # Ensure setuptools is new enough
            uv pip install --upgrade setuptools

            # Also patch the cached flashinfer pyproject.toml as a belt-and-suspenders fix
            find "$HOME/.cache/uv/sdists-v9/pypi/flashinfer-python" -name "pyproject.toml" 2>/dev/null | while read f; do
                sed -i 's/^license = "Apache-2.0"$/license = {text = "Apache-2.0"}/' "$f"
                sed -i '/^license-files = /d' "$f"
            done

            log_info "Retrying vLLM build..."
            UV_CONSTRAINT="$TRITON_CONSTRAINT" uv pip install \
                --no-build-isolation --prerelease=allow -e .
        else
            log_error "vLLM build failed. See $INSTALL_DIR/vllm-build.log for details"
            exit 1
        fi
    fi

    # Verify Triton wasn't replaced
    TRITON_AFTER=$(python -c "import importlib.metadata; print(importlib.metadata.version('triton'))" 2>/dev/null || echo "unknown")
    if [ -n "$TRITON_INSTALLED" ] && [ "$TRITON_AFTER" != "$TRITON_INSTALLED" ]; then
        log_warning "Triton was changed during vLLM install: $TRITON_INSTALLED -> $TRITON_AFTER"
        log_warning "Rebuilding pinned Triton from source..."
        cd "$INSTALL_DIR/triton"
        git checkout "$TRITON_VERSION"
        export CMAKE_BUILD_PARALLEL_LEVEL=$(nproc)
        python -m pip install --no-build-isolation --force-reinstall -v .
        cd "$INSTALL_DIR/vllm"
    fi

    log_success "vLLM built successfully!"
}

################################################################################
# Create Helper Scripts
################################################################################

create_helper_scripts() {
    print_header "Creating Helper Scripts"

    # Create environment activation script
    log_info "Creating vllm_env.sh..."
    cat > "$INSTALL_DIR/vllm_env.sh" << 'ENVEOF'
#!/bin/bash
# vLLM Environment Configuration for DGX Spark
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "$SCRIPT_DIR/.vllm/bin/activate"
export TORCH_CUDA_ARCH_LIST=12.1a
export VLLM_USE_FLASHINFER_MXFP4_MOE=1
CUDA_PATH=$(ls -d /usr/local/cuda* 2>/dev/null | head -1)
export TRITON_PTXAS_PATH="$CUDA_PATH/bin/ptxas"
export PATH="$CUDA_PATH/bin:$PATH"
export LD_LIBRARY_PATH="$CUDA_PATH/lib64:$LD_LIBRARY_PATH"
# Cache tiktoken encodings to avoid re-downloading
export TIKTOKEN_CACHE_DIR="$SCRIPT_DIR/.tiktoken_cache"
mkdir -p "$TIKTOKEN_CACHE_DIR"
echo "=== vLLM Environment Active ==="
echo "Virtual env: $VIRTUAL_ENV"
echo "CUDA arch: $TORCH_CUDA_ARCH_LIST"
echo "Python: $(which python)"
echo "==============================="
ENVEOF
    chmod +x "$INSTALL_DIR/vllm_env.sh"

    # Create vllm-serve.sh (embedded so it works with curl|bash)
    log_info "Creating vllm-serve.sh..."
    cat > "$INSTALL_DIR/vllm-serve.sh" << 'SERVEEOF'
#!/bin/bash
# vLLM Server Startup Script for DGX Spark
# Usage: ./vllm-serve.sh <model_name> [port]

set -e

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

MODEL="${1:-Qwen/Qwen2.5-0.5B-Instruct}"
PORT="${2:-8000}"
VLLM_DIR="$SCRIPT_DIR/vllm"
ENV_SCRIPT="$SCRIPT_DIR/vllm_env.sh"
PID_FILE="$SCRIPT_DIR/.vllm-server.pid"
LOG_FILE="$SCRIPT_DIR/vllm-server.log"

# Check if server is already running
if [ -f "$PID_FILE" ]; then
    PID=$(cat "$PID_FILE")
    if ps -p $PID > /dev/null 2>&1; then
        echo "ERROR: vLLM server is already running (PID: $PID)"
        echo "Use ./vllm-stop.sh to stop it first"
        exit 1
    fi
fi

# Source environment
source "$ENV_SCRIPT"

echo "----------------------------------------------------------------------"
echo "Starting vLLM Server on DGX Spark"
echo "----------------------------------------------------------------------"
echo "Model: $MODEL"
echo "Port: $PORT"
echo "Log file: $LOG_FILE"
echo "PID file: $PID_FILE"
echo "----------------------------------------------------------------------"

# Start server in background
cd "$VLLM_DIR"
nohup python -m vllm.entrypoints.openai.api_server \
    --model "$MODEL" \
    --trust-remote-code \
    --host 0.0.0.0 \
    --port "$PORT" \
    --gpu-memory-utilization 0.9 \
    > "$LOG_FILE" 2>&1 &

echo $! > "$PID_FILE"
echo "OK: Server started with PID: $(cat $PID_FILE)"
echo "OK: Waiting for server to be ready..."

sleep 5
if ps -p $(cat "$PID_FILE") > /dev/null 2>&1; then
    echo "OK: Server is running!"
    echo ""
    echo "Test with: curl http://localhost:$PORT/v1/models"
    echo "View logs: tail -f $LOG_FILE"
    echo "Stop server: ./vllm-stop.sh"
else
    echo "ERROR: Server failed to start. Check logs: $LOG_FILE"
    rm -f "$PID_FILE"
    exit 1
fi
SERVEEOF
    chmod +x "$INSTALL_DIR/vllm-serve.sh"

    # Create vllm-stop.sh
    log_info "Creating vllm-stop.sh..."
    cat > "$INSTALL_DIR/vllm-stop.sh" << 'STOPEOF'
#!/bin/bash
# vLLM Server Stop Script for DGX Spark

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PID_FILE="$SCRIPT_DIR/.vllm-server.pid"

if [ ! -f "$PID_FILE" ]; then
    echo "No vLLM server PID file found. Server may not be running."
    exit 0
fi

PID=$(cat "$PID_FILE")

if ! ps -p $PID > /dev/null 2>&1; then
    echo "vLLM server (PID: $PID) is not running. Cleaning up PID file."
    rm -f "$PID_FILE"
    exit 0
fi

echo "Stopping vLLM server (PID: $PID)..."
kill $PID

for i in {1..10}; do
    if ! ps -p $PID > /dev/null 2>&1; then
        echo "OK: Server stopped successfully"
        rm -f "$PID_FILE"
        exit 0
    fi
    sleep 1
done

if ps -p $PID > /dev/null 2>&1; then
    echo "Server did not stop gracefully. Force killing..."
    kill -9 $PID
    sleep 1
    if ! ps -p $PID > /dev/null 2>&1; then
        echo "OK: Server force stopped"
        rm -f "$PID_FILE"
    else
        echo "ERROR: Failed to stop server"
        exit 1
    fi
fi
STOPEOF
    chmod +x "$INSTALL_DIR/vllm-stop.sh"

    # Create vllm-status.sh
    log_info "Creating vllm-status.sh..."
    cat > "$INSTALL_DIR/vllm-status.sh" << 'STATUSEOF'
#!/bin/bash
# vLLM Server Status Script for DGX Spark

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PID_FILE="$SCRIPT_DIR/.vllm-server.pid"
LOG_FILE="$SCRIPT_DIR/vllm-server.log"

echo "----------------------------------------------------------------------"
echo "vLLM Server Status on DGX Spark"
echo "----------------------------------------------------------------------"

if [ ! -f "$PID_FILE" ]; then
    echo "Status: NOT RUNNING (no PID file found)"
    exit 0
fi

PID=$(cat "$PID_FILE")

if ! ps -p $PID > /dev/null 2>&1; then
    echo "Status: NOT RUNNING (stale PID file)"
    echo "Cleaning up PID file..."
    rm -f "$PID_FILE"
    exit 0
fi

echo "Status: RUNNING"
echo "PID: $PID"
echo "Started: $(ps -p $PID -o lstart= 2>/dev/null || echo 'Unknown')"
echo "CPU: $(ps -p $PID -o %cpu= 2>/dev/null || echo 'N/A')%"
echo "Memory: $(ps -p $PID -o %mem= 2>/dev/null || echo 'N/A')%"
echo ""

if [ -f "$LOG_FILE" ]; then
    echo "Recent log entries (last 10 lines):"
    echo "----------------------------------------------------------------------"
    tail -n 10 "$LOG_FILE"
else
    echo "Log file not found: $LOG_FILE"
fi

echo ""
echo "----------------------------------------------------------------------"
STATUSEOF
    chmod +x "$INSTALL_DIR/vllm-status.sh"

    log_success "Helper scripts created in $INSTALL_DIR"
}
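
# Example request once vllm-serve.sh is running (model name shown is the script's
# default; use whatever model you actually launched):
#   curl http://localhost:8000/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{"model": "Qwen/Qwen2.5-0.5B-Instruct", "messages": [{"role": "user", "content": "Hello"}]}'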

################################################################################
# Post-Installation Tests
################################################################################

run_tests() {
    if [ "$SKIP_TESTS" = true ]; then
        log_info "Skipping post-installation tests"
        return
    fi

    print_header "Post-Installation Tests"

    source "$INSTALL_DIR/vllm_env.sh"

    log_info "Test 1: Import vLLM..."
    python -c "import vllm; print('vLLM version:', vllm.__version__)"

    log_info "Test 2: Check CUDA availability..."
    python -c "import torch; assert torch.cuda.is_available(), 'CUDA not available'; print('CUDA available')"

    log_info "Test 3: Check GPU detection..."
    python -c "import torch; print('GPU count:', torch.cuda.device_count()); print('GPU name:', torch.cuda.get_device_name(0))"

    log_success "All tests passed!"
}
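
# Optional manual smoke test (not run by the installer): a tiny offline generation
# exercises the full engine rather than just the import:
#   python - <<'PY'
#   from vllm import LLM, SamplingParams
#   llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct")
#   out = llm.generate(["Hello from DGX Spark"], SamplingParams(max_tokens=16))
#   print(out[0].outputs[0].text)
#   PY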

################################################################################
# Parse Command Line Arguments
################################################################################

parse_args() {
    while [[ $# -gt 0 ]]; do
        case $1 in
            --install-dir)
                INSTALL_DIR="$2"
                shift 2
                ;;
            --vllm-version)
                VLLM_VERSION="$2"
                shift 2
                ;;
            --python-version)
                PYTHON_VERSION="$2"
                shift 2
                ;;
            --skip-tests)
                SKIP_TESTS=true
                shift
                ;;
            --help)
                head -20 "$0" | grep "^#" | sed 's/^# //'
                exit 0
                ;;
            *)
                log_error "Unknown option: $1"
                log_info "Use --help for usage information"
                exit 1
                ;;
        esac
    done
}

################################################################################
# Main Installation Flow
################################################################################

main() {
    parse_args "$@"

    print_header "vLLM Installation for DGX Spark (Blackwell GB10)"
    log_info "Installation directory: $INSTALL_DIR"
    log_info "vLLM version: $VLLM_VERSION"
    log_info "Python version: $PYTHON_VERSION"
    echo ""

    preflight_checks
    install_uv
    create_venv
    install_pytorch
    install_triton
    install_dependencies
    clone_vllm
    apply_fixes
    build_vllm
    create_helper_scripts
    run_tests

    print_header "Installation Complete!"
    echo ""
    log_success "vLLM has been successfully installed!"
    echo ""
    echo -e "${GREEN}Next steps:${NC}"
    echo "1. Activate the environment:"
    echo "   ${BLUE}source $INSTALL_DIR/vllm_env.sh${NC}"
    echo ""
    echo "2. Start vLLM server:"
    echo "   ${BLUE}cd $INSTALL_DIR${NC}"
    echo "   ${BLUE}./vllm-serve.sh${NC}"
    echo ""
    echo "3. Test the API:"
    echo "   ${BLUE}curl http://localhost:8000/v1/models${NC}"
    echo ""
    echo "For more information, see README.md"
    echo ""
}

# Run main function
main "$@"
77
patches/gpt_oss_triton_moe.patch
Normal file
@@ -0,0 +1,77 @@
diff --git a/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py b/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py
index badedfc54..e05c0eea4 100644
--- a/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py
+++ b/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py
@@ -20,9 +20,16 @@ logger = init_logger(__name__)
 if has_triton_kernels():
     try:
         import triton_kernels.swiglu
-        from triton_kernels.matmul_ogs import FnSpecs, FusedActivation, matmul_ogs
-        from triton_kernels.routing import RoutingData, routing, routing_from_bitmatrix
-        from triton_kernels.tensor import Bitmatrix
+        from triton_kernels.matmul_ogs import (
+            FnSpecs,
+            FusedActivation,
+            GatherIndx,
+            RoutingData,
+            ScatterIndx,
+            matmul_ogs,
+        )
+        from triton_kernels.tensor import BIT, Bitmatrix, SparseMatrix, make_ragged_tensor_metadata
+        from triton_kernels.topk import topk as triton_topk
     except (AttributeError, ImportError) as e:
         logger.error(
             "Failed to import Triton kernels. Please make sure your triton "
@@ -84,8 +91,17 @@ def triton_kernel_moe_forward(
     global_num_experts: int = -1,
     expert_map: torch.Tensor | None = None,
 ) -> torch.Tensor:
-    routing_data, gather_idx, scatter_idx = routing(
-        gating_output, topk, sm_first=not renormalize
+    # Use new topk API instead of deprecated routing
+    sm_first = not renormalize
+    if sm_first:
+        gating_output = torch.softmax(gating_output, dim=-1)
+    sparse_logits = triton_topk(
+        gating_output, topk, apply_softmax=not sm_first, y_indx=None, n_rows=None
+    )
+
+    # Convert to routing data using the existing make_routing_data function
+    routing_data, gather_idx, scatter_idx = make_routing_data(
+        sparse_logits.indx, sparse_logits.vals, gating_output.shape[-1]
     )

     return triton_kernel_fused_experts(
@@ -202,14 +218,29 @@ def make_routing_data(
     bitmatrix_shape = [n_rows, bm_cols * 32]
     bitmatrix_shape_max = [n_rows, None]
     bitmatrix = Bitmatrix(
-        bitmatrix, shape=bitmatrix_shape, shape_max=bitmatrix_shape_max, scratchpad=None
+        bitmatrix, dtype=BIT, shape=bitmatrix_shape, shape_max=bitmatrix_shape_max
     )

     # matmul_ogs expects invalid topk_weights to be -1s
     topk_weights = torch.where(topk_ids == -1, -1.0, topk_weights)
-    routing_data, gather_indx, scatter_indx = routing_from_bitmatrix(
-        bitmatrix, topk_weights, topk_ids, num_local_experts, num_topk
+
+    # Use new SparseMatrix API instead of deprecated routing_from_bitmatrix
+    sparse_logits = SparseMatrix(indx=topk_ids, vals=topk_weights, mask=bitmatrix)
+    dispatch_indx = sparse_logits.mask_metadata.col_sorted_indx
+    combine_indx = sparse_logits.mask_metadata.row_sorted_indx
+    ragged_batch_metadata = make_ragged_tensor_metadata(
+        sparse_logits.mask_metadata.col_sum, dispatch_indx.shape[0]
+    )
+    gate_scal = sparse_logits.vals.flatten()[combine_indx]
+    routing_data = RoutingData(
+        gate_scal,
+        ragged_batch_metadata.block_sizes,
+        num_local_experts,
+        num_topk,
+        ragged_batch_metadata,
     )
+    gather_indx = GatherIndx(combine_indx, dispatch_indx)
+    scatter_indx = ScatterIndx(dispatch_indx, combine_indx)

     return routing_data, gather_indx, scatter_indx
28
requirements.txt
Normal file
@@ -0,0 +1,28 @@
# Core Dependencies for vLLM on DGX Spark (Blackwell GB10)
# Note: This file is for reference only. The install.sh script handles
# all dependency installation with proper ordering and build flags.

# PyTorch with CUDA 13.0 support (installs latest available on cu130 index)
--index-url https://download.pytorch.org/whl/cu130
torch
torchvision
torchaudio

# Triton (must be built from source - see install.sh)
# Pinned to commit 4caa0328bf8df64896dd5f6fb9df41b0eb2e750a
# triton @ git+https://github.com/triton-lang/triton.git@4caa0328

# vLLM dependencies
xgrammar>=0.1.26
setuptools-scm>=9.2.2
setuptools>=77.0.0 # Required for PEP 639 license field support
apache-tvm-ffi==0.1.0b15 # Pre-release required

# Build dependencies
pybind11>=3.0.0
ninja>=1.13.0

# Optional but recommended
flashinfer-python>=0.4.1
transformers>=4.57.0
huggingface-hub>=0.36.0
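
# Example (reference only; not the supported path): the PyPI-sourced pieces above
# could be installed into the venv with something like
#   uv pip install -r requirements.txt --prerelease=allow
# while Triton and vLLM themselves are still built from source by install.sh.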
67
scripts/vllm-serve.sh
Normal file
@@ -0,0 +1,67 @@
#!/bin/bash
# vLLM Server Startup Script for DGX Spark
# Usage: ./vllm-serve.sh <model_name> [port]

set -e

# Determine installation directory (where this script is located)
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Configuration
MODEL="${1:-Qwen/Qwen2.5-0.5B-Instruct}"
PORT="${2:-8000}"
VLLM_DIR="$SCRIPT_DIR/vllm"
ENV_SCRIPT="$SCRIPT_DIR/vllm_env.sh"
PID_FILE="$SCRIPT_DIR/.vllm-server.pid"
LOG_FILE="$SCRIPT_DIR/vllm-server.log"

# Check if server is already running
if [ -f "$PID_FILE" ]; then
    PID=$(cat "$PID_FILE")
    if ps -p $PID > /dev/null 2>&1; then
        echo "ERROR: vLLM server is already running (PID: $PID)"
        echo "Use ./vllm-stop.sh to stop it first"
        exit 1
    fi
fi

# Source environment
source "$ENV_SCRIPT"

echo "----------------------------------------------------------------------"
echo "Starting vLLM Server on DGX Spark"
echo "----------------------------------------------------------------------"
echo "Model: $MODEL"
echo "Port: $PORT"
echo "Log file: $LOG_FILE"
echo "PID file: $PID_FILE"
echo "----------------------------------------------------------------------"

# Start server in background
cd "$VLLM_DIR"
nohup python -m vllm.entrypoints.openai.api_server \
    --model "$MODEL" \
    --trust-remote-code \
    --host 0.0.0.0 \
    --port "$PORT" \
    --gpu-memory-utilization 0.9 \
    > "$LOG_FILE" 2>&1 &

# Save PID
echo $! > "$PID_FILE"
echo "OK: Server started with PID: $(cat $PID_FILE)"
echo "OK: Waiting for server to be ready..."

# Wait for server to be ready
sleep 5
if ps -p $(cat "$PID_FILE") > /dev/null 2>&1; then
    echo "OK: Server is running!"
    echo ""
    echo "Test with: curl http://localhost:$PORT/v1/models"
    echo "View logs: tail -f $LOG_FILE"
    echo "Stop server: ./vllm-stop.sh"
else
    echo "ERROR: Server failed to start. Check logs: $LOG_FILE"
    rm -f "$PID_FILE"
    exit 1
fi
45
scripts/vllm-status.sh
Normal file
@@ -0,0 +1,45 @@
#!/bin/bash
# vLLM Server Status Script for DGX Spark

# Determine installation directory (where this script is located)
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

PID_FILE="$SCRIPT_DIR/.vllm-server.pid"
LOG_FILE="$SCRIPT_DIR/vllm-server.log"

echo "----------------------------------------------------------------------"
echo "vLLM Server Status on DGX Spark"
echo "----------------------------------------------------------------------"

if [ ! -f "$PID_FILE" ]; then
    echo "Status: NOT RUNNING (no PID file found)"
    exit 0
fi

PID=$(cat "$PID_FILE")

if ! ps -p $PID > /dev/null 2>&1; then
    echo "Status: NOT RUNNING (stale PID file)"
    echo "Cleaning up PID file..."
    rm -f "$PID_FILE"
    exit 0
fi

echo "Status: RUNNING"
echo "PID: $PID"
echo "Started: $(ps -p $PID -o lstart= 2>/dev/null || echo 'Unknown')"
echo "CPU: $(ps -p $PID -o %cpu= 2>/dev/null || echo 'N/A')%"
echo "Memory: $(ps -p $PID -o %mem= 2>/dev/null || echo 'N/A')%"
echo ""

# Check if log file exists and show last few lines
if [ -f "$LOG_FILE" ]; then
    echo "Recent log entries (last 10 lines):"
    echo "----------------------------------------------------------------------"
    tail -n 10 "$LOG_FILE"
else
    echo "Log file not found: $LOG_FILE"
fi

echo ""
echo "----------------------------------------------------------------------"
47
scripts/vllm-stop.sh
Normal file
@@ -0,0 +1,47 @@
#!/bin/bash
# vLLM Server Stop Script for DGX Spark

# Determine installation directory (where this script is located)
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

PID_FILE="$SCRIPT_DIR/.vllm-server.pid"

if [ ! -f "$PID_FILE" ]; then
    echo "No vLLM server PID file found. Server may not be running."
    exit 0
fi

PID=$(cat "$PID_FILE")

if ! ps -p $PID > /dev/null 2>&1; then
    echo "vLLM server (PID: $PID) is not running. Cleaning up PID file."
    rm -f "$PID_FILE"
    exit 0
fi

echo "Stopping vLLM server (PID: $PID)..."
kill $PID

# Wait for process to terminate
for i in {1..10}; do
    if ! ps -p $PID > /dev/null 2>&1; then
        echo "OK: Server stopped successfully"
        rm -f "$PID_FILE"
        exit 0
    fi
    sleep 1
done

# Force kill if still running
if ps -p $PID > /dev/null 2>&1; then
    echo "Server did not stop gracefully. Force killing..."
    kill -9 $PID
    sleep 1
    if ! ps -p $PID > /dev/null 2>&1; then
        echo "OK: Server force stopped"
        rm -f "$PID_FILE"
    else
        echo "ERROR: Failed to stop server"
        exit 1
    fi
fi