first commit
77
.gitignore
vendored
Normal file
@@ -0,0 +1,77 @@
# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# Virtual Environments
.venv
.vllm/
venv/
ENV/
env/

# IDEs
.vscode/
.idea/
*.swp
*.swo
*~

# OS
.DS_Store
Thumbs.db

# Logs
*.log
vllm-server.log
*.out
*.err

# Build artifacts
*.o
*.a
*.so
*.dylib
*.dll
CMakeCache.txt
CMakeFiles/
cmake_install.cmake
Makefile

# CUDA
*.ptx
*.cubin

# Local installation directories
triton/
vllm/
.cache/

# Temporary files
tmp/
temp/
*.tmp
*.bak

# Model downloads
models/
*.safetensors
*.bin
*.gguf
380
CLUSTER.md
Normal file
@@ -0,0 +1,380 @@
|
|||||||
|
# vLLM Cluster Mode Setup for DGX Spark
|
||||||
|
|
||||||
|
This guide covers setting up a multi-node vLLM deployment for distributed inference across DGX Spark systems.
|
||||||
|
|
||||||
|
## Prerequisites
|
||||||
|
|
||||||
|
- Multiple DGX Spark systems with vLLM installed (use `install.sh` on each node)
|
||||||
|
- All nodes on the same network with direct connectivity
|
||||||
|
- SSH access between nodes (passwordless SSH recommended)
|
||||||
|
- Same CUDA and vLLM versions across all nodes
|
||||||
|
|
||||||
|
## Architecture
|
||||||
|
|
||||||
|
```
|
||||||
|
┌─────────────────────┐
|
||||||
|
│ spark-alpha │
|
||||||
|
│ (Master/Head) │
|
||||||
|
│ - API Server │
|
||||||
|
│ - Request Router │
|
||||||
|
│ - Model Weights │
|
||||||
|
└──────────┬──────────┘
|
||||||
|
│
|
||||||
|
├─────────────────────┐
|
||||||
|
│ │
|
||||||
|
┌──────────▼──────────┐ ┌──────▼──────────┐
|
||||||
|
│ spark-omega │ │ spark-gamma │
|
||||||
|
│ (Worker 1) │ │ (Worker 2) │
|
||||||
|
│ - Inference │ │ - Inference │
|
||||||
|
│ - GPU Compute │ │ - GPU Compute │
|
||||||
|
└─────────────────────┘ └─────────────────┘
|
||||||
|
```
|
||||||
|
|
||||||
|
## Step 1: Install vLLM on All Nodes
|
||||||
|
|
||||||
|
Run the installer on each node:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# On spark-alpha (master)
|
||||||
|
curl -fsSL https://raw.githubusercontent.com/eelbaz/dgx-spark-vllm-setup/main/install.sh | bash
|
||||||
|
|
||||||
|
# On spark-omega (worker 1)
|
||||||
|
ssh spark-omega.local
|
||||||
|
curl -fsSL https://raw.githubusercontent.com/eelbaz/dgx-spark-vllm-setup/main/install.sh | bash
|
||||||
|
|
||||||
|
# On spark-gamma (worker 2)
|
||||||
|
ssh spark-gamma.local
|
||||||
|
curl -fsSL https://raw.githubusercontent.com/eelbaz/dgx-spark-vllm-setup/main/install.sh | bash
|
||||||
|
```
|
||||||
|
|
||||||
|
## Step 2: Configure Network Settings
|
||||||
|
|
||||||
|
Ensure all nodes can communicate on the required ports:
|
||||||
|
|
||||||
|
- **8000**: vLLM API server (master only)
|
||||||
|
- **29500**: PyTorch distributed backend (all nodes)
|
||||||
|
- **Random ports**: Ray cluster communication
|
||||||
|
|
||||||
|
Open firewall if needed:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# On all nodes
|
||||||
|
sudo ufw allow 8000/tcp
|
||||||
|
sudo ufw allow 29500/tcp
|
||||||
|
sudo ufw allow 6379/tcp # Ray GCS
|
||||||
|
sudo ufw allow 8265/tcp # Ray Dashboard
|
||||||
|
```
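
To confirm the ports are actually reachable, a quick check from another node can help. This is a minimal sketch assuming `nc` (netcat) is installed and `spark-alpha.local` is the head node; each port only answers once the corresponding service is running (Ray in Step 4, vLLM in Step 5).

```bash
# Run from a worker node
nc -zv spark-alpha.local 6379    # Ray GCS (after `ray start --head`)
nc -zv spark-alpha.local 29500   # PyTorch distributed backend
nc -zv spark-alpha.local 8000    # vLLM API (after the server has started)
```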
|
||||||
|
|
||||||
|
## Step 3: Set Up Passwordless SSH (Optional but Recommended)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# On master node
|
||||||
|
ssh-keygen -t rsa -b 4096 -f ~/.ssh/id_rsa -N ""
|
||||||
|
|
||||||
|
# Copy to worker nodes
|
||||||
|
ssh-copy-id spark-omega.local
|
||||||
|
ssh-copy-id spark-gamma.local
|
||||||
|
|
||||||
|
# Verify
|
||||||
|
ssh spark-omega.local "echo 'Connection successful'"
|
||||||
|
ssh spark-gamma.local "echo 'Connection successful'"
|
||||||
|
```
|
||||||
|
|
||||||
|
## Step 4: Start Ray Cluster
|
||||||
|
|
||||||
|
### On Master Node (spark-alpha)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Assuming vllm-install is in your home directory
|
||||||
|
source ~/vllm-install/vllm_env.sh
|
||||||
|
|
||||||
|
# Start Ray head node
|
||||||
|
ray start --head \
|
||||||
|
--port=6379 \
|
||||||
|
--dashboard-host=0.0.0.0 \
|
||||||
|
--dashboard-port=8265 \
|
||||||
|
--num-gpus=1
|
||||||
|
|
||||||
|
# Note the output: "To connect to this Ray cluster, use: ray start --address='MASTER_IP:6379'"
|
||||||
|
```
|
||||||
|
|
||||||
|
### On Worker Nodes (spark-omega, spark-gamma)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
source ~/vllm-install/vllm_env.sh
|
||||||
|
|
||||||
|
# Replace MASTER_IP with spark-alpha's IP address
|
||||||
|
ray start --address='MASTER_IP:6379' --num-gpus=1
|
||||||
|
```
|
||||||
|
|
||||||
|
Verify cluster status:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
ray status
|
||||||
|
```
|
||||||
|
|
||||||
|
You should see all nodes listed.
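
For a scriptable check, the Ray Python API can report how many nodes have joined. A minimal sketch, assuming it runs on the head node with the vLLM environment activated:

```bash
python -c "import ray; ray.init(address='auto'); print(sum(1 for n in ray.nodes() if n['Alive']), 'node(s) alive')"
```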
|
||||||
|
|
||||||
|
## Step 5: Start vLLM with Tensor Parallelism
|
||||||
|
|
||||||
|
### Method 1: Tensor Parallelism (Recommended for Large Models)
|
||||||
|
|
||||||
|
Tensor parallelism splits model layers across multiple GPUs.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# On master node
|
||||||
|
source ~/vllm-install/vllm_env.sh
|
||||||
|
|
||||||
|
vllm serve \
|
||||||
|
--model "meta-llama/Llama-3.1-70B-Instruct" \
|
||||||
|
--tensor-parallel-size 2 \
|
||||||
|
--trust-remote-code \
|
||||||
|
--host 0.0.0.0 \
|
||||||
|
--port 8000
|
||||||
|
```
|
||||||
|
|
||||||
|
This will automatically distribute the model across 2 GPUs in the Ray cluster.
|
||||||
|
|
||||||
|
### Method 2: Pipeline Parallelism
|
||||||
|
|
||||||
|
Pipeline parallelism splits model stages across GPUs.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
vllm serve \
|
||||||
|
--model "meta-llama/Llama-3.1-70B-Instruct" \
|
||||||
|
--pipeline-parallel-size 2 \
|
||||||
|
--trust-remote-code \
|
||||||
|
--host 0.0.0.0 \
|
||||||
|
--port 8000
|
||||||
|
```
|
||||||
|
|
||||||
|
### Method 3: Combined Parallelism
|
||||||
|
|
||||||
|
For very large models, combine tensor and pipeline parallelism:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
vllm serve \
|
||||||
|
--model "meta-llama/Llama-3.1-405B-Instruct" \
|
||||||
|
--tensor-parallel-size 4 \
|
||||||
|
--pipeline-parallel-size 2 \
|
||||||
|
--trust-remote-code \
|
||||||
|
--host 0.0.0.0 \
|
||||||
|
--port 8000
|
||||||
|
```
|
||||||
|
|
||||||
|
## Step 6: Test Cluster Inference
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Test from master node
|
||||||
|
curl http://localhost:8000/v1/models
|
||||||
|
|
||||||
|
# Test from external machine
|
||||||
|
curl http://spark-alpha.local:8000/v1/completions \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{
|
||||||
|
"model": "meta-llama/Llama-3.1-70B-Instruct",
|
||||||
|
"prompt": "Explain distributed inference in 3 sentences.",
|
||||||
|
"max_tokens": 100,
|
||||||
|
"temperature": 0.7
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
|
||||||
|
## Step 7: Monitor Cluster
|
||||||
|
|
||||||
|
### Ray Dashboard
|
||||||
|
|
||||||
|
Access at: http://spark-alpha.local:8265
|
||||||
|
|
||||||
|
Shows:
|
||||||
|
- Node status and resources
|
||||||
|
- Task execution
|
||||||
|
- GPU utilization
|
||||||
|
- Memory usage
|
||||||
|
|
||||||
|
### vLLM Metrics
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# On master node
|
||||||
|
tail -f ~/vllm-install/vllm-server.log
|
||||||
|
|
||||||
|
# Check GPU usage on each node over SSH (adjust hostnames to your cluster)
for node in spark-omega.local spark-gamma.local; do ssh "$node" nvidia-smi; done
|
||||||
|
```
|
||||||
|
|
||||||
|
### System Monitoring
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Check Ray cluster status
|
||||||
|
ray status
|
||||||
|
|
||||||
|
# Monitor GPU usage on specific node
|
||||||
|
ssh spark-omega.local nvidia-smi -l 1
|
||||||
|
```
|
||||||
|
|
||||||
|
## Troubleshooting
|
||||||
|
|
||||||
|
### Workers Not Connecting
|
||||||
|
|
||||||
|
**Problem**: Workers can't connect to Ray head node
|
||||||
|
|
||||||
|
**Solutions**:
|
||||||
|
1. Check firewall: `sudo ufw status`
|
||||||
|
2. Verify head node IP: `ray status` on master
|
||||||
|
3. Check network connectivity: `ping spark-alpha.local`
|
||||||
|
4. Ensure same Ray version on all nodes: `ray --version`
|
||||||
|
|
||||||
|
### OOM Errors with Large Models
|
||||||
|
|
||||||
|
**Problem**: Out of memory when loading large models
|
||||||
|
|
||||||
|
**Solutions**:
|
||||||
|
1. Increase tensor parallelism: `--tensor-parallel-size 4`
|
||||||
|
2. Reduce memory utilization: `--gpu-memory-utilization 0.8`
|
||||||
|
3. Enable CPU offloading: `--cpu-offload-gb 8`
|
||||||
|
4. Use quantization: `--quantization awq` or `--quantization gptq`
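
As a hedged illustration, several of these mitigations can be combined in one launch; flag names follow the examples above, and `--quantization` additionally requires a matching quantized checkpoint:

```bash
vllm serve \
    --model "meta-llama/Llama-3.1-70B-Instruct" \
    --tensor-parallel-size 4 \
    --gpu-memory-utilization 0.8 \
    --cpu-offload-gb 8
```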
|
||||||
|
|
||||||
|
### Model Loading Hangs
|
||||||
|
|
||||||
|
**Problem**: Model download/loading takes forever
|
||||||
|
|
||||||
|
**Solutions**:
|
||||||
|
1. Pre-download model on all nodes:
|
||||||
|
```bash
|
||||||
|
# On each node
|
||||||
|
python -c "from transformers import AutoModel; AutoModel.from_pretrained('meta-llama/Llama-3.1-70B-Instruct')"
|
||||||
|
```
|
||||||
|
2. Use shared storage (NFS) for model cache
|
||||||
|
3. Check network bandwidth between nodes
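
A rough way to measure inter-node bandwidth, assuming `iperf3` is installed on both machines (hostnames as used throughout this guide):

```bash
# One-shot iperf3 server on a worker, then a client run from the master
ssh spark-omega.local "iperf3 -s -1" &
sleep 2
iperf3 -c spark-omega.local
```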
|
||||||
|
|
||||||
|
### Uneven GPU Utilization
|
||||||
|
|
||||||
|
**Problem**: Some GPUs idle while others maxed out
|
||||||
|
|
||||||
|
**Solutions**:
|
||||||
|
1. Verify tensor parallel configuration
|
||||||
|
2. Check Ray resource allocation: `ray status`
|
||||||
|
3. Ensure balanced request distribution
|
||||||
|
4. Monitor per-node GPU usage, e.g. `ssh spark-omega.local nvidia-smi`
|
||||||
|
|
||||||
|
## Advanced Configuration
|
||||||
|
|
||||||
|
### Custom Ray Resources
|
||||||
|
|
||||||
|
Assign custom resources to nodes for fine-grained control:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# On worker with high memory
|
||||||
|
ray start --address='MASTER_IP:6379' \
|
||||||
|
--num-gpus=1 \
|
||||||
|
--resources='{"highmem": 1}'
|
||||||
|
|
||||||
|
# Use in vLLM
|
||||||
|
vllm serve --model "..." --placement-group-resources='{"highmem": 1}'
|
||||||
|
```
|
||||||
|
|
||||||
|
### Distributed Model Cache
|
||||||
|
|
||||||
|
Share model weights via NFS to avoid redundant downloads:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# On NFS server (e.g., master)
|
||||||
|
sudo apt install nfs-kernel-server
|
||||||
|
echo "$HOME/.cache/huggingface *(rw,sync,no_subtree_check)" | sudo tee -a /etc/exports
|
||||||
|
sudo exportfs -a
|
||||||
|
|
||||||
|
# On workers
|
||||||
|
sudo apt install nfs-common
|
||||||
|
sudo mkdir -p $HOME/.cache/huggingface
|
||||||
|
sudo mount spark-alpha.local:$HOME/.cache/huggingface $HOME/.cache/huggingface
|
||||||
|
```
|
||||||
|
|
||||||
|
### Load Balancing with nginx
|
||||||
|
|
||||||
|
For production deployments, use nginx to load balance across multiple vLLM instances:
|
||||||
|
|
||||||
|
```nginx
|
||||||
|
upstream vllm_cluster {
|
||||||
|
least_conn;
|
||||||
|
server spark-alpha.local:8000;
|
||||||
|
server spark-omega.local:8000;
|
||||||
|
server spark-gamma.local:8000;
|
||||||
|
}
|
||||||
|
|
||||||
|
server {
|
||||||
|
listen 80;
|
||||||
|
location / {
|
||||||
|
proxy_pass http://vllm_cluster;
|
||||||
|
proxy_set_header Host $host;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
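
One way to apply this configuration on a stock Ubuntu nginx install; the file name `vllm.conf` is just an example, and the default site listening on port 80 may need to be disabled to avoid a conflict:

```bash
sudo apt install nginx
sudo cp vllm.conf /etc/nginx/conf.d/vllm.conf   # vllm.conf holds the blocks above
sudo nginx -t && sudo systemctl reload nginx
```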
|
||||||
|
|
||||||
|
## Cluster Management Scripts
|
||||||
|
|
||||||
|
### Start Cluster
|
||||||
|
|
||||||
|
Create `start-cluster.sh`:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
#!/bin/bash
|
||||||
|
# Start Ray cluster on all nodes
|
||||||
|
|
||||||
|
ssh spark-alpha.local "source ~/vllm-install/vllm_env.sh && ray start --head --port=6379"
|
||||||
|
sleep 5
|
||||||
|
|
||||||
|
MASTER_IP=$(ssh spark-alpha.local "hostname -I | awk '{print \$1}'")
|
||||||
|
|
||||||
|
ssh spark-omega.local "source ~/vllm-install/vllm_env.sh && ray start --address='${MASTER_IP}:6379'"
|
||||||
|
ssh spark-gamma.local "source ~/vllm-install/vllm_env.sh && ray start --address='${MASTER_IP}:6379'"
|
||||||
|
|
||||||
|
echo "Cluster started. Check status with: ray status"
|
||||||
|
```
|
||||||
|
|
||||||
|
### Stop Cluster
|
||||||
|
|
||||||
|
Create `stop-cluster.sh`:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
#!/bin/bash
|
||||||
|
# Stop Ray cluster on all nodes
|
||||||
|
|
||||||
|
for node in spark-alpha.local spark-omega.local spark-gamma.local; do
|
||||||
|
echo "Stopping Ray on $node..."
|
||||||
|
ssh $node "ray stop --force"
|
||||||
|
done
|
||||||
|
|
||||||
|
echo "Cluster stopped."
|
||||||
|
```
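
Both scripts need to be executable before first use:

```bash
chmod +x start-cluster.sh stop-cluster.sh
./start-cluster.sh
```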
|
||||||
|
|
||||||
|
## Performance Tuning
|
||||||
|
|
||||||
|
### For Maximum Throughput
|
||||||
|
|
||||||
|
```bash
|
||||||
|
vllm serve \
|
||||||
|
--model "meta-llama/Llama-3.1-70B-Instruct" \
|
||||||
|
--tensor-parallel-size 2 \
|
||||||
|
--max-num-seqs 256 \
|
||||||
|
--max-num-batched-tokens 8192 \
|
||||||
|
--gpu-memory-utilization 0.95
|
||||||
|
```
|
||||||
|
|
||||||
|
### For Low Latency
|
||||||
|
|
||||||
|
```bash
|
||||||
|
vllm serve \
|
||||||
|
--model "meta-llama/Llama-3.1-70B-Instruct" \
|
||||||
|
--tensor-parallel-size 2 \
|
||||||
|
--max-num-seqs 32 \
|
||||||
|
--disable-log-requests
|
||||||
|
```
|
||||||
|
|
||||||
|
## References
|
||||||
|
|
||||||
|
- [vLLM Distributed Inference](https://docs.vllm.ai/en/latest/serving/distributed_serving.html)
|
||||||
|
- [Ray Cluster Setup](https://docs.ray.io/en/latest/cluster/getting-started.html)
|
||||||
|
- [PyTorch Distributed](https://pytorch.org/tutorials/beginner/dist_overview.html)
|
||||||
|
|
||||||
|
## Support
|
||||||
|
|
||||||
|
For issues specific to DGX Spark cluster setup, please open an issue on GitHub.
|
||||||
134
CRITICAL_FIX_ANALYSIS.md
Normal file
@@ -0,0 +1,134 @@
|
|||||||
|
# Critical Blackwell GB10 Fixes for vLLM
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
Three critical fixes are required for vLLM on Blackwell GB10 (sm_121a) GPUs with CUDA 13.0+:
|
||||||
|
|
||||||
|
1. **CMakeLists.txt SM120 Support** - Add missing architecture
|
||||||
|
2. **vLLM Commit Version** - Use commit with Blackwell/Triton fixes
|
||||||
|
3. **Triton Version Pinning** - Use tested working commit
|
||||||
|
|
||||||
|
## Fix 1: CMakeLists.txt SM120 Support
|
||||||
|
|
||||||
|
### Root Cause
|
||||||
|
|
||||||
|
vLLM v0.11.1rc3 CMakeLists.txt has **incomplete architecture support** for Blackwell GB10 (sm_121a) MOE kernels when using CUDA 13.0+.
|
||||||
|
|
||||||
|
### The Problem
|
||||||
|
|
||||||
|
For CUDA 13.0+, the code uses these branches:
|
||||||
|
- **Line 490**: Regular MOE kernels
|
||||||
|
- **Line 671**: Grouped MM MOE kernels
|
||||||
|
|
||||||
|
Original v0.11.1rc3:
|
||||||
|
```cmake
|
||||||
|
# Line 490
|
||||||
|
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
|
||||||
|
|
||||||
|
# Line 671
|
||||||
|
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
|
||||||
|
```
|
||||||
|
|
||||||
|
**BOTH lines are missing `12.0f` (SM120) support!**
|
||||||
|
|
||||||
|
### The Fix
|
||||||
|
|
||||||
|
Both lines need `12.0f` added:
|
||||||
|
```cmake
|
||||||
|
# Line 490
|
||||||
|
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
|
||||||
|
|
||||||
|
# Line 671
|
||||||
|
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
|
||||||
|
```
|
||||||
|
|
||||||
|
### Error Symptoms
|
||||||
|
|
||||||
|
Without this fix:
|
||||||
|
```
|
||||||
|
ImportError: undefined symbol: _Z20cutlass_moe_mm_sm100RN2at6TensorERKS0_S3_S3_S3_S3_S3_S3_S3_S3_bb
|
||||||
|
```
|
||||||
|
|
||||||
|
The MOE kernels for SM100/SM120 aren't compiled, causing import failures.
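
A quick way to check for the problem is to import the compiled extension directly; `vllm._C` is the extension module name used by current vLLM builds, so adjust if your version differs:

```bash
python -c "import vllm._C; print('vLLM C extension loaded')"
```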
|
||||||
|
|
||||||
|
### Why install.sh Works
|
||||||
|
|
||||||
|
The sed command on line 323:
|
||||||
|
```bash
|
||||||
|
sed -i 's/cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f"/cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f"/' CMakeLists.txt
|
||||||
|
```
|
||||||
|
|
||||||
|
This replaces **ALL** occurrences, fixing both lines 490 and 671 in one command.
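
To confirm the patch landed, grep the patched file; both line 490 and line 671 should match:

```bash
grep -n '"10.0f;11.0f;12.0f"' CMakeLists.txt
```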
|
||||||
|
|
||||||
|
### Verified Solution
|
||||||
|
|
||||||
|
Tested on NVIDIA DGX Spark with Blackwell GB10, CUDA 13.0:
|
||||||
|
- [OK] Line 490 fixed: `"10.0f;11.0f;12.0f"`
|
||||||
|
- [OK] Line 671 fixed: `"10.0f;11.0f;12.0f"`
|
||||||
|
- [OK] vLLM imports successfully
|
||||||
|
- [OK] No cutlass_moe_mm_sm100 symbol errors
|
||||||
|
- [OK] Build time: ~19 minutes
|
||||||
|
|
||||||
|
## Fix 2: vLLM Commit Version
|
||||||
|
|
||||||
|
### Issue
|
||||||
|
|
||||||
|
vLLM tag `v0.11.1rc3` lacks critical Triton/PyTorch Inductor fixes for Blackwell.
|
||||||
|
|
||||||
|
### Solution
|
||||||
|
|
||||||
|
Use commit `66a168a197ba214a5b70a74fa2e713c9eeb3251a` (6 commits ahead of v0.11.1rc3):
|
||||||
|
- Contains Triton JIT compilation fixes
|
||||||
|
- Includes PyTorch Inductor optimizations for Blackwell
|
||||||
|
- Adds proper backend registration handling
|
||||||
|
|
||||||
|
### Installation
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd vllm
|
||||||
|
git checkout 66a168a197ba214a5b70a74fa2e713c9eeb3251a
|
||||||
|
git submodule update --init --recursive
|
||||||
|
```
|
||||||
|
|
||||||
|
## Fix 3: Triton Version Pinning
|
||||||
|
|
||||||
|
### Issue
|
||||||
|
|
||||||
|
Latest Triton main branch (as of late October 2025) has intermittent JITFunction compilation issues with PyTorch Inductor on Blackwell.
|
||||||
|
|
||||||
|
### Solution
|
||||||
|
|
||||||
|
Pin to tested working commit: `4caa0328bf8df64896dd5f6fb9df41b0eb2e750a` (October 25, 2025)
|
||||||
|
- Verified stable with Blackwell GB10
|
||||||
|
- Passes all compilation tests
|
||||||
|
- No JITFunction.constexprs errors
|
||||||
|
|
||||||
|
### Installation
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd triton
|
||||||
|
git checkout 4caa0328bf8df64896dd5f6fb9df41b0eb2e750a
|
||||||
|
git submodule update --init --recursive
|
||||||
|
python -m pip install --no-build-isolation -v .
|
||||||
|
```
|
||||||
|
|
||||||
|
## Complete Verified Configuration
|
||||||
|
|
||||||
|
| Component | Version/Commit | Notes |
|
||||||
|
|-----------|---------------|-------|
|
||||||
|
| **vLLM** | `66a168a197ba214a5b70a74fa2e713c9eeb3251a` | 6 commits ahead of v0.11.1rc3 |
|
||||||
|
| **Triton** | `4caa0328bf8df64896dd5f6fb9df41b0eb2e750a` | October 25, 2025 |
|
||||||
|
| **PyTorch** | `2.9.0+cu130` | From vLLM requirements |
|
||||||
|
| **CUDA** | `13.0` (V13.0.88) | System CUDA |
|
||||||
|
| **Python** | `3.12.3` | |
|
||||||
|
|
||||||
|
## Testing
|
||||||
|
|
||||||
|
Verified working with:
|
||||||
|
```bash
|
||||||
|
python -c "from vllm import LLM, SamplingParams; \
|
||||||
|
llm = LLM(model='Qwen/Qwen2.5-0.5B-Instruct', max_model_len=512); \
|
||||||
|
print(llm.generate(['Hello'], SamplingParams(max_tokens=20)))"
|
||||||
|
```
|
||||||
|
|
||||||
|
**All tests pass**: Import, compilation, CUDA graphs, and text generation all work correctly.
|
||||||
21
LICENSE
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
MIT License
|
||||||
|
|
||||||
|
Copyright (c) 2025 DGX Spark Community
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
|
in the Software without restriction, including without limitation the rights
|
||||||
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
copies of the Software, and to permit persons to whom the Software is
|
||||||
|
furnished to do so, subject to the following conditions:
|
||||||
|
|
||||||
|
The above copyright notice and this permission notice shall be included in all
|
||||||
|
copies or substantial portions of the Software.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
|
SOFTWARE.
|
||||||
312
README.md
Normal file
@@ -0,0 +1,312 @@
|
|||||||
|
# vLLM Setup for NVIDIA DGX Spark (Blackwell GB10)
|
||||||
|
|
||||||
|
**One-command installation** of vLLM for NVIDIA DGX Spark systems with GB10 GPUs (Blackwell architecture, sm_121).
|
||||||
|
|
||||||
|
This repository provides a DGX Spark-tested, ready-to-run setup script that handles the complexities of building vLLM on the DGX Spark platform, including:
|
||||||
|
- CUDA 13.0 support with Blackwell-specific optimizations
|
||||||
|
- Critical fixes for SM100/SM120 MOE kernel compilation
|
||||||
|
- Triton 3.5.0 from main branch (required for sm_121a support)
|
||||||
|
- PyTorch 2.9.0 with CUDA 13.0 bindings
|
||||||
|
- All necessary build fixes and workarounds
|
||||||
|
|
||||||
|
## Quick Start
|
||||||
|
|
||||||
|
**One-command installation** - installs to `./vllm-install` in your current directory:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl -fsSL https://raw.githubusercontent.com/eelbaz/dgx-spark-vllm-setup/main/install.sh | bash
|
||||||
|
```
|
||||||
|
|
||||||
|
Or specify a custom directory:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl -fsSL https://raw.githubusercontent.com/eelbaz/dgx-spark-vllm-setup/main/install.sh | bash -s -- --install-dir ~/my/custom/path
|
||||||
|
```
|
||||||
|
|
||||||
|
**Installation time:** ~20-30 minutes (mostly compilation)
|
||||||
|
|
||||||
|
### Alternative: Clone and Install
|
||||||
|
|
||||||
|
```bash
|
||||||
|
git clone https://github.com/eelbaz/dgx-spark-vllm-setup.git
|
||||||
|
cd dgx-spark-vllm-setup
|
||||||
|
./install.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
### Installation Options
|
||||||
|
|
||||||
|
```bash
|
||||||
|
./install.sh [OPTIONS]
|
||||||
|
|
||||||
|
Options:
|
||||||
|
--install-dir DIR Installation directory (default: ./vllm-install)
|
||||||
|
  --vllm-version HASH      vLLM git commit (default: 66a168a19, tested with Blackwell)
|
||||||
|
--python-version VER Python version (default: 3.12)
|
||||||
|
--skip-tests Skip post-installation tests
|
||||||
|
--help Show help message
|
||||||
|
```
|
||||||
|
|
||||||
|
## System Requirements
|
||||||
|
|
||||||
|
- **Hardware:** NVIDIA DGX Spark with GB10 GPU (Blackwell sm_121)
|
||||||
|
- **OS:** Ubuntu 22.04+ (tested on Linux 6.11.0 ARM64)
|
||||||
|
- **CUDA:** 13.0 or later (driver 580.95.05+)
|
||||||
|
- **Disk Space:** ~50GB free
|
||||||
|
- **RAM:** 8GB+ recommended during build
|
||||||
|
|
||||||
|
## What Gets Installed
|
||||||
|
|
||||||
|
Installed to `./vllm-install` (or your custom directory):
|
||||||
|
|
||||||
|
- **Python 3.12** virtual environment at `.vllm/`
|
||||||
|
- **PyTorch 2.9.0+cu130** with full CUDA 13.0 support
|
||||||
|
- **Triton 3.5.0+git** from main branch (pre-release with Blackwell support)
|
||||||
|
- **vLLM 0.11.1rc3+** with all Blackwell-specific patches
|
||||||
|
- **Helper scripts** for managing vLLM server
|
||||||
|
- **Environment activation** script (`vllm_env.sh`)
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
All examples assume you're in the installation directory (default: `./vllm-install`).
|
||||||
|
|
||||||
|
### Activate Environment
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd vllm-install
|
||||||
|
source vllm_env.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
### Start vLLM Server
|
||||||
|
|
||||||
|
```bash
|
||||||
|
./vllm-serve.sh # Default: Qwen2.5-0.5B on port 8000
|
||||||
|
./vllm-serve.sh "facebook/opt-125m" 8001 # Custom model and port
|
||||||
|
```
|
||||||
|
|
||||||
|
### Check Server Status
|
||||||
|
|
||||||
|
```bash
|
||||||
|
./vllm-status.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
### Stop Server
|
||||||
|
|
||||||
|
```bash
|
||||||
|
./vllm-stop.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
### Test API
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# List models
|
||||||
|
curl http://localhost:8000/v1/models
|
||||||
|
|
||||||
|
# Generate completion
|
||||||
|
curl http://localhost:8000/v1/completions \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{
|
||||||
|
"model": "Qwen/Qwen2.5-0.5B-Instruct",
|
||||||
|
"prompt": "Hello, how are you?",
|
||||||
|
"max_tokens": 50
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
|
||||||
|
### Python API
|
||||||
|
|
||||||
|
```python
|
||||||
|
from vllm import LLM, SamplingParams
|
||||||
|
|
||||||
|
llm = LLM(
|
||||||
|
model="Qwen/Qwen2.5-0.5B-Instruct",
|
||||||
|
trust_remote_code=True,
|
||||||
|
gpu_memory_utilization=0.9
|
||||||
|
)
|
||||||
|
|
||||||
|
prompts = ["Tell me about DGX Spark"]
|
||||||
|
sampling_params = SamplingParams(temperature=0.7, max_tokens=100)
|
||||||
|
outputs = llm.generate(prompts, sampling_params)
|
||||||
|
|
||||||
|
print(outputs[0].outputs[0].text)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Critical Fixes Applied
|
||||||
|
|
||||||
|
This installer automatically applies the following critical fixes:
|
||||||
|
|
||||||
|
### 1. CMakeLists.txt SM100/SM120 MOE Kernel Fix
|
||||||
|
|
||||||
|
**Issue:** vLLM's MOE kernels for SM100/SM120 Blackwell architectures were incomplete
|
||||||
|
**Fix:** Added `12.0f` and `12.1a` to SCALED_MM_ARCHS in CMakeLists.txt
|
||||||
|
|
||||||
|
```cmake
|
||||||
|
# CUDA 13.0+ path (line ~671)
|
||||||
|
# Before
|
||||||
|
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
|
||||||
|
# After
|
||||||
|
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
|
||||||
|
|
||||||
|
# Older CUDA path (line ~673)
|
||||||
|
# Before
|
||||||
|
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
|
||||||
|
# After
|
||||||
|
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;12.1a" "${CUDA_ARCHS}")
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2. pyproject.toml License Field Format
|
||||||
|
|
||||||
|
**Issue:** Newer setuptools requires structured license format
|
||||||
|
**Fix:** Convert license string to dict format in both vLLM and flashinfer-python
|
||||||
|
|
||||||
|
```toml
|
||||||
|
# Before
|
||||||
|
license = "Apache-2.0"
|
||||||
|
license-files = ["LICENSE"]
|
||||||
|
|
||||||
|
# After
|
||||||
|
license = {text = "Apache-2.0"}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Applied to:**
|
||||||
|
- vLLM's pyproject.toml
|
||||||
|
- flashinfer-python's pyproject.toml (patched during build)
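
For reference, a patch of this shape can be applied with sed; this is an illustrative sketch, not necessarily the exact command the installer runs:

```bash
sed -i 's/^license = "Apache-2.0"$/license = {text = "Apache-2.0"}/' pyproject.toml
sed -i '/^license-files = \["LICENSE"\]$/d' pyproject.toml
```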
|
||||||
|
|
||||||
|
### 3. GPT-OSS Triton MOE Kernels for Qwen3/gpt-oss Support
|
||||||
|
|
||||||
|
**Issue:** vLLM's GPT-OSS MOE kernel implementation uses deprecated Triton routing API
|
||||||
|
**Fix:** Update to new Triton kernel API (topk and SparseMatrix)
|
||||||
|
|
||||||
|
**Changes:**
|
||||||
|
- Replace deprecated `routing()` with `triton_topk()`
|
||||||
|
- Replace deprecated `routing_from_bitmatrix()` with `SparseMatrix()`
|
||||||
|
- Add support for `GatherIndx`, `ScatterIndx`, and new ragged tensor metadata
|
||||||
|
|
||||||
|
**Enables support for:**
|
||||||
|
- Qwen3 models with MOE architecture
|
||||||
|
- gpt-oss models using Triton kernels
|
||||||
|
- Latest Triton kernel optimizations for Blackwell
|
||||||
|
|
||||||
|
### 4. Triton Main Branch Requirement
|
||||||
|
|
||||||
|
**Issue:** Official Triton 3.5.0 release has bugs with sm_121a
|
||||||
|
**Fix:** Build Triton from main branch with latest Blackwell fixes
|
||||||
|
|
||||||
|
## Architecture-Specific Configuration
|
||||||
|
|
||||||
|
The installer sets these critical environment variables:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
TORCH_CUDA_ARCH_LIST=12.1a # Blackwell sm_121
|
||||||
|
VLLM_USE_FLASHINFER_MXFP4_MOE=1 # Enable FlashInfer MOE optimization
|
||||||
|
TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas # CUDA PTX assembler
|
||||||
|
TIKTOKEN_CACHE_DIR=$INSTALL_DIR/.tiktoken_cache # Cache tiktoken encodings locally
|
||||||
|
```
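
To verify these are active in the current shell after sourcing `vllm_env.sh`:

```bash
env | grep -E 'TORCH_CUDA_ARCH_LIST|VLLM_USE_FLASHINFER_MXFP4_MOE|TRITON_PTXAS_PATH|TIKTOKEN_CACHE_DIR'
```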
|
||||||
|
|
||||||
|
## Cluster Mode Setup
|
||||||
|
|
||||||
|
To set up multi-node vLLM cluster:
|
||||||
|
|
||||||
|
1. Run this installer on all nodes
|
||||||
|
2. Follow [CLUSTER.md](./CLUSTER.md) for configuration
|
||||||
|
|
||||||
|
## Troubleshooting
|
||||||
|
|
||||||
|
### Build Fails with "TypeError: can only concatenate str (not 'NoneType') to str"
|
||||||
|
|
||||||
|
This is a known Triton editable-mode build issue. The installer works around this by:
|
||||||
|
- Building Triton in non-editable mode
|
||||||
|
- Or copying pre-built Triton from another node
|
||||||
|
|
||||||
|
### Symbol Error: cutlass_moe_mm_sm100
|
||||||
|
|
||||||
|
**Symptom:** `ImportError: undefined symbol: _Z20cutlass_moe_mm_sm100`
|
||||||
|
**Solution:** Ensure CMakeLists.txt fix is applied (done automatically by installer)
|
||||||
|
|
||||||
|
### PyTorch CUDA Capability Warning
|
||||||
|
|
||||||
|
**Symptom:** Warning about GPU capability 12.1 vs PyTorch max 12.0
|
||||||
|
**Status:** Harmless warning - PyTorch 2.9.0+cu130 works correctly with GB10
|
||||||
|
|
||||||
|
### ImportError: No module named 'vllm'
|
||||||
|
|
||||||
|
**Solution:**
|
||||||
|
```bash
|
||||||
|
source vllm-install/vllm_env.sh
|
||||||
|
python -c "import vllm; print(vllm.__version__)"
|
||||||
|
```
|
||||||
|
|
||||||
|
## File Structure
|
||||||
|
|
||||||
|
```
|
||||||
|
vllm-install/
|
||||||
|
├── .vllm/ # Python virtual environment
|
||||||
|
├── vllm/ # vLLM source (editable install)
|
||||||
|
├── triton/ # Triton source
|
||||||
|
├── vllm_env.sh # Environment activation script
|
||||||
|
├── vllm-serve.sh # Start server
|
||||||
|
├── vllm-stop.sh # Stop server
|
||||||
|
├── vllm-status.sh # Check status
|
||||||
|
└── vllm-server.log # Server logs
|
||||||
|
```
|
||||||
|
|
||||||
|
## Manual Installation
|
||||||
|
|
||||||
|
If you prefer to understand each step:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 1. Install uv package manager
|
||||||
|
curl -LsSf https://astral.sh/uv/install.sh | sh
|
||||||
|
export PATH="$HOME/.local/bin:$PATH"
|
||||||
|
|
||||||
|
# 2. Create installation directory and Python virtual environment
|
||||||
|
mkdir -p vllm-install && cd vllm-install
|
||||||
|
uv venv .vllm --python 3.12
|
||||||
|
source .vllm/bin/activate
|
||||||
|
|
||||||
|
# 3. Install PyTorch with CUDA 13.0
|
||||||
|
uv pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu130
|
||||||
|
|
||||||
|
# 4. Clone and build Triton from main
|
||||||
|
git clone https://github.com/triton-lang/triton.git
|
||||||
|
cd triton
|
||||||
|
uv pip install pip cmake ninja pybind11
|
||||||
|
TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas python -m pip install --no-build-isolation .
|
||||||
|
|
||||||
|
# 5. Install additional dependencies
|
||||||
|
uv pip install xgrammar setuptools-scm apache-tvm-ffi==0.1.0b15 --prerelease=allow
|
||||||
|
|
||||||
|
# 6. Clone vLLM
|
||||||
|
cd ..
|
||||||
|
git clone --recursive https://github.com/vllm-project/vllm.git
|
||||||
|
cd vllm
|
||||||
|
git checkout v0.11.1rc3
|
||||||
|
|
||||||
|
# 7. Apply fixes (see scripts/apply-fixes.sh)
|
||||||
|
# 8. Build vLLM (see install.sh for full process)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Version Information
|
||||||
|
|
||||||
|
- **vLLM:** 0.11.1rc4.dev6+g66a168a19.d20251026
|
||||||
|
- **PyTorch:** 2.9.0+cu130
|
||||||
|
- **Triton:** 3.5.0+git4caa0328
|
||||||
|
- **CUDA:** 13.0
|
||||||
|
- **Python:** 3.12.3
|
||||||
|
- **Target Architecture:** sm_121 (Blackwell GB10)
|
||||||
|
|
||||||
|
## Contributing
|
||||||
|
|
||||||
|
Issues and pull requests welcome! This installer is maintained by the DGX Spark community.
|
||||||
|
|
||||||
|
## References
|
||||||
|
|
||||||
|
- [NVIDIA Forum Discussion](https://forums.developer.nvidia.com/t/run-vllm-in-spark/348862)
|
||||||
|
- [vLLM GitHub](https://github.com/vllm-project/vllm)
|
||||||
|
- [Triton GitHub](https://github.com/triton-lang/triton)
|
||||||
|
|
||||||
|
## License
|
||||||
|
|
||||||
|
MIT License - See [LICENSE](./LICENSE)
|
||||||
|
|
||||||
|
## Acknowledgments
|
||||||
|
|
||||||
|
Developed and tested on NVIDIA DGX Spark systems. Special thanks to the vLLM and Triton communities.
|
||||||
246
SUMMARY.md
Normal file
@@ -0,0 +1,246 @@
|
|||||||
|
# Repository Summary
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
This repository provides a **production-ready, one-command installation** of vLLM for NVIDIA DGX Spark systems with Blackwell GB10 GPUs (sm_121 architecture).
|
||||||
|
|
||||||
|
## What's Included
|
||||||
|
|
||||||
|
### Core Files
|
||||||
|
|
||||||
|
1. **install.sh** (500+ lines)
|
||||||
|
- Fully automated installation script
|
||||||
|
- Pre-flight system checks
|
||||||
|
- 8-step installation pipeline
|
||||||
|
- Post-installation testing
|
||||||
|
- Command-line argument support
|
||||||
|
|
||||||
|
2. **README.md** (300+ lines)
|
||||||
|
- Quick start guide
|
||||||
|
- System requirements
|
||||||
|
- Usage examples
|
||||||
|
- Critical fixes documentation
|
||||||
|
- Troubleshooting guide
|
||||||
|
|
||||||
|
3. **CLUSTER.md** (400+ lines)
|
||||||
|
- Multi-node setup instructions
|
||||||
|
- Ray cluster configuration
|
||||||
|
- Tensor/pipeline parallelism
|
||||||
|
- Performance tuning
|
||||||
|
- Load balancing examples
|
||||||
|
|
||||||
|
4. **requirements.txt**
|
||||||
|
- Complete dependency list
|
||||||
|
- PyTorch 2.9.0+cu130
|
||||||
|
- All required packages
|
||||||
|
|
||||||
|
### Helper Scripts (scripts/)
|
||||||
|
|
||||||
|
- **vllm-serve.sh** - Start vLLM server with configurable model/port
|
||||||
|
- **vllm-stop.sh** - Gracefully stop server
|
||||||
|
- **vllm-status.sh** - Check server status and logs
|
||||||
|
|
||||||
|
### Examples (examples/)
|
||||||
|
|
||||||
|
- **basic_inference.py** - Simple Python API usage
|
||||||
|
- **api_client.py** - OpenAI-compatible REST API client
|
||||||
|
- **README.md** - Usage instructions and API examples
|
||||||
|
|
||||||
|
### Configuration
|
||||||
|
|
||||||
|
- **.gitignore** - Excludes build artifacts, venvs, logs
|
||||||
|
- **LICENSE** - MIT license
|
||||||
|
|
||||||
|
## Technical Specifications
|
||||||
|
|
||||||
|
### Target Platform
|
||||||
|
- **Hardware:** NVIDIA DGX Spark with GB10 GPU
|
||||||
|
- **Architecture:** Blackwell sm_121 (compute capability 12.1)
|
||||||
|
- **OS:** Ubuntu 22.04+ ARM64
|
||||||
|
- **CUDA:** 13.0+ (driver 580.95.05+)
|
||||||
|
|
||||||
|
### Software Stack
|
||||||
|
- **Python:** 3.12.3
|
||||||
|
- **PyTorch:** 2.9.0+cu130
|
||||||
|
- **Triton:** 3.5.0+git (from main branch)
|
||||||
|
- **vLLM:** 0.11.1rc4+
|
||||||
|
- **Package Manager:** uv (fast Python package installer)
|
||||||
|
|
||||||
|
### Critical Fixes Applied
|
||||||
|
|
||||||
|
1. **CMakeLists.txt (line 671)**
|
||||||
|
- Added `12.0f` to SCALED_MM_ARCHS for SM100 MOE kernels
|
||||||
|
- Enables Blackwell GPU compilation
|
||||||
|
|
||||||
|
2. **pyproject.toml**
|
||||||
|
- Changed `license = "Apache-2.0"` to `license = {text = "Apache-2.0"}`
|
||||||
|
- Removed deprecated `license-files` field
|
||||||
|
- Compatible with setuptools 77.0+
|
||||||
|
|
||||||
|
3. **Triton Build**
|
||||||
|
- Must use main branch (not release 3.5.0)
|
||||||
|
- Non-editable install to avoid setuptools bug
|
||||||
|
- Custom PTXAS path for CUDA integration
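
As a concrete sketch, the non-editable install with the explicit ptxas path follows the manual step shown in README.md:

```bash
cd triton
TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas python -m pip install --no-build-isolation .
```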
|
||||||
|
|
||||||
|
### Environment Variables
|
||||||
|
|
||||||
|
```bash
|
||||||
|
TORCH_CUDA_ARCH_LIST=12.1a # Blackwell architecture
|
||||||
|
VLLM_USE_FLASHINFER_MXFP4_MOE=1 # Enable FlashInfer optimization
|
||||||
|
TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas # CUDA PTX assembler
|
||||||
|
```
|
||||||
|
|
||||||
|
## Installation Overview
|
||||||
|
|
||||||
|
The `install.sh` script performs these steps:
|
||||||
|
|
||||||
|
1. **Pre-flight Checks**
|
||||||
|
- Verify ARM64 architecture
|
||||||
|
- Check NVIDIA GPU (GB10)
|
||||||
|
- Validate CUDA 13.0+
|
||||||
|
- Ensure 50GB+ disk space
|
||||||
|
|
||||||
|
2. **Install uv Package Manager**
|
||||||
|
- Fast Python package installer
|
||||||
|
- Required for efficient dependency resolution
|
||||||
|
|
||||||
|
3. **Create Virtual Environment**
|
||||||
|
- Python 3.12 virtual environment
|
||||||
|
- Isolated from system packages
|
||||||
|
|
||||||
|
4. **Install PyTorch**
|
||||||
|
- PyTorch 2.9.0 with CUDA 13.0 bindings
|
||||||
|
- Verify CUDA availability
|
||||||
|
|
||||||
|
5. **Build Triton**
|
||||||
|
- Clone from GitHub main branch
|
||||||
|
- Build with Blackwell support
|
||||||
|
- Non-editable install
|
||||||
|
|
||||||
|
6. **Install Dependencies**
|
||||||
|
- xgrammar, setuptools-scm
|
||||||
|
- apache-tvm-ffi (prerelease)
|
||||||
|
- Build tools
|
||||||
|
|
||||||
|
7. **Clone and Fix vLLM**
|
||||||
|
- Clone v0.11.1rc3
|
||||||
|
- Apply CMakeLists.txt fix
|
||||||
|
- Apply pyproject.toml fix
|
||||||
|
- Configure use_existing_torch
|
||||||
|
|
||||||
|
8. **Build vLLM**
|
||||||
|
- 15-20 minute compilation
|
||||||
|
- All CUDA kernels for Blackwell
|
||||||
|
- Editable install for development
|
||||||
|
|
||||||
|
9. **Create Helper Scripts**
|
||||||
|
- Environment activation script
|
||||||
|
- Server management scripts
|
||||||
|
- Logging configuration
|
||||||
|
|
||||||
|
10. **Post-Installation Tests**
|
||||||
|
- Import vLLM
|
||||||
|
- Check CUDA availability
|
||||||
|
- Verify GPU detection
|
||||||
|
|
||||||
|
## Quick Start
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# One-command installation
|
||||||
|
curl -fsSL https://raw.githubusercontent.com/eelbaz/dgx-spark-vllm-setup/main/install.sh | bash
|
||||||
|
|
||||||
|
# Or clone and run
|
||||||
|
git clone https://github.com/eelbaz/dgx-spark-vllm-setup.git
|
||||||
|
cd dgx-spark-vllm-setup
|
||||||
|
./install.sh
|
||||||
|
|
||||||
|
# Activate environment (assuming installation in current directory)
|
||||||
|
cd vllm-install
|
||||||
|
source vllm_env.sh
|
||||||
|
|
||||||
|
# Start server
|
||||||
|
./vllm-serve.sh
|
||||||
|
|
||||||
|
# Test API
|
||||||
|
curl http://localhost:8000/v1/models
|
||||||
|
```
|
||||||
|
|
||||||
|
## Repository Structure
|
||||||
|
|
||||||
|
```
|
||||||
|
dgx-spark-vllm-setup/
|
||||||
|
├── README.md # Main documentation
|
||||||
|
├── CLUSTER.md # Multi-node setup guide
|
||||||
|
├── SUMMARY.md # This file
|
||||||
|
├── LICENSE # MIT license
|
||||||
|
├── .gitignore # Git ignore rules
|
||||||
|
├── install.sh # Main installation script
|
||||||
|
├── requirements.txt # Python dependencies
|
||||||
|
├── scripts/
|
||||||
|
│ ├── vllm-serve.sh # Start vLLM server
|
||||||
|
│ ├── vllm-stop.sh # Stop server
|
||||||
|
│ └── vllm-status.sh # Check status
|
||||||
|
└── examples/
|
||||||
|
├── README.md # Examples documentation
|
||||||
|
├── basic_inference.py # Python API example
|
||||||
|
└── api_client.py # REST API example
|
||||||
|
```
|
||||||
|
|
||||||
|
## Known Issues & Workarounds
|
||||||
|
|
||||||
|
### Triton Editable Build Fails
|
||||||
|
**Error:** `TypeError: can only concatenate str (not 'NoneType') to str`
|
||||||
|
**Workaround:** Use non-editable install (`uv pip install --no-build-isolation .`)
|
||||||
|
|
||||||
|
### PyTorch CUDA Capability Warning
|
||||||
|
**Warning:** GPU capability 12.1 vs PyTorch max 12.0
|
||||||
|
**Status:** Harmless - PyTorch 2.9.0+cu130 works correctly with GB10
|
||||||
|
|
||||||
|
### apache-tvm-ffi Prerelease
|
||||||
|
**Error:** `No solution found when resolving dependencies`
|
||||||
|
**Fix:** Use `--prerelease=allow` flag with uv pip install
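
Example of the workaround, with the package pin taken from the manual install steps in README.md:

```bash
uv pip install apache-tvm-ffi==0.1.0b15 --prerelease=allow
```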
|
||||||
|
|
||||||
|
## Testing Status
|
||||||
|
|
||||||
|
- [OK] Single-node installation on spark-alpha.local
|
||||||
|
- [OK] Single-node installation on spark-omega.local
|
||||||
|
- [OK] vLLM server startup and API functionality
|
||||||
|
- [OK] Model inference (Qwen/Qwen2.5-0.5B-Instruct)
|
||||||
|
- [IN PROGRESS] Multi-node cluster mode (documented, not yet tested)
|
||||||
|
|
||||||
|
## Future Enhancements
|
||||||
|
|
||||||
|
- [ ] Add cluster mode testing results
|
||||||
|
- [ ] Include performance benchmarks
|
||||||
|
- [ ] Add Dockerfile for containerized deployment
|
||||||
|
- [ ] Create Ansible playbook for multi-node automation
|
||||||
|
- [ ] Add monitoring and logging setup (Prometheus/Grafana)
|
||||||
|
- [ ] Include model quantization examples (AWQ, GPTQ)
|
||||||
|
|
||||||
|
## Contributing
|
||||||
|
|
||||||
|
Contributions welcome! Please open issues or pull requests on GitHub.
|
||||||
|
|
||||||
|
## Community & Support
|
||||||
|
|
||||||
|
- **GitHub Issues:** Report bugs and feature requests
|
||||||
|
- **NVIDIA Forum:** [DGX Spark vLLM Discussion](https://forums.developer.nvidia.com/t/run-vllm-in-spark/348862)
|
||||||
|
- **vLLM Docs:** [Official Documentation](https://docs.vllm.ai/)
|
||||||
|
|
||||||
|
## License
|
||||||
|
|
||||||
|
MIT License - See LICENSE file for details.
|
||||||
|
|
||||||
|
## Acknowledgments
|
||||||
|
|
||||||
|
Developed and tested on NVIDIA DGX Spark systems. Special thanks to:
|
||||||
|
- vLLM project team
|
||||||
|
- Triton compiler team
|
||||||
|
- NVIDIA DGX Spark community
|
||||||
|
- Claude Code (AI assistant) for documentation automation
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
**Version:** 1.0.0
|
||||||
|
**Last Updated:** 2025-10-26
|
||||||
|
**Tested On:** DGX Spark with GB10, CUDA 13.0, Ubuntu 22.04 ARM64
|
||||||
225
examples/README.md
Normal file
@@ -0,0 +1,225 @@
|
|||||||
|
# vLLM Examples for DGX Spark
|
||||||
|
|
||||||
|
This directory contains example scripts demonstrating various ways to use vLLM on DGX Spark systems.
|
||||||
|
|
||||||
|
## Prerequisites
|
||||||
|
|
||||||
|
Ensure vLLM is installed and the environment is activated:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Assuming vllm-install is in your home directory
|
||||||
|
source ~/vllm-install/vllm_env.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
## Examples
|
||||||
|
|
||||||
|
### 1. Basic Inference (`basic_inference.py`)
|
||||||
|
|
||||||
|
Simple text generation using the vLLM Python API.
|
||||||
|
|
||||||
|
**Usage:**
|
||||||
|
```bash
|
||||||
|
python basic_inference.py
|
||||||
|
```
|
||||||
|
|
||||||
|
**What it demonstrates:**
|
||||||
|
- Loading a model with vLLM
|
||||||
|
- Configuring sampling parameters
|
||||||
|
- Generating multiple completions
|
||||||
|
- Batch processing
|
||||||
|
|
||||||
|
### 2. API Client (`api_client.py`)
|
||||||
|
|
||||||
|
Using vLLM's OpenAI-compatible REST API.
|
||||||
|
|
||||||
|
**Prerequisites:**
|
||||||
|
Start the vLLM server first:
|
||||||
|
```bash
|
||||||
|
cd ~/vllm-install
|
||||||
|
./vllm-serve.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
**Usage:**
|
||||||
|
```bash
|
||||||
|
python api_client.py
|
||||||
|
```
|
||||||
|
|
||||||
|
**What it demonstrates:**
|
||||||
|
- Listing available models
|
||||||
|
- Simple text completion
|
||||||
|
- Chat completion
|
||||||
|
- Streaming responses
|
||||||
|
- HTTP API interaction
|
||||||
|
|
||||||
|
### 3. Batch Processing (`batch_processing.py`)
|
||||||
|
|
||||||
|
Efficient processing of large batches of prompts.
|
||||||
|
|
||||||
|
**Usage:**
|
||||||
|
```bash
|
||||||
|
python batch_processing.py
|
||||||
|
```
|
||||||
|
|
||||||
|
**What it demonstrates:**
|
||||||
|
- High-throughput batch inference
|
||||||
|
- Dynamic batching
|
||||||
|
- Memory-efficient processing
|
||||||
|
- Performance monitoring
|
||||||
|
|
||||||
|
## Customization
|
||||||
|
|
||||||
|
### Change Model
|
||||||
|
|
||||||
|
Edit the model name in any example:
|
||||||
|
|
||||||
|
```python
|
||||||
|
llm = LLM(
|
||||||
|
model="meta-llama/Llama-3.1-8B-Instruct", # Change this
|
||||||
|
trust_remote_code=True,
|
||||||
|
gpu_memory_utilization=0.9
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Adjust Sampling Parameters
|
||||||
|
|
||||||
|
Modify `SamplingParams` for different generation behavior:
|
||||||
|
|
||||||
|
```python
|
||||||
|
sampling_params = SamplingParams(
|
||||||
|
temperature=0.7, # Lower = more deterministic (0.0-1.0)
|
||||||
|
top_p=0.95, # Nucleus sampling threshold
|
||||||
|
max_tokens=100, # Maximum tokens to generate
|
||||||
|
top_k=50, # Top-k sampling
|
||||||
|
repetition_penalty=1.1 # Penalize repetition
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
### GPU Memory Management
|
||||||
|
|
||||||
|
Adjust memory utilization:
|
||||||
|
|
||||||
|
```python
|
||||||
|
llm = LLM(
|
||||||
|
model="...",
|
||||||
|
gpu_memory_utilization=0.9, # Use 90% of GPU memory (0.0-1.0)
|
||||||
|
max_model_len=2048 # Maximum sequence length
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
## API Server Examples
|
||||||
|
|
||||||
|
### cURL Examples
|
||||||
|
|
||||||
|
**List models:**
|
||||||
|
```bash
|
||||||
|
curl http://localhost:8000/v1/models
|
||||||
|
```
|
||||||
|
|
||||||
|
**Simple completion:**
|
||||||
|
```bash
|
||||||
|
curl http://localhost:8000/v1/completions \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{
|
||||||
|
"model": "Qwen/Qwen2.5-0.5B-Instruct",
|
||||||
|
"prompt": "The meaning of life is",
|
||||||
|
"max_tokens": 50,
|
||||||
|
"temperature": 0.7
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
|
||||||
|
**Chat completion:**
|
||||||
|
```bash
|
||||||
|
curl http://localhost:8000/v1/chat/completions \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{
|
||||||
|
"model": "Qwen/Qwen2.5-0.5B-Instruct",
|
||||||
|
"messages": [
|
||||||
|
{"role": "system", "content": "You are a helpful assistant."},
|
||||||
|
{"role": "user", "content": "What is DGX Spark?"}
|
||||||
|
],
|
||||||
|
"max_tokens": 100,
|
||||||
|
"temperature": 0.7
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
|
||||||
|
**Streaming completion:**
|
||||||
|
```bash
|
||||||
|
curl http://localhost:8000/v1/completions \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{
|
||||||
|
"model": "Qwen/Qwen2.5-0.5B-Instruct",
|
||||||
|
"prompt": "Write a story about",
|
||||||
|
"max_tokens": 100,
|
||||||
|
"stream": true
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
|
||||||
|
## Tested Models
|
||||||
|
|
||||||
|
These models work well on DGX Spark GB10:
|
||||||
|
|
||||||
|
- `Qwen/Qwen2.5-0.5B-Instruct` (small, fast)
|
||||||
|
- `Qwen/Qwen2.5-7B-Instruct` (balanced)
|
||||||
|
- `meta-llama/Llama-3.1-8B-Instruct` (high quality)
|
||||||
|
- `meta-llama/Llama-3.1-70B-Instruct` (requires tensor parallelism)
|
||||||
|
|
||||||
|
## Performance Tips
|
||||||
|
|
||||||
|
1. **Use GPU memory efficiently:**
|
||||||
|
- Set `gpu_memory_utilization=0.95` for maximum throughput
|
||||||
|
- Lower for models close to GPU memory limit
|
||||||
|
|
||||||
|
2. **Batch processing:**
|
||||||
|
- Process multiple prompts together
|
||||||
|
- vLLM automatically optimizes batch sizes
|
||||||
|
|
||||||
|
3. **Quantization:**
|
||||||
|
- For larger models, use quantization:
|
||||||
|
```python
|
||||||
|
llm = LLM(model="...", quantization="awq")
|
||||||
|
```
|
||||||
|
|
||||||
|
4. **Tensor parallelism:**
|
||||||
|
- For models > 20GB, use multiple GPUs:
|
||||||
|
```python
|
||||||
|
llm = LLM(model="...", tensor_parallel_size=2)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Troubleshooting
|
||||||
|
|
||||||
|
### Out of Memory
|
||||||
|
|
||||||
|
Reduce `max_model_len` or `gpu_memory_utilization`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
llm = LLM(
|
||||||
|
model="...",
|
||||||
|
gpu_memory_utilization=0.8,
|
||||||
|
max_model_len=2048
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Slow Generation
|
||||||
|
|
||||||
|
Check if model is loaded correctly:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python -c "import vllm; print(vllm.__version__)"
|
||||||
|
nvidia-smi # Check GPU utilization
|
||||||
|
```
|
||||||
|
|
||||||
|
### Connection Refused (API)
|
||||||
|
|
||||||
|
Ensure server is running:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd ~/vllm-install
|
||||||
|
./vllm-status.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
## More Resources
|
||||||
|
|
||||||
|
- [vLLM Documentation](https://docs.vllm.ai/)
|
||||||
|
- [OpenAI API Compatibility](https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html)
|
||||||
|
- [Main README](../README.md)
|
||||||
|
- [Cluster Setup](../CLUSTER.md)
|
||||||
160
examples/api_client.py
Normal file
@@ -0,0 +1,160 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
vLLM OpenAI-Compatible API Client Example
|
||||||
|
Demonstrates using vLLM's OpenAI-compatible API endpoints
|
||||||
|
"""
|
||||||
|
|
||||||
|
import requests
|
||||||
|
import json
|
||||||
|
from typing import Dict, List
|
||||||
|
|
||||||
|
class VLLMClient:
|
||||||
|
"""Simple client for vLLM OpenAI-compatible API"""
|
||||||
|
|
||||||
|
def __init__(self, base_url: str = "http://localhost:8000"):
|
||||||
|
self.base_url = base_url.rstrip('/')
|
||||||
|
|
||||||
|
def list_models(self) -> List[Dict]:
|
||||||
|
"""List available models"""
|
||||||
|
response = requests.get(f"{self.base_url}/v1/models")
|
||||||
|
response.raise_for_status()
|
||||||
|
return response.json()
|
||||||
|
|
||||||
|
def complete(
|
||||||
|
self,
|
||||||
|
prompt: str,
|
||||||
|
model: str = None,
|
||||||
|
max_tokens: int = 100,
|
||||||
|
temperature: float = 0.7,
|
||||||
|
stream: bool = False
|
||||||
|
) -> Dict:
|
||||||
|
"""Generate completion"""
|
||||||
|
|
||||||
|
# Get model name if not specified
|
||||||
|
if model is None:
|
||||||
|
models = self.list_models()
|
||||||
|
model = models['data'][0]['id']
|
||||||
|
|
||||||
|
payload = {
|
||||||
|
"model": model,
|
||||||
|
"prompt": prompt,
|
||||||
|
"max_tokens": max_tokens,
|
||||||
|
"temperature": temperature,
|
||||||
|
"stream": stream
|
||||||
|
}
|
||||||
|
|
||||||
|
response = requests.post(
|
||||||
|
f"{self.base_url}/v1/completions",
|
||||||
|
json=payload,
|
||||||
|
headers={"Content-Type": "application/json"},
|
||||||
|
stream=stream
|
||||||
|
)
|
||||||
|
response.raise_for_status()
|
||||||
|
|
||||||
|
if stream:
|
||||||
|
return response.iter_lines()
|
||||||
|
else:
|
||||||
|
return response.json()
|
||||||
|
|
||||||
|
def chat(
|
||||||
|
self,
|
||||||
|
messages: List[Dict[str, str]],
|
||||||
|
model: str = None,
|
||||||
|
max_tokens: int = 100,
|
||||||
|
temperature: float = 0.7,
|
||||||
|
stream: bool = False
|
||||||
|
) -> Dict:
|
||||||
|
"""Generate chat completion"""
|
||||||
|
|
||||||
|
# Get model name if not specified
|
||||||
|
if model is None:
|
||||||
|
models = self.list_models()
|
||||||
|
model = models['data'][0]['id']
|
||||||
|
|
||||||
|
payload = {
|
||||||
|
"model": model,
|
||||||
|
"messages": messages,
|
||||||
|
"max_tokens": max_tokens,
|
||||||
|
"temperature": temperature,
|
||||||
|
"stream": stream
|
||||||
|
}
|
||||||
|
|
||||||
|
response = requests.post(
|
||||||
|
f"{self.base_url}/v1/chat/completions",
|
||||||
|
json=payload,
|
||||||
|
headers={"Content-Type": "application/json"},
|
||||||
|
stream=stream
|
||||||
|
)
|
||||||
|
response.raise_for_status()
|
||||||
|
|
||||||
|
if stream:
|
||||||
|
return response.iter_lines()
|
||||||
|
else:
|
||||||
|
return response.json()
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
# Initialize client
|
||||||
|
client = VLLMClient("http://localhost:8000")
|
||||||
|
|
||||||
|
print("="*60)
|
||||||
|
print("vLLM API Client Examples")
|
||||||
|
print("="*60)
|
||||||
|
|
||||||
|
# Example 1: List models
|
||||||
|
print("\n1. Listing available models...")
|
||||||
|
models = client.list_models()
|
||||||
|
for model in models['data']:
|
||||||
|
print(f" - {model['id']}")
|
||||||
|
|
||||||
|
# Example 2: Simple completion
|
||||||
|
print("\n2. Simple completion...")
|
||||||
|
result = client.complete(
|
||||||
|
prompt="The capital of France is",
|
||||||
|
max_tokens=10,
|
||||||
|
temperature=0.0
|
||||||
|
)
|
||||||
|
print(f" Prompt: The capital of France is")
|
||||||
|
print(f" Response: {result['choices'][0]['text']}")
|
||||||
|
|
||||||
|
# Example 3: Chat completion
|
||||||
|
print("\n3. Chat completion...")
|
||||||
|
messages = [
|
||||||
|
{"role": "system", "content": "You are a helpful AI assistant."},
|
||||||
|
{"role": "user", "content": "What is the Blackwell GPU architecture?"}
|
||||||
|
]
|
||||||
|
result = client.chat(
|
||||||
|
messages=messages,
|
||||||
|
max_tokens=100,
|
||||||
|
temperature=0.7
|
||||||
|
)
|
||||||
|
print(f" User: {messages[1]['content']}")
|
||||||
|
print(f" Assistant: {result['choices'][0]['message']['content']}")
|
||||||
|
|
||||||
|
# Example 4: Streaming completion
|
||||||
|
print("\n4. Streaming completion...")
|
||||||
|
print(" Prompt: Write a short poem about AI")
|
||||||
|
print(" Response: ", end="", flush=True)
|
||||||
|
|
||||||
|
stream = client.complete(
|
||||||
|
prompt="Write a short poem about AI",
|
||||||
|
max_tokens=50,
|
||||||
|
temperature=0.8,
|
||||||
|
stream=True
|
||||||
|
)
|
||||||
|
|
||||||
|
for line in stream:
|
||||||
|
if line:
|
||||||
|
try:
|
||||||
|
data = json.loads(line.decode('utf-8').removeprefix('data: '))
|
||||||
|
if 'choices' in data and len(data['choices']) > 0:
|
||||||
|
token = data['choices'][0].get('text', '')
|
||||||
|
print(token, end="", flush=True)
|
||||||
|
except (json.JSONDecodeError, AttributeError):
|
||||||
|
pass
|
||||||
|
|
||||||
|
print("\n")
|
||||||
|
print("="*60)
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
48
examples/basic_inference.py
Normal file
@@ -0,0 +1,48 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Basic vLLM Inference Example for DGX Spark
|
||||||
|
Demonstrates simple text generation using the vLLM Python API
|
||||||
|
"""
|
||||||
|
|
||||||
|
from vllm import LLM, SamplingParams
|
||||||
|
|
||||||
|
def main():
|
||||||
|
# Initialize the model
|
||||||
|
# Use a smaller model for testing, replace with your preferred model
|
||||||
|
print("Loading model...")
|
||||||
|
llm = LLM(
|
||||||
|
model="Qwen/Qwen2.5-0.5B-Instruct",
|
||||||
|
trust_remote_code=True,
|
||||||
|
gpu_memory_utilization=0.9,
|
||||||
|
max_model_len=2048
|
||||||
|
)
|
||||||
|
|
||||||
|
# Define prompts
|
||||||
|
prompts = [
|
||||||
|
"What is the NVIDIA DGX Spark?",
|
||||||
|
"Explain the Blackwell GPU architecture in simple terms.",
|
||||||
|
"Write a haiku about artificial intelligence."
|
||||||
|
]
|
||||||
|
|
||||||
|
# Configure sampling parameters
|
||||||
|
sampling_params = SamplingParams(
|
||||||
|
temperature=0.7,
|
||||||
|
top_p=0.95,
|
||||||
|
max_tokens=100,
|
||||||
|
stop=["</s>", "\n\n\n"]
|
||||||
|
)
|
||||||
|
|
||||||
|
# Generate responses
|
||||||
|
print("\nGenerating responses...\n")
|
||||||
|
outputs = llm.generate(prompts, sampling_params)
|
||||||
|
|
||||||
|
# Print results
|
||||||
|
for i, output in enumerate(outputs):
|
||||||
|
print(f"{'='*60}")
|
||||||
|
print(f"Prompt {i+1}: {prompts[i]}")
|
||||||
|
print(f"{'-'*60}")
|
||||||
|
print(f"Response: {output.outputs[0].text}")
|
||||||
|
print(f"{'='*60}\n")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
777
install.sh
Normal file
@@ -0,0 +1,777 @@
|
|||||||
|
#!/bin/bash
################################################################################
# vLLM Installation Script for NVIDIA DGX Spark (Blackwell GB10)
# Version: 1.1.0
# Author: DGX Spark Community
# License: MIT
#
# This script automates the complete installation of vLLM on DGX Spark systems
# with Blackwell GB10 GPUs, including all necessary fixes and optimizations.
#
# Usage: ./install.sh [OPTIONS]
#        Can also be run via: curl -fsSL <url>/install.sh | bash
#
# Options:
#   --install-dir DIR     Installation directory (default: $PWD/vllm-install)
#   --vllm-version HASH   vLLM git commit (default: 66a168a19 - tested with Blackwell)
#   --python-version VER  Python version (default: 3.12)
#   --skip-tests          Skip post-installation tests
#   --help                Show this help message
################################################################################
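# Illustrative invocations (examples added for clarity; flags as defined in
# parse_args below, values are placeholders):
#   ./install.sh
#   ./install.sh --install-dir "$HOME/vllm-install" --skip-tests
#   ./install.sh --python-version 3.12 --vllm-version <commit-hash>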

set -e           # Exit on error
set -o pipefail  # Catch errors in pipes

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

# Default configuration
INSTALL_DIR="$PWD/vllm-install"
VLLM_VERSION="66a168a197ba214a5b70a74fa2e713c9eeb3251a" # vLLM commit with Blackwell fixes
TRITON_VERSION="4caa0328bf8df64896dd5f6fb9df41b0eb2e750a" # Triton commit that works with Blackwell
PYTHON_VERSION="3.12"
SKIP_TESTS=false

# GitHub raw URL for downloading repo assets when run outside the repo
REPO_RAW_URL="https://raw.githubusercontent.com/eelbaz/dgx-spark-vllm-setup/main"

# Script directory (only meaningful when run from a local clone)
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" 2>/dev/null && pwd || echo "")"

################################################################################
# Helper Functions
################################################################################

log_info() {
    echo -e "${BLUE}[INFO]${NC} $1"
}

log_success() {
    echo -e "${GREEN}[SUCCESS]${NC} $1"
}

log_warning() {
    echo -e "${YELLOW}[WARNING]${NC} $1"
}

log_error() {
    echo -e "${RED}[ERROR]${NC} $1"
}

print_header() {
    echo ""
    echo -e "${BLUE}========================================${NC}"
    echo -e "${BLUE}$1${NC}"
    echo -e "${BLUE}========================================${NC}"
    echo ""
}

check_command() {
    if command -v "$1" &> /dev/null; then
        return 0
    else
        return 1
    fi
}

# Auto-confirm when stdin is not a terminal (e.g. curl | bash)
confirm_or_default_yes() {
    local prompt="$1"
    if [ -t 0 ]; then
        read -p "$prompt (y/N) " -n 1 -r
        echo
        [[ $REPLY =~ ^[Yy]$ ]]
    else
        log_info "Non-interactive mode: auto-confirming"
        return 0
    fi
}

################################################################################
# Pre-flight Checks
################################################################################

preflight_checks() {
    print_header "Pre-flight System Checks"

    log_info "Checking system requirements..."

    # Check if running on ARM64
    ARCH=$(uname -m)
    if [[ "$ARCH" != "aarch64" ]] && [[ "$ARCH" != "arm64" ]]; then
        log_warning "This script is designed for ARM64 architecture (DGX Spark)"
        log_warning "Detected architecture: $ARCH"
    fi

    # Check for NVIDIA GPU
    if ! check_command nvidia-smi; then
        log_error "nvidia-smi not found. NVIDIA drivers required."
        exit 1
    fi

    # Check GPU type
    GPU_NAME=$(nvidia-smi --query-gpu=name --format=csv,noheader | head -1)
    log_info "Detected GPU: $GPU_NAME"

    if [[ ! "$GPU_NAME" =~ "GB10" ]]; then
        log_warning "This script is optimized for NVIDIA GB10 (Blackwell)"
        log_warning "Your GPU: $GPU_NAME"
        if ! confirm_or_default_yes "Continue anyway?"; then
            exit 1
        fi
    fi

    # Check CUDA
    if ! check_command nvcc; then
        # Check common CUDA install locations
        if [ -x "/usr/local/cuda/bin/nvcc" ]; then
            export PATH="/usr/local/cuda/bin:$PATH"
            log_info "Found CUDA at /usr/local/cuda, added to PATH"
        else
            log_error "CUDA toolkit not found. Please install CUDA 13.0+"
            exit 1
        fi
    fi

    CUDA_VERSION=$(nvcc --version | grep "release" | awk '{print $6}' | cut -d',' -f1)
    log_info "CUDA version: $CUDA_VERSION"

    # Check for Python development headers (required for Triton build)
    PYTHON_INCLUDE="/usr/include/python${PYTHON_VERSION}/patchlevel.h"
    if [ ! -f "$PYTHON_INCLUDE" ]; then
        log_warning "Python ${PYTHON_VERSION} development headers not found"
        log_info "Installing python${PYTHON_VERSION}-dev (requires sudo)..."
        if sudo apt-get install -y "python${PYTHON_VERSION}-dev"; then
            log_success "python${PYTHON_VERSION}-dev installed"
        else
            log_error "Failed to install python${PYTHON_VERSION}-dev"
            log_error "Please install manually: sudo apt install python${PYTHON_VERSION}-dev"
            exit 1
        fi
    else
        log_info "Python ${PYTHON_VERSION} development headers found"
    fi

    # Check disk space (need ~50GB)
    AVAILABLE_SPACE=$(df -BG "$HOME" | tail -1 | awk '{print $4}' | sed 's/G//')
    if [[ "$AVAILABLE_SPACE" -lt 50 ]]; then
        log_error "Insufficient disk space. Need at least 50GB, have ${AVAILABLE_SPACE}GB"
        exit 1
    fi

    log_success "Pre-flight checks passed!"
}

################################################################################
# Install uv Package Manager
################################################################################

install_uv() {
    print_header "Step 1/8: Installing uv Package Manager"

    if check_command uv; then
        UV_VERSION=$(uv --version | awk '{print $2}')
        log_info "uv already installed: v$UV_VERSION"
    else
        log_info "Installing uv..."
        curl -LsSf https://astral.sh/uv/install.sh | sh
        export PATH="$HOME/.local/bin:$PATH"
        log_success "uv installed successfully"
    fi

    # Verify installation
    if ! check_command uv; then
        log_error "uv installation failed"
        exit 1
    fi
}

################################################################################
# Create Python Virtual Environment
################################################################################

create_venv() {
    print_header "Step 2/8: Creating Python Virtual Environment"

    VENV_DIR="$INSTALL_DIR/.vllm"

    if [ -d "$VENV_DIR" ]; then
        log_warning "Virtual environment already exists at $VENV_DIR"
        if confirm_or_default_yes "Remove and recreate?"; then
            rm -rf "$VENV_DIR"
        else
            log_info "Using existing virtual environment"
            return
        fi
    fi

    log_info "Creating Python $PYTHON_VERSION virtual environment..."
    mkdir -p "$INSTALL_DIR"
    cd "$INSTALL_DIR"
    uv venv .vllm --python "$PYTHON_VERSION"

    # Upgrade setuptools to 77+ so PEP 639 license fields are supported
    # (fixes flashinfer-python build failure)
    log_info "Upgrading setuptools in venv for PEP 639 license support..."
    uv pip install --python "$VENV_DIR/bin/python" --upgrade setuptools

    log_success "Virtual environment created at $VENV_DIR"
}

################################################################################
# Install PyTorch
################################################################################

install_pytorch() {
    print_header "Step 3/8: Installing PyTorch with CUDA 13.0"

    source "$INSTALL_DIR/.vllm/bin/activate"

    log_info "Installing latest PyTorch for cu130..."
    uv pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu130

    # Verify PyTorch installation
    log_info "Verifying PyTorch installation..."
    python -c "import torch; print('PyTorch version:', torch.__version__); print('CUDA available:', torch.cuda.is_available())"

    log_success "PyTorch installed successfully"
}

################################################################################
# Clone and Build Triton
################################################################################

install_triton() {
    print_header "Step 4/8: Installing Triton from Main Branch"

    TRITON_DIR="$INSTALL_DIR/triton"

    if [ -d "$TRITON_DIR" ]; then
        log_info "Triton directory exists, updating..."
        cd "$TRITON_DIR"
        git fetch
    else
        log_info "Cloning Triton repository..."
        cd "$INSTALL_DIR"
        git clone https://github.com/triton-lang/triton.git
        cd triton
    fi

    log_info "Checking out Triton commit $TRITON_VERSION (tested with Blackwell)..."
    git checkout "$TRITON_VERSION"
    git submodule update --init --recursive

    log_info "Installing Triton build dependencies..."
    source "$INSTALL_DIR/.vllm/bin/activate"
    uv pip install pip cmake ninja pybind11

    log_info "Building Triton (this takes ~5 minutes)..."
    export TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas
    export CMAKE_BUILD_PARALLEL_LEVEL=$(nproc)
    python -m pip install --no-build-isolation -v . 2>&1 | tee "$INSTALL_DIR/triton-build.log"

    if [ ${PIPESTATUS[0]} -ne 0 ]; then
        log_error "Triton build failed. See $INSTALL_DIR/triton-build.log for details"
        exit 1
    fi

    # Record the installed triton version so we can protect it later
    TRITON_INSTALLED_VERSION=$(python -c "import triton; print(triton.__version__)" 2>/dev/null || echo "unknown")
    log_info "Triton version installed: $TRITON_INSTALLED_VERSION"

    log_success "Triton installed successfully"
}

################################################################################
# Install Additional Dependencies
################################################################################

install_dependencies() {
    print_header "Step 5/8: Installing Additional Dependencies"

    source "$INSTALL_DIR/.vllm/bin/activate"

    log_info "Installing xgrammar, setuptools-scm, and apache-tvm-ffi..."
    uv pip install xgrammar setuptools-scm apache-tvm-ffi==0.1.0b15 --prerelease=allow

    log_success "Dependencies installed successfully"
}

################################################################################
# Clone vLLM
################################################################################

clone_vllm() {
    print_header "Step 6/8: Cloning vLLM Repository"

    VLLM_DIR="$INSTALL_DIR/vllm"

    if [ -d "$VLLM_DIR" ]; then
        log_warning "vLLM directory already exists at $VLLM_DIR"
        if confirm_or_default_yes "Remove and re-clone?"; then
            rm -rf "$VLLM_DIR"
        else
            log_info "Using existing vLLM directory"
            cd "$VLLM_DIR"
            return
        fi
    fi

    log_info "Cloning vLLM $VLLM_VERSION..."
    cd "$INSTALL_DIR"
    git clone --recursive https://github.com/vllm-project/vllm.git
    cd vllm
    git checkout "$VLLM_VERSION"
    git submodule update --init --recursive

    log_success "vLLM repository cloned"
}

################################################################################
# Apply Critical Fixes
################################################################################

apply_fixes() {
    print_header "Step 7/8: Applying Critical Fixes"

    cd "$INSTALL_DIR/vllm"

    # Fix 1: pyproject.toml license field
    log_info "Fixing pyproject.toml license field..."
    sed -i 's/^license = "Apache-2.0"$/license = {text = "Apache-2.0"}/' pyproject.toml
    sed -i '/^license-files = /d' pyproject.toml
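    # Illustrative intent of the two sed edits above (before -> after; the exact
    # license-files entry in vLLM's pyproject.toml may differ):
    #   license = "Apache-2.0"        ->  license = {text = "Apache-2.0"}
    #   license-files = ["LICENSE"]   ->  (line removed)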

    # Fix 2: CMakeLists.txt SM100/SM120 MOE kernels (check if already applied)
    if grep -q 'cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f"' CMakeLists.txt; then
        log_info "CMakeLists.txt SM100/SM120 fix already applied"
    else
        log_info "Applying CMakeLists.txt SM100/SM120 fix..."
        sed -i 's/cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f"/cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f"/' CMakeLists.txt
        sed -i 's/cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a"/cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;12.1a"/' CMakeLists.txt
    fi

    # Fix 3: flashinfer-python license field (pre-emptive fix)
    log_info "Pre-fixing flashinfer-python license issue..."
    rm -rf "$HOME/.cache/uv/sdists-v9/pypi/flashinfer-python" 2>/dev/null || true

    # Fix 4: GPT-OSS Triton MOE kernels for Qwen3/gpt-oss support
    # Try local repo patches/ first, then download from GitHub
    PATCH_FILE=""
    if [ -f "$SCRIPT_DIR/patches/gpt_oss_triton_moe.patch" ]; then
        PATCH_FILE="$SCRIPT_DIR/patches/gpt_oss_triton_moe.patch"
    else
        log_info "Downloading GPT-OSS Triton MOE patch from repository..."
        PATCH_FILE="$INSTALL_DIR/gpt_oss_triton_moe.patch"
        if curl -fsSL "$REPO_RAW_URL/patches/gpt_oss_triton_moe.patch" -o "$PATCH_FILE" 2>/dev/null; then
            log_info "Patch downloaded successfully"
        else
            PATCH_FILE=""
            log_warning "Could not download GPT-OSS Triton MOE patch (skipping)"
        fi
    fi

    if [ -n "$PATCH_FILE" ] && [ -f "$PATCH_FILE" ]; then
        log_info "Applying GPT-OSS Triton MOE kernel patch for Qwen3/gpt-oss support..."
        if patch --dry-run -p1 < "$PATCH_FILE" > /dev/null 2>&1; then
            patch -p1 < "$PATCH_FILE"
            log_success "GPT-OSS Triton MOE kernel patch applied"
        else
            log_warning "GPT-OSS Triton MOE kernel patch already applied or conflicts"
        fi
    fi

    # Configure use_existing_torch
    log_info "Configuring vLLM to use existing PyTorch..."
    python3 use_existing_torch.py

    log_success "All fixes applied successfully"
}

################################################################################
# Build and Install vLLM
################################################################################

build_vllm() {
    print_header "Step 8/8: Building vLLM (15-20 minutes)"

    cd "$INSTALL_DIR/vllm"
    source "$INSTALL_DIR/.vllm/bin/activate"

    # Set environment variables
    export TORCH_CUDA_ARCH_LIST=12.1a
    export VLLM_USE_FLASHINFER_MXFP4_MOE=1
    export TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas

    # Create a constraints file to prevent uv from replacing our
    # custom-built Triton with a PyPI version
    log_info "Creating constraints to protect pinned Triton build..."
    TRITON_CONSTRAINT="$INSTALL_DIR/constraints.txt"
    TRITON_INSTALLED=$(python -c "import importlib.metadata; print(importlib.metadata.version('triton'))" 2>/dev/null || echo "")
    if [ -n "$TRITON_INSTALLED" ]; then
        echo "triton==${TRITON_INSTALLED}" > "$TRITON_CONSTRAINT"
        log_info "Pinning triton==${TRITON_INSTALLED} during vLLM build"
    else
        echo "" > "$TRITON_CONSTRAINT"
        log_warning "Could not detect installed Triton version"
    fi

    log_info "Starting vLLM build..."
    log_warning "This will take 15-20 minutes. Go grab a coffee!"

    set +e # Don't exit on error, we'll handle it
    UV_CONSTRAINT="$TRITON_CONSTRAINT" uv pip install \
        --no-build-isolation --prerelease=allow -e . \
        2>&1 | tee "$INSTALL_DIR/vllm-build.log"
    BUILD_STATUS=${PIPESTATUS[0]}
    set -e

    if [ $BUILD_STATUS -ne 0 ]; then
        if grep -q "flashinfer.*license.*must be valid" "$INSTALL_DIR/vllm-build.log"; then
            log_warning "Build failed due to flashinfer-python license issue"
            log_info "Upgrading setuptools and retrying..."

            # Ensure setuptools is new enough
            uv pip install --upgrade setuptools

            # Also patch the cached flashinfer pyproject.toml as a belt-and-suspenders fix
            find "$HOME/.cache/uv/sdists-v9/pypi/flashinfer-python" -name "pyproject.toml" 2>/dev/null | while read f; do
                sed -i 's/^license = "Apache-2.0"$/license = {text = "Apache-2.0"}/' "$f"
                sed -i '/^license-files = /d' "$f"
            done

            log_info "Retrying vLLM build..."
            UV_CONSTRAINT="$TRITON_CONSTRAINT" uv pip install \
                --no-build-isolation --prerelease=allow -e .
        else
            log_error "vLLM build failed. See $INSTALL_DIR/vllm-build.log for details"
            exit 1
        fi
    fi

    # Verify Triton wasn't replaced
    TRITON_AFTER=$(python -c "import importlib.metadata; print(importlib.metadata.version('triton'))" 2>/dev/null || echo "unknown")
    if [ -n "$TRITON_INSTALLED" ] && [ "$TRITON_AFTER" != "$TRITON_INSTALLED" ]; then
        log_warning "Triton was changed during vLLM install: $TRITON_INSTALLED -> $TRITON_AFTER"
        log_warning "Rebuilding pinned Triton from source..."
        cd "$INSTALL_DIR/triton"
        git checkout "$TRITON_VERSION"
        export CMAKE_BUILD_PARALLEL_LEVEL=$(nproc)
        python -m pip install --no-build-isolation --force-reinstall -v .
        cd "$INSTALL_DIR/vllm"
    fi

    log_success "vLLM built successfully!"
}

################################################################################
# Create Helper Scripts
################################################################################

create_helper_scripts() {
    print_header "Creating Helper Scripts"

    # Create environment activation script
    log_info "Creating vllm_env.sh..."
    cat > "$INSTALL_DIR/vllm_env.sh" << 'ENVEOF'
#!/bin/bash
# vLLM Environment Configuration for DGX Spark
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "$SCRIPT_DIR/.vllm/bin/activate"
export TORCH_CUDA_ARCH_LIST=12.1a
export VLLM_USE_FLASHINFER_MXFP4_MOE=1
CUDA_PATH=$(ls -d /usr/local/cuda* 2>/dev/null | head -1)
export TRITON_PTXAS_PATH="$CUDA_PATH/bin/ptxas"
export PATH="$CUDA_PATH/bin:$PATH"
export LD_LIBRARY_PATH="$CUDA_PATH/lib64:$LD_LIBRARY_PATH"
# Cache tiktoken encodings to avoid re-downloading
export TIKTOKEN_CACHE_DIR="$SCRIPT_DIR/.tiktoken_cache"
mkdir -p "$TIKTOKEN_CACHE_DIR"
echo "=== vLLM Environment Active ==="
echo "Virtual env: $VIRTUAL_ENV"
echo "CUDA arch: $TORCH_CUDA_ARCH_LIST"
echo "Python: $(which python)"
echo "==============================="
ENVEOF
    chmod +x "$INSTALL_DIR/vllm_env.sh"

    # Create vllm-serve.sh (embedded so it works with curl|bash)
    log_info "Creating vllm-serve.sh..."
    cat > "$INSTALL_DIR/vllm-serve.sh" << 'SERVEEOF'
#!/bin/bash
# vLLM Server Startup Script for DGX Spark
# Usage: ./vllm-serve.sh <model_name> [port]
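# Example (illustrative): ./vllm-serve.sh Qwen/Qwen2.5-0.5B-Instruct 8000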

set -e

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

MODEL="${1:-Qwen/Qwen2.5-0.5B-Instruct}"
PORT="${2:-8000}"
VLLM_DIR="$SCRIPT_DIR/vllm"
ENV_SCRIPT="$SCRIPT_DIR/vllm_env.sh"
PID_FILE="$SCRIPT_DIR/.vllm-server.pid"
LOG_FILE="$SCRIPT_DIR/vllm-server.log"

# Check if server is already running
if [ -f "$PID_FILE" ]; then
    PID=$(cat "$PID_FILE")
    if ps -p $PID > /dev/null 2>&1; then
        echo "ERROR: vLLM server is already running (PID: $PID)"
        echo "Use ./vllm-stop.sh to stop it first"
        exit 1
    fi
fi

# Source environment
source "$ENV_SCRIPT"

echo "----------------------------------------------------------------------"
echo "Starting vLLM Server on DGX Spark"
echo "----------------------------------------------------------------------"
echo "Model: $MODEL"
echo "Port: $PORT"
echo "Log file: $LOG_FILE"
echo "PID file: $PID_FILE"
echo "----------------------------------------------------------------------"

# Start server in background
cd "$VLLM_DIR"
nohup python -m vllm.entrypoints.openai.api_server \
    --model "$MODEL" \
    --trust-remote-code \
    --host 0.0.0.0 \
    --port "$PORT" \
    --gpu-memory-utilization 0.9 \
    > "$LOG_FILE" 2>&1 &

echo $! > "$PID_FILE"
echo "OK: Server started with PID: $(cat $PID_FILE)"
echo "OK: Waiting for server to be ready..."

sleep 5
if ps -p $(cat "$PID_FILE") > /dev/null 2>&1; then
    echo "OK: Server is running!"
    echo ""
    echo "Test with: curl http://localhost:$PORT/v1/models"
    echo "View logs: tail -f $LOG_FILE"
    echo "Stop server: ./vllm-stop.sh"
else
    echo "ERROR: Server failed to start. Check logs: $LOG_FILE"
    rm -f "$PID_FILE"
    exit 1
fi
SERVEEOF
    chmod +x "$INSTALL_DIR/vllm-serve.sh"

    # Create vllm-stop.sh
    log_info "Creating vllm-stop.sh..."
    cat > "$INSTALL_DIR/vllm-stop.sh" << 'STOPEOF'
#!/bin/bash
# vLLM Server Stop Script for DGX Spark

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PID_FILE="$SCRIPT_DIR/.vllm-server.pid"

if [ ! -f "$PID_FILE" ]; then
    echo "No vLLM server PID file found. Server may not be running."
    exit 0
fi

PID=$(cat "$PID_FILE")

if ! ps -p $PID > /dev/null 2>&1; then
    echo "vLLM server (PID: $PID) is not running. Cleaning up PID file."
    rm -f "$PID_FILE"
    exit 0
fi

echo "Stopping vLLM server (PID: $PID)..."
kill $PID

for i in {1..10}; do
    if ! ps -p $PID > /dev/null 2>&1; then
        echo "OK: Server stopped successfully"
        rm -f "$PID_FILE"
        exit 0
    fi
    sleep 1
done

if ps -p $PID > /dev/null 2>&1; then
    echo "Server did not stop gracefully. Force killing..."
    kill -9 $PID
    sleep 1
    if ! ps -p $PID > /dev/null 2>&1; then
        echo "OK: Server force stopped"
        rm -f "$PID_FILE"
    else
        echo "ERROR: Failed to stop server"
        exit 1
    fi
fi
STOPEOF
    chmod +x "$INSTALL_DIR/vllm-stop.sh"

    # Create vllm-status.sh
    log_info "Creating vllm-status.sh..."
    cat > "$INSTALL_DIR/vllm-status.sh" << 'STATUSEOF'
#!/bin/bash
# vLLM Server Status Script for DGX Spark

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PID_FILE="$SCRIPT_DIR/.vllm-server.pid"
LOG_FILE="$SCRIPT_DIR/vllm-server.log"

echo "----------------------------------------------------------------------"
echo "vLLM Server Status on DGX Spark"
echo "----------------------------------------------------------------------"

if [ ! -f "$PID_FILE" ]; then
    echo "Status: NOT RUNNING (no PID file found)"
    exit 0
fi

PID=$(cat "$PID_FILE")

if ! ps -p $PID > /dev/null 2>&1; then
    echo "Status: NOT RUNNING (stale PID file)"
    echo "Cleaning up PID file..."
    rm -f "$PID_FILE"
    exit 0
fi

echo "Status: RUNNING"
echo "PID: $PID"
echo "Started: $(ps -p $PID -o lstart= 2>/dev/null || echo 'Unknown')"
echo "CPU: $(ps -p $PID -o %cpu= 2>/dev/null || echo 'N/A')%"
echo "Memory: $(ps -p $PID -o %mem= 2>/dev/null || echo 'N/A')%"
echo ""

if [ -f "$LOG_FILE" ]; then
    echo "Recent log entries (last 10 lines):"
    echo "----------------------------------------------------------------------"
    tail -n 10 "$LOG_FILE"
else
    echo "Log file not found: $LOG_FILE"
fi

echo ""
echo "----------------------------------------------------------------------"
STATUSEOF
    chmod +x "$INSTALL_DIR/vllm-status.sh"

    log_success "Helper scripts created in $INSTALL_DIR"
}

################################################################################
# Post-Installation Tests
################################################################################

run_tests() {
    if [ "$SKIP_TESTS" = true ]; then
        log_info "Skipping post-installation tests"
        return
    fi

    print_header "Post-Installation Tests"

    source "$INSTALL_DIR/vllm_env.sh"

    log_info "Test 1: Import vLLM..."
    python -c "import vllm; print('vLLM version:', vllm.__version__)"

    log_info "Test 2: Check CUDA availability..."
    python -c "import torch; assert torch.cuda.is_available(), 'CUDA not available'; print('CUDA available')"

    log_info "Test 3: Check GPU detection..."
    python -c "import torch; print('GPU count:', torch.cuda.device_count()); print('GPU name:', torch.cuda.get_device_name(0))"

    log_success "All tests passed!"
}

################################################################################
# Parse Command Line Arguments
################################################################################

parse_args() {
    while [[ $# -gt 0 ]]; do
        case $1 in
            --install-dir)
                INSTALL_DIR="$2"
                shift 2
                ;;
            --vllm-version)
                VLLM_VERSION="$2"
                shift 2
                ;;
            --python-version)
                PYTHON_VERSION="$2"
                shift 2
                ;;
            --skip-tests)
                SKIP_TESTS=true
                shift
                ;;
            --help)
                head -20 "$0" | grep "^#" | sed 's/^# //'
                exit 0
                ;;
            *)
                log_error "Unknown option: $1"
                log_info "Use --help for usage information"
                exit 1
                ;;
        esac
    done
}

################################################################################
# Main Installation Flow
################################################################################

main() {
    parse_args "$@"

    print_header "vLLM Installation for DGX Spark (Blackwell GB10)"
    log_info "Installation directory: $INSTALL_DIR"
    log_info "vLLM version: $VLLM_VERSION"
    log_info "Python version: $PYTHON_VERSION"
    echo ""

    preflight_checks
    install_uv
    create_venv
    install_pytorch
    install_triton
    install_dependencies
    clone_vllm
    apply_fixes
    build_vllm
    create_helper_scripts
    run_tests

    print_header "Installation Complete!"
    echo ""
    log_success "vLLM has been successfully installed!"
    echo ""
    echo -e "${GREEN}Next steps:${NC}"
    echo "1. Activate the environment:"
    echo " ${BLUE}source $INSTALL_DIR/vllm_env.sh${NC}"
    echo ""
    echo "2. Start vLLM server:"
    echo " ${BLUE}cd $INSTALL_DIR${NC}"
    echo " ${BLUE}./vllm-serve.sh${NC}"
    echo ""
    echo "3. Test the API:"
    echo " ${BLUE}curl http://localhost:8000/v1/models${NC}"
    echo ""
    echo "For more information, see README.md"
    echo ""
}

# Run main function
main "$@"
77
patches/gpt_oss_triton_moe.patch
Normal file
@@ -0,0 +1,77 @@
diff --git a/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py b/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py
index badedfc54..e05c0eea4 100644
--- a/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py
+++ b/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py
@@ -20,9 +20,16 @@ logger = init_logger(__name__)
 if has_triton_kernels():
     try:
         import triton_kernels.swiglu
-        from triton_kernels.matmul_ogs import FnSpecs, FusedActivation, matmul_ogs
-        from triton_kernels.routing import RoutingData, routing, routing_from_bitmatrix
-        from triton_kernels.tensor import Bitmatrix
+        from triton_kernels.matmul_ogs import (
+            FnSpecs,
+            FusedActivation,
+            GatherIndx,
+            RoutingData,
+            ScatterIndx,
+            matmul_ogs,
+        )
+        from triton_kernels.tensor import BIT, Bitmatrix, SparseMatrix, make_ragged_tensor_metadata
+        from triton_kernels.topk import topk as triton_topk
     except (AttributeError, ImportError) as e:
         logger.error(
             "Failed to import Triton kernels. Please make sure your triton "
@@ -84,8 +91,17 @@ def triton_kernel_moe_forward(
     global_num_experts: int = -1,
     expert_map: torch.Tensor | None = None,
 ) -> torch.Tensor:
-    routing_data, gather_idx, scatter_idx = routing(
-        gating_output, topk, sm_first=not renormalize
+    # Use new topk API instead of deprecated routing
+    sm_first = not renormalize
+    if sm_first:
+        gating_output = torch.softmax(gating_output, dim=-1)
+    sparse_logits = triton_topk(
+        gating_output, topk, apply_softmax=not sm_first, y_indx=None, n_rows=None
+    )
+
+    # Convert to routing data using the existing make_routing_data function
+    routing_data, gather_idx, scatter_idx = make_routing_data(
+        sparse_logits.indx, sparse_logits.vals, gating_output.shape[-1]
     )

     return triton_kernel_fused_experts(
@@ -202,14 +218,29 @@ def make_routing_data(
     bitmatrix_shape = [n_rows, bm_cols * 32]
     bitmatrix_shape_max = [n_rows, None]
     bitmatrix = Bitmatrix(
-        bitmatrix, shape=bitmatrix_shape, shape_max=bitmatrix_shape_max, scratchpad=None
+        bitmatrix, dtype=BIT, shape=bitmatrix_shape, shape_max=bitmatrix_shape_max
     )

     # matmul_ogs expects invalid topk_weights to be -1s
     topk_weights = torch.where(topk_ids == -1, -1.0, topk_weights)
-    routing_data, gather_indx, scatter_indx = routing_from_bitmatrix(
-        bitmatrix, topk_weights, topk_ids, num_local_experts, num_topk
+
+    # Use new SparseMatrix API instead of deprecated routing_from_bitmatrix
+    sparse_logits = SparseMatrix(indx=topk_ids, vals=topk_weights, mask=bitmatrix)
+    dispatch_indx = sparse_logits.mask_metadata.col_sorted_indx
+    combine_indx = sparse_logits.mask_metadata.row_sorted_indx
+    ragged_batch_metadata = make_ragged_tensor_metadata(
+        sparse_logits.mask_metadata.col_sum, dispatch_indx.shape[0]
+    )
+    gate_scal = sparse_logits.vals.flatten()[combine_indx]
+    routing_data = RoutingData(
+        gate_scal,
+        ragged_batch_metadata.block_sizes,
+        num_local_experts,
+        num_topk,
+        ragged_batch_metadata,
+    )
+    gather_indx = GatherIndx(combine_indx, dispatch_indx)
+    scatter_indx = ScatterIndx(dispatch_indx, combine_indx)

     return routing_data, gather_indx, scatter_indx

28
requirements.txt
Normal file
@@ -0,0 +1,28 @@
# Core Dependencies for vLLM on DGX Spark (Blackwell GB10)
# Note: This file is for reference only. The install.sh script handles
# all dependency installation with proper ordering and build flags.

# PyTorch with CUDA 13.0 support (installs latest available on cu130 index)
--index-url https://download.pytorch.org/whl/cu130
torch
torchvision
torchaudio

# Triton (must be built from source - see install.sh)
# Pinned to commit 4caa0328bf8df64896dd5f6fb9df41b0eb2e750a
# triton @ git+https://github.com/triton-lang/triton.git@4caa0328

# vLLM dependencies
xgrammar>=0.1.26
setuptools-scm>=9.2.2
setuptools>=77.0.0 # Required for PEP 639 license field support
apache-tvm-ffi==0.1.0b15 # Pre-release required

# Build dependencies
pybind11>=3.0.0
ninja>=1.13.0

# Optional but recommended
flashinfer-python>=0.4.1
transformers>=4.57.0
huggingface-hub>=0.36.0
67
scripts/vllm-serve.sh
Normal file
@@ -0,0 +1,67 @@
#!/bin/bash
# vLLM Server Startup Script for DGX Spark
# Usage: ./vllm-serve.sh <model_name> [port]
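# Example (illustrative): ./vllm-serve.sh Qwen/Qwen2.5-0.5B-Instruct 8000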

set -e

# Determine installation directory (where this script is located)
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Configuration
MODEL="${1:-Qwen/Qwen2.5-0.5B-Instruct}"
PORT="${2:-8000}"
VLLM_DIR="$SCRIPT_DIR/vllm"
ENV_SCRIPT="$SCRIPT_DIR/vllm_env.sh"
PID_FILE="$SCRIPT_DIR/.vllm-server.pid"
LOG_FILE="$SCRIPT_DIR/vllm-server.log"

# Check if server is already running
if [ -f "$PID_FILE" ]; then
    PID=$(cat "$PID_FILE")
    if ps -p $PID > /dev/null 2>&1; then
        echo "ERROR: vLLM server is already running (PID: $PID)"
        echo "Use ./vllm-stop.sh to stop it first"
        exit 1
    fi
fi

# Source environment
source "$ENV_SCRIPT"

echo "----------------------------------------------------------------------"
echo "Starting vLLM Server on DGX Spark"
echo "----------------------------------------------------------------------"
echo "Model: $MODEL"
echo "Port: $PORT"
echo "Log file: $LOG_FILE"
echo "PID file: $PID_FILE"
echo "----------------------------------------------------------------------"

# Start server in background
cd "$VLLM_DIR"
nohup python -m vllm.entrypoints.openai.api_server \
    --model "$MODEL" \
    --trust-remote-code \
    --host 0.0.0.0 \
    --port "$PORT" \
    --gpu-memory-utilization 0.9 \
    > "$LOG_FILE" 2>&1 &

# Save PID
echo $! > "$PID_FILE"
echo "OK: Server started with PID: $(cat $PID_FILE)"
echo "OK: Waiting for server to be ready..."

# Wait for server to be ready
sleep 5
if ps -p $(cat "$PID_FILE") > /dev/null 2>&1; then
    echo "OK: Server is running!"
    echo ""
    echo "Test with: curl http://localhost:$PORT/v1/models"
    echo "View logs: tail -f $LOG_FILE"
    echo "Stop server: ./vllm-stop.sh"
else
    echo "ERROR: Server failed to start. Check logs: $LOG_FILE"
    rm -f "$PID_FILE"
    exit 1
fi
45
scripts/vllm-status.sh
Normal file
@@ -0,0 +1,45 @@
#!/bin/bash
# vLLM Server Status Script for DGX Spark

# Determine installation directory (where this script is located)
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

PID_FILE="$SCRIPT_DIR/.vllm-server.pid"
LOG_FILE="$SCRIPT_DIR/vllm-server.log"

echo "----------------------------------------------------------------------"
echo "vLLM Server Status on DGX Spark"
echo "----------------------------------------------------------------------"

if [ ! -f "$PID_FILE" ]; then
    echo "Status: NOT RUNNING (no PID file found)"
    exit 0
fi

PID=$(cat "$PID_FILE")

if ! ps -p $PID > /dev/null 2>&1; then
    echo "Status: NOT RUNNING (stale PID file)"
    echo "Cleaning up PID file..."
    rm -f "$PID_FILE"
    exit 0
fi

echo "Status: RUNNING"
echo "PID: $PID"
echo "Started: $(ps -p $PID -o lstart= 2>/dev/null || echo 'Unknown')"
echo "CPU: $(ps -p $PID -o %cpu= 2>/dev/null || echo 'N/A')%"
echo "Memory: $(ps -p $PID -o %mem= 2>/dev/null || echo 'N/A')%"
echo ""

# Check if log file exists and show last few lines
if [ -f "$LOG_FILE" ]; then
    echo "Recent log entries (last 10 lines):"
    echo "----------------------------------------------------------------------"
    tail -n 10 "$LOG_FILE"
else
    echo "Log file not found: $LOG_FILE"
fi

echo ""
echo "----------------------------------------------------------------------"
47
scripts/vllm-stop.sh
Normal file
@@ -0,0 +1,47 @@
#!/bin/bash
# vLLM Server Stop Script for DGX Spark

# Determine installation directory (where this script is located)
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

PID_FILE="$SCRIPT_DIR/.vllm-server.pid"

if [ ! -f "$PID_FILE" ]; then
    echo "No vLLM server PID file found. Server may not be running."
    exit 0
fi

PID=$(cat "$PID_FILE")

if ! ps -p $PID > /dev/null 2>&1; then
    echo "vLLM server (PID: $PID) is not running. Cleaning up PID file."
    rm -f "$PID_FILE"
    exit 0
fi

echo "Stopping vLLM server (PID: $PID)..."
kill $PID

# Wait for process to terminate
for i in {1..10}; do
    if ! ps -p $PID > /dev/null 2>&1; then
        echo "OK: Server stopped successfully"
        rm -f "$PID_FILE"
        exit 0
    fi
    sleep 1
done

# Force kill if still running
if ps -p $PID > /dev/null 2>&1; then
    echo "Server did not stop gracefully. Force killing..."
    kill -9 $PID
    sleep 1
    if ! ps -p $PID > /dev/null 2>&1; then
        echo "OK: Server force stopped"
        rm -f "$PID_FILE"
    else
        echo "ERROR: Failed to stop server"
        exit 1
    fi
fi