Files
SparkyUI/docker-compose.yml
T
TBNilles 0b606721dd feat(model-manager): run container as host UID/GID
Downloads previously landed in models/ owned by root because the
container ran as root. Add `user: "${PUID:-1000}:${PGID:-1000}"` to the
model-manager service and PUID/PGID to .env.example so downloaded models
are owned by the host user. Defaults to 1000:1000.

Note: existing root-owned files under models/ and sparkyui-data/ must be
chowned once (e.g. via a one-off root container) when upgrading.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-07 14:55:57 -04:00

177 lines
5.7 KiB
YAML

services:
  # ComfyUI server: built from the Dockerfile in this directory, given access
  # to all NVIDIA GPUs, and published on the host.
  comfyui:
    build:
      context: .
      dockerfile: Dockerfile
      args:
        # Pin ComfyUI to a known-good commit/tag if desired
        COMFYUI_REF: "${COMFYUI_REF:-master}"
        # SageAttention ref (e.g., "main", "v2.2.0", or specific commit)
        SAGEATTN_REF: "${SAGEATTN_REF:-main}"
    image: sparkyui:cu130
    container_name: comfyui
    # GPU enablement
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    # LAN exposure: the host port is configurable, the container port is
    # fixed at 8188 (same port as the default COMFYUI_FLAGS and the
    # healthcheck below).
    ports:
      - "${COMFYUI_PORT:-8188}:8188"
    environment:
      # NOTE(review): this passes the *host* port value into the container.
      # If the image's entrypoint uses COMFYUI_PORT to pick the listen port,
      # a non-default value would no longer match the fixed 8188 above -
      # confirm against the Dockerfile/entrypoint.
      COMFYUI_PORT: "${COMFYUI_PORT:-8188}"
      # Optimized for Grace-Blackwell unified memory architecture
      # Key insight: DON'T use --gpu-only - let the unified memory fabric work naturally
      COMFYUI_FLAGS: "${COMFYUI_FLAGS:---listen 0.0.0.0 --port 8188 --disable-pinned-memory --dont-upcast-attention}"
      NVIDIA_VISIBLE_DEVICES: "all"
      NVIDIA_DRIVER_CAPABILITIES: "compute,utility"
      # Disable torch.compile/inductor - Triton doesn't support Blackwell sm_121a yet
      TORCH_COMPILE_DISABLE: "1"
      TORCHDYNAMO_DISABLE: "1"
      # Grace-Blackwell unified memory - removed aggressive CUDA tuning (5/21):
      #   CUDA_CACHE_DISABLE, CUDA_DEVICE_MAX_CONNECTIONS, CUDA_DEVICE_MAX_COPY_CONNECTIONS,
      #   CUDA_MODULE_LOADING=EAGER, CUDA_MANAGED_FORCE_DEVICE_ALLOC, OMP_NUM_THREADS
      # These were over-tuning. The ComfyUI flags + Sparky patch handle the architecture.
      # Keeping only CUBLAS_WORKSPACE_CONFIG for determinism.
      CUBLAS_WORKSPACE_CONFIG: ":0:0"
      # CUDA kernel caching - PTX->SASS compilation cache for GB10 (sm_121)
      # First run compiles kernels, subsequent runs reuse from disk. 3x speedup reported.
      # 4GB cache covers all typical ComfyUI kernel variants.
      # Quoted so the value stays a string (4294967296 bytes = 4 GiB).
      CUDA_CACHE_MAXSIZE: "4294967296"
    volumes:
      # Models from existing ComfyUI install (read-only).
      # Defaults to the project root; the model-manager service writes here.
      - "${COMFYUI_HOST_PATH:-.}/models:/opt/ComfyUI/models:ro"
      # Custom nodes - comment out to use container-only (fresh) custom_nodes
      # If mounted, ComfyUI-Manager installs persist across container restarts
      - "${SPARKYUI_DATA_PATH:-.}/custom_nodes:/opt/ComfyUI/custom_nodes"
      # Outputs/inputs/workflows - persistent across restarts
      - "${SPARKYUI_DATA_PATH:-.}/output:/opt/ComfyUI/output"
      - "${SPARKYUI_DATA_PATH:-.}/input:/opt/ComfyUI/input"
      - "${SPARKYUI_DATA_PATH:-.}/workflows:/opt/ComfyUI/workflows"
      # Wheel cache (optional - for prebuilt wheels)
      - "${SPARKYUI_DATA_PATH:-.}/wheels:/opt/wheels"
      # Sparky patches - Grace-Blackwell unified memory optimizations
      #   model_management.py: HIGH_VRAM->NORMAL_VRAM, intermediate_device()->cuda,
      #     soft_empty_cache skip, 95% vram_for_weights, UNIFIED_MEMORY detection,
      #     offload devices -> cuda
      #   utils.py: copy=False on tensor.to(device) - avoids double-allocation on
      #     unified memory where CPU and GPU share the same physical RAM
      #     (ComfyUI issue #10896)
      # Mounted read-only over the files shipped in the image.
      - "./patches/model_management.py:/opt/ComfyUI/comfy/model_management.py:ro"
      - "./patches/utils.py:/opt/ComfyUI/comfy/utils.py:ro"
    networks:
      - sparky_net
    # Health check - ComfyUI takes time to load, so generous start period
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8188/"]
      interval: 30s
      timeout: 10s
      start_period: 120s
      retries: 3
    restart: unless-stopped
# ---- comfyuimini service (entry of the top-level "services" mapping) ----
  # ComfyUIMini - Mobile-friendly UI
  # Access at http://<host>:3000
  comfyuimini:
    build:
      context: ./comfyuimini
      dockerfile: Dockerfile
      args:
        COMFYUIMINI_REF: "${COMFYUIMINI_REF:-main}"
    image: comfyuimini:latest
    container_name: comfyuimini
    # Host port is configurable; the container side is fixed at 3000, the
    # same value as "app_port" in NODE_CONFIG below.
    ports:
      - "${COMFYUIMINI_PORT:-3000}:3000"
    environment:
      # node-config override - connects to comfyui container via docker network.
      # The value is a JSON document; the body must stay indented deeper than
      # the NODE_CONFIG key so it remains part of the block scalar.
      NODE_CONFIG: >-
        {
          "app_port": 3000,
          "comfyui_url": "http://comfyui:8188",
          "comfyui_ws_url": "ws://comfyui:8188",
          "output_dir": "/shared/output",
          "reject_unauthorised_cert": false
        }
    volumes:
      # Share output directory with ComfyUI for gallery feature (read-only)
      - "${SPARKYUI_DATA_PATH:-.}/output:/shared/output:ro"
      # Persist server-side workflows (named volume declared under the
      # top-level "volumes" key)
      - comfyuimini_workflows:/app/workflows
    networks:
      - sparky_net
    # Start only after the comfyui service's healthcheck reports healthy.
    depends_on:
      comfyui:
        condition: service_healthy
    restart: unless-stopped
# ---- model-manager service (entry of the top-level "services" mapping) ----
  # Model Manager - StabilityMatrix-style model download/management UI
  # Access at http://<host>:8189
  model-manager:
    build:
      context: ./model-manager
      dockerfile: Dockerfile
    image: sparkyui-model-manager:latest
    container_name: model-manager
    # Run as the host user so downloaded models are owned by you, not root.
    # Defaults to 1000:1000; override via PUID/PGID in .env if needed.
    # Upgrade note: files under models/ and sparkyui-data/ that were created
    # while the container still ran as root must be chowned once (e.g. via a
    # one-off root container).
    user: "${PUID:-1000}:${PGID:-1000}"
    ports:
      - "${MODEL_MANAGER_PORT:-8189}:8189"
    environment:
      # Container-side paths; they match the two bind-mount targets below.
      MODELS_DIR: /models
      DATA_DIR: /data
    volumes:
      # Shared models dir - read-WRITE here so downloads land on the host.
      # ComfyUI mounts the same host folder read-only and picks up new files.
      - "${COMFYUI_HOST_PATH:-.}/models:/models"
      # Persistent SQLite DB (sources, API keys, download history)
      - "${SPARKYUI_DATA_PATH:-.}/sparkyui-data:/data"
    networks:
      - sparky_net
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8189/api/model-types"]
      interval: 30s
      timeout: 10s
      start_period: 15s
      retries: 3
    restart: unless-stopped
# Bridge network that all three services attach to; services address each
# other by service name on it (e.g. http://comfyui:8188 in NODE_CONFIG).
networks:
  sparky_net:
    driver: bridge
# Named volumes. comfyuimini_workflows is mounted at /app/workflows in the
# comfyuimini service; the empty mapping means no volume options are set.
volumes:
  comfyuimini_workflows: {}