From 31939a9710f17f9f1d278b0e8ca78d8949c5cd59 Mon Sep 17 00:00:00 2001
From: Evan Carmen
Date: Wed, 20 May 2026 19:30:53 -0500
Subject: [PATCH] fix: revert intermediate_device to cpu for unified memory

intermediate_device() controls where large output tensors (decoded video
frames) are accumulated. On unified memory, cpu and cuda:0 share the same
physical RAM, but the CUDA allocator has different fragmentation behavior.

With intermediate_device=cuda:0, LTX video VAE decode hung because
tiled_scale_multidim allocates the full output tensor on cuda:0 upfront,
and the CUDA allocator can't efficiently reclaim space during tiled
decode. Reverting to cpu fixes the hang.

vae_offload_device() and text_encoder_offload_device() remain cuda:0
since those model-loading paths benefit from GPU allocation.
---
 patches/model_management.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/patches/model_management.py b/patches/model_management.py
index ec9edb5..43b95be 100644
--- a/patches/model_management.py
+++ b/patches/model_management.py
@@ -1106,7 +1106,7 @@ def text_encoder_dtype(device=None):
 
 
 def intermediate_device():
-    if args.gpu_only or UNIFIED_MEMORY:
+    if args.gpu_only:
         return get_torch_device()
     else:
         return torch.device("cpu")