# Mirror of https://github.com/comfyanonymous/ComfyUI.git
# Synced 2025-06-05 02:52:09 +08:00 (309 lines, 16 KiB, Python)
# Modules used by the Wan video conditioning nodes below.
# The original import order is kept unchanged.
import nodes
import node_helpers
import torch
import comfy.model_management
import comfy.utils
import comfy.latent_formats
import comfy.clip_vision
class WanImageToVideo:
    """Node that builds conditioning and an empty latent for image-to-video.

    An all-zero video latent is always returned. When ``start_image`` is
    given, its frames are VAE-encoded and attached, together with a mask, to
    both conditionings.
    """

    @classmethod
    def INPUT_TYPES(s):
        """Describe the required and optional inputs of the node."""
        required = {
            "positive": ("CONDITIONING", ),
            "negative": ("CONDITIONING", ),
            "vae": ("VAE", ),
            "width": ("INT", {"default": 832, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
            "height": ("INT", {"default": 480, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
            "length": ("INT", {"default": 81, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 4}),
            "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}),
        }
        optional = {
            "clip_vision_output": ("CLIP_VISION_OUTPUT", ),
            "start_image": ("IMAGE", ),
        }
        return {"required": required, "optional": optional}

    RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "LATENT")
    RETURN_NAMES = ("positive", "negative", "latent")
    FUNCTION = "encode"

    CATEGORY = "conditioning/video_models"

    def encode(self, positive, negative, vae, width, height, length, batch_size, start_image=None, clip_vision_output=None):
        """Return ``(positive, negative, latent)`` for an image-to-video run.

        The latent is a zero tensor shaped
        ``[batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8]``.
        ``start_image`` is indexed as (frames, height, width, channels); only
        its first ``length`` frames and first three channels are encoded.
        """
        latent_frames = ((length - 1) // 4) + 1
        latent = torch.zeros(
            [batch_size, 16, latent_frames, height // 8, width // 8],
            device=comfy.model_management.intermediate_device(),
        )

        if start_image is not None:
            # The resize helper is fed channels at dim 1; restore channels-last after.
            resized = comfy.utils.common_upscale(start_image[:length].movedim(-1, 1), width, height, "bilinear", "center")
            start_image = resized.movedim(1, -1)
            given_frames = start_image.shape[0]

            # Frames that were not supplied stay at the constant value 0.5.
            frames = torch.ones((length, height, width, start_image.shape[-1]), device=start_image.device, dtype=start_image.dtype) * 0.5
            frames[:given_frames] = start_image

            concat_latent_image = vae.encode(frames[:, :, :, :3])

            # Mask is 0 for latent frames covered by the supplied images, 1 elsewhere.
            mask = torch.ones(
                (1, 1, latent.shape[2], concat_latent_image.shape[-2], concat_latent_image.shape[-1]),
                device=start_image.device,
                dtype=start_image.dtype,
            )
            mask[:, :, :((given_frames - 1) // 4) + 1] = 0.0

            concat_values = {"concat_latent_image": concat_latent_image, "concat_mask": mask}
            positive = node_helpers.conditioning_set_values(positive, concat_values)
            negative = node_helpers.conditioning_set_values(negative, concat_values)

        if clip_vision_output is not None:
            vision_values = {"clip_vision_output": clip_vision_output}
            positive = node_helpers.conditioning_set_values(positive, vision_values)
            negative = node_helpers.conditioning_set_values(negative, vision_values)

        return (positive, negative, {"samples": latent})
class WanFunControlToVideo:
    """Node that builds conditioning for Wan "Fun Control" video sampling.

    A 32-channel ``concat_latent_image`` is attached to both conditionings:
    channels 0-15 receive the encoded control video and channels 16-31 the
    encoded start image, when those inputs are provided.
    """

    @classmethod
    def INPUT_TYPES(s):
        """Describe the required and optional inputs of the node."""
        required = {
            "positive": ("CONDITIONING", ),
            "negative": ("CONDITIONING", ),
            "vae": ("VAE", ),
            "width": ("INT", {"default": 832, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
            "height": ("INT", {"default": 480, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
            "length": ("INT", {"default": 81, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 4}),
            "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}),
        }
        optional = {
            "clip_vision_output": ("CLIP_VISION_OUTPUT", ),
            "start_image": ("IMAGE", ),
            "control_video": ("IMAGE", ),
        }
        return {"required": required, "optional": optional}

    RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "LATENT")
    RETURN_NAMES = ("positive", "negative", "latent")
    FUNCTION = "encode"

    CATEGORY = "conditioning/video_models"

    def encode(self, positive, negative, vae, width, height, length, batch_size, start_image=None, clip_vision_output=None, control_video=None):
        """Return ``(positive, negative, latent)`` with control conditioning.

        The returned latent is all zeros with 16 channels. The concat latent
        starts as a zero latent passed through ``Wan21().process_out`` and
        duplicated along the channel axis before encoded inputs are written
        into it.
        """
        device = comfy.model_management.intermediate_device()
        latent_shape = [batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8]
        latent = torch.zeros(latent_shape, device=device)

        concat_latent = comfy.latent_formats.Wan21().process_out(torch.zeros(latent_shape, device=device))
        concat_latent = concat_latent.repeat(1, 2, 1, 1, 1)

        # Start image first, control video second (same order as the VAE
        # encodes were originally issued); each goes into its own channel half.
        sources = (
            (start_image, slice(16, None)),
            (control_video, slice(None, 16)),
        )
        for frames, channels in sources:
            if frames is None:
                continue
            frames = comfy.utils.common_upscale(frames[:length].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)
            encoded = vae.encode(frames[:, :, :, :3])
            # Both sides are clipped along dim 2 to the other's length.
            concat_latent[:, channels, :encoded.shape[2]] = encoded[:, :, :concat_latent.shape[2]]

        concat_values = {"concat_latent_image": concat_latent}
        positive = node_helpers.conditioning_set_values(positive, concat_values)
        negative = node_helpers.conditioning_set_values(negative, concat_values)

        if clip_vision_output is not None:
            vision_values = {"clip_vision_output": clip_vision_output}
            positive = node_helpers.conditioning_set_values(positive, vision_values)
            negative = node_helpers.conditioning_set_values(negative, vision_values)

        return (positive, negative, {"samples": latent})
class WanFirstLastFrameToVideo:
    """Node that conditions Wan video sampling on first and/or last frames.

    Start frames are written to the beginning and end frames to the end of a
    0.5-filled frame buffer. The buffer is VAE-encoded and attached, with a
    mask marking the supplied frames, to both conditionings.
    """

    @classmethod
    def INPUT_TYPES(s):
        """Describe the required and optional inputs of the node."""
        return {"required": {"positive": ("CONDITIONING", ),
                             "negative": ("CONDITIONING", ),
                             "vae": ("VAE", ),
                             "width": ("INT", {"default": 832, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
                             "height": ("INT", {"default": 480, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
                             "length": ("INT", {"default": 81, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 4}),
                             "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}),
                             },
                "optional": {"clip_vision_start_image": ("CLIP_VISION_OUTPUT", ),
                             "clip_vision_end_image": ("CLIP_VISION_OUTPUT", ),
                             "start_image": ("IMAGE", ),
                             "end_image": ("IMAGE", ),
                             }}

    RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "LATENT")
    RETURN_NAMES = ("positive", "negative", "latent")
    FUNCTION = "encode"

    CATEGORY = "conditioning/video_models"

    def encode(self, positive, negative, vae, width, height, length, batch_size, start_image=None, end_image=None, clip_vision_start_image=None, clip_vision_end_image=None):
        """Return ``(positive, negative, latent)`` for first/last-frame video.

        Args:
            positive, negative: conditionings that receive the new values.
            vae: object whose ``encode`` turns the frame buffer into a latent.
            width, height, length: output size in pixels and frame count.
            batch_size: batch dimension of the returned zero latent.
            start_image, end_image: optional frames indexed as
                (frames, height, width, channels); the first ``length`` start
                frames and the last ``length`` end frames are used.
            clip_vision_start_image, clip_vision_end_image: optional CLIP
                vision outputs. If both are given, their
                ``penultimate_hidden_states`` are concatenated along dim -2.

        Returns:
            Tuple of the updated positive conditioning, the updated negative
            conditioning and ``{"samples": zero_latent}``.
        """
        latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device())
        if start_image is not None:
            start_image = comfy.utils.common_upscale(start_image[:length].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)
        if end_image is not None:
            end_image = comfy.utils.common_upscale(end_image[-length:].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)

        # Frames that are not supplied stay at 0.5; mask value 1 marks them.
        image = torch.ones((length, height, width, 3)) * 0.5
        mask = torch.ones((1, 1, latent.shape[2] * 4, latent.shape[-2], latent.shape[-1]))

        if start_image is not None:
            # Keep only the first three channels: the buffer is 3-channel, so
            # an image with an extra (e.g. alpha) channel would otherwise fail
            # to assign.
            image[:start_image.shape[0]] = start_image[:, :, :, :3]
            # NOTE(review): the +3 presumably accounts for the first frame
            # occupying a full group of four mask slots - confirm.
            mask[:, :, :start_image.shape[0] + 3] = 0.0

        if end_image is not None:
            image[-end_image.shape[0]:] = end_image[:, :, :, :3]
            mask[:, :, -end_image.shape[0]:] = 0.0

        concat_latent_image = vae.encode(image[:, :, :, :3])
        # Fold the time axis into groups of four: (1, 1, T, H, W) -> (1, 4, T // 4, H, W).
        mask = mask.view(1, mask.shape[2] // 4, 4, mask.shape[3], mask.shape[4]).transpose(1, 2)
        positive = node_helpers.conditioning_set_values(positive, {"concat_latent_image": concat_latent_image, "concat_mask": mask})
        negative = node_helpers.conditioning_set_values(negative, {"concat_latent_image": concat_latent_image, "concat_mask": mask})

        # Always bind the name (possibly to None). Previously it was only
        # assigned when a start embedding was given, so the checks below
        # raised UnboundLocalError whenever it was omitted.
        clip_vision_output = clip_vision_start_image

        if clip_vision_end_image is not None:
            if clip_vision_output is not None:
                states = torch.cat([clip_vision_output.penultimate_hidden_states, clip_vision_end_image.penultimate_hidden_states], dim=-2)
                clip_vision_output = comfy.clip_vision.Output()
                clip_vision_output.penultimate_hidden_states = states
            else:
                clip_vision_output = clip_vision_end_image

        if clip_vision_output is not None:
            positive = node_helpers.conditioning_set_values(positive, {"clip_vision_output": clip_vision_output})
            negative = node_helpers.conditioning_set_values(negative, {"clip_vision_output": clip_vision_output})

        out_latent = {}
        out_latent["samples"] = latent
        return (positive, negative, out_latent)
class WanFunInpaintToVideo:
    """Node for Wan "Fun Inpaint" video; delegates to WanFirstLastFrameToVideo.

    The single optional ``clip_vision_output`` is forwarded as the start-image
    CLIP vision output of the first/last-frame node.
    """

    @classmethod
    def INPUT_TYPES(s):
        """Describe the required and optional inputs of the node."""
        required = {
            "positive": ("CONDITIONING", ),
            "negative": ("CONDITIONING", ),
            "vae": ("VAE", ),
            "width": ("INT", {"default": 832, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
            "height": ("INT", {"default": 480, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
            "length": ("INT", {"default": 81, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 4}),
            "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}),
        }
        optional = {
            "clip_vision_output": ("CLIP_VISION_OUTPUT", ),
            "start_image": ("IMAGE", ),
            "end_image": ("IMAGE", ),
        }
        return {"required": required, "optional": optional}

    RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "LATENT")
    RETURN_NAMES = ("positive", "negative", "latent")
    FUNCTION = "encode"

    CATEGORY = "conditioning/video_models"

    def encode(self, positive, negative, vae, width, height, length, batch_size, start_image=None, end_image=None, clip_vision_output=None):
        """Return whatever ``WanFirstLastFrameToVideo.encode`` returns for these inputs."""
        return WanFirstLastFrameToVideo().encode(
            positive,
            negative,
            vae,
            width,
            height,
            length,
            batch_size,
            start_image=start_image,
            end_image=end_image,
            clip_vision_start_image=clip_vision_output,
        )
class WanVaceToVideo:
    """Node that builds VACE conditioning (frames, mask, strength) for Wan video.

    Besides the two conditionings and a zero latent it returns
    ``trim_latent``: the number of latent frames added in front for the
    reference image (0 when no reference image is given).
    """

    @classmethod
    def INPUT_TYPES(s):
        """Describe the required and optional inputs of the node."""
        required = {
            "positive": ("CONDITIONING", ),
            "negative": ("CONDITIONING", ),
            "vae": ("VAE", ),
            "width": ("INT", {"default": 832, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
            "height": ("INT", {"default": 480, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
            "length": ("INT", {"default": 81, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 4}),
            "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}),
            "strength": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1000.0, "step": 0.01}),
        }
        optional = {
            "control_video": ("IMAGE", ),
            "control_masks": ("MASK", ),
            "reference_image": ("IMAGE", ),
        }
        return {"required": required, "optional": optional}

    RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "LATENT", "INT")
    RETURN_NAMES = ("positive", "negative", "latent", "trim_latent")
    FUNCTION = "encode"

    CATEGORY = "conditioning/video_models"

    EXPERIMENTAL = True

    def encode(self, positive, negative, vae, width, height, length, batch_size, strength, control_video=None, control_masks=None, reference_image=None):
        """Return ``(positive, negative, latent, trim_latent)`` for a VACE run.

        The control video (0.5-filled when absent) is split by the mask into
        an "inactive" part (mask == 0) and a "reactive" part (mask == 1).
        Both are VAE-encoded and concatenated along the channel axis. An
        encoded reference image, if given, is prepended along dim 2.
        """
        latent_frames = ((length - 1) // 4) + 1

        if control_video is None:
            control_video = torch.ones((length, height, width, 3)) * 0.5
        else:
            control_video = comfy.utils.common_upscale(control_video[:length].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)
            missing_frames = length - control_video.shape[0]
            if missing_frames > 0:
                # Pad only the frame axis (last pair of the pad tuple) with 0.5.
                control_video = torch.nn.functional.pad(control_video, (0, 0, 0, 0, 0, 0, 0, missing_frames), value=0.5)

        reference_latent = None
        if reference_image is not None:
            resized_ref = comfy.utils.common_upscale(reference_image[:1].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)
            encoded_ref = vae.encode(resized_ref[:, :, :, :3])
            # Second channel half: a zero latent passed through process_out.
            ref_filler = comfy.latent_formats.Wan21().process_out(torch.zeros_like(encoded_ref))
            reference_latent = torch.cat([encoded_ref, ref_filler], dim=1)

        if control_masks is not None:
            mask = control_masks
            if mask.ndim == 3:
                mask = mask.unsqueeze(1)
            mask = comfy.utils.common_upscale(mask[:length], width, height, "bilinear", "center").movedim(1, -1)
            if mask.shape[0] < length:
                # Frames without a mask are padded with 1.0.
                mask = torch.nn.functional.pad(mask, (0, 0, 0, 0, 0, 0, 0, length - mask.shape[0]), value=1.0)
        else:
            mask = torch.ones((length, height, width, 1))

        # Split around the 0.5 midpoint so the masked-out side ends up at 0.5.
        centered = control_video - 0.5
        inactive_pixels = (centered * (1 - mask)) + 0.5
        reactive_pixels = (centered * mask) + 0.5

        inactive = vae.encode(inactive_pixels[:, :, :, :3])
        reactive = vae.encode(reactive_pixels[:, :, :, :3])
        control_video_latent = torch.cat((inactive, reactive), dim=1)
        if reference_latent is not None:
            control_video_latent = torch.cat((reference_latent, control_video_latent), dim=2)

        # Fold each 8x8 pixel patch of the mask into 64 channels, then resample
        # the frame axis to the latent frame count.
        vae_stride = 8
        height_mask = height // vae_stride
        width_mask = width // vae_stride
        mask = mask.view(length, height_mask, vae_stride, width_mask, vae_stride)
        mask = mask.permute(2, 4, 0, 1, 3)
        mask = mask.reshape(vae_stride * vae_stride, length, height_mask, width_mask)
        mask = torch.nn.functional.interpolate(
            mask.unsqueeze(0),
            size=(latent_frames, height_mask, width_mask),
            mode='nearest-exact',
        ).squeeze(0)

        trim_latent = 0
        if reference_latent is not None:
            ref_frames = reference_latent.shape[2]
            # Zero mask entries are prepended for the reference frames.
            ref_mask = torch.zeros_like(mask[:, :ref_frames, :, :])
            mask = torch.cat((ref_mask, mask), dim=1)
            latent_frames += ref_frames
            trim_latent = ref_frames

        mask = mask.unsqueeze(0)
        vace_values = {"vace_frames": control_video_latent, "vace_mask": mask, "vace_strength": strength}
        positive = node_helpers.conditioning_set_values(positive, vace_values)
        negative = node_helpers.conditioning_set_values(negative, vace_values)

        latent = torch.zeros(
            [batch_size, 16, latent_frames, height // 8, width // 8],
            device=comfy.model_management.intermediate_device(),
        )
        return (positive, negative, {"samples": latent}, trim_latent)
class TrimVideoLatent:
    """Node that drops leading entries along dim 2 of a latent's samples."""

    @classmethod
    def INPUT_TYPES(s):
        """Describe the inputs: the latent and how many entries to trim."""
        required = {
            "samples": ("LATENT",),
            "trim_amount": ("INT", {"default": 0, "min": 0, "max": 99999}),
        }
        return {"required": required}

    RETURN_TYPES = ("LATENT",)
    FUNCTION = "op"

    CATEGORY = "latent/video"

    EXPERIMENTAL = True

    def op(self, samples, trim_amount):
        """Return a one-tuple with a shallow copy of ``samples`` whose
        ``"samples"`` entry has its first ``trim_amount`` indices of dim 2
        removed. The input dict is left untouched.
        """
        trimmed = samples.copy()
        trimmed["samples"] = samples["samples"][:, :, trim_amount:]
        return (trimmed,)
# Maps each node identifier string to the class implementing it.
NODE_CLASS_MAPPINGS = {
    "WanImageToVideo": WanImageToVideo,
    "WanFunControlToVideo": WanFunControlToVideo,
    "WanFunInpaintToVideo": WanFunInpaintToVideo,
    "WanFirstLastFrameToVideo": WanFirstLastFrameToVideo,
    "WanVaceToVideo": WanVaceToVideo,
    "TrimVideoLatent": TrimVideoLatent,
}