# Mirror of https://github.com/comfyanonymous/ComfyUI.git (synced 2025-03-15)
import nodes
import node_helpers
import torch
import comfy.model_management
import comfy.utils  # needed below for comfy.utils.common_upscale in HunyuanImageToVideo

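# These nodes cover two Hunyuan model families: CLIPTextEncodeHunyuanDiT feeds the
# dual text encoders of HunyuanDiT, while the remaining nodes build empty latents
# and image-to-video conditioning for HunyuanVideo.
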
class CLIPTextEncodeHunyuanDiT:
    @classmethod
    def INPUT_TYPES(s):
        return {"required": {
            "clip": ("CLIP", ),
            "bert": ("STRING", {"multiline": True, "dynamicPrompts": True}),
            "mt5xl": ("STRING", {"multiline": True, "dynamicPrompts": True}),
            }}

    RETURN_TYPES = ("CONDITIONING",)
    FUNCTION = "encode"

    CATEGORY = "advanced/conditioning"

    def encode(self, clip, bert, mt5xl):
        tokens = clip.tokenize(bert)
        tokens["mt5xl"] = clip.tokenize(mt5xl)["mt5xl"]

        return (clip.encode_from_tokens_scheduled(tokens), )

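# A minimal usage sketch for CLIPTextEncodeHunyuanDiT (illustrative only; it assumes
# `clip` is a ComfyUI CLIP object loaded with both HunyuanDiT text encoders, so that
# clip.tokenize() also produces an "mt5xl" token stream alongside the BERT one):
#
#   node = CLIPTextEncodeHunyuanDiT()
#   (conditioning,) = node.encode(clip, bert="a watercolor fox", mt5xl="a watercolor fox")
#   # `conditioning` then feeds the positive/negative input of a sampler node.
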
class EmptyHunyuanLatentVideo:
    @classmethod
    def INPUT_TYPES(s):
        return {"required": {"width": ("INT", {"default": 848, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
                             "height": ("INT", {"default": 480, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
                             "length": ("INT", {"default": 25, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 4}),
                             "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096})}}

    RETURN_TYPES = ("LATENT",)
    FUNCTION = "generate"

    CATEGORY = "latent/video"

    def generate(self, width, height, length, batch_size=1):
        latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device())
        return ({"samples": latent}, )

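# Shape note: the latent above has 16 channels with roughly 4x temporal and 8x spatial
# compression. With the defaults (848x480, length 25, batch 1) that is
#   [1, 16, (25 - 1) // 4 + 1, 480 // 8, 848 // 8] == [1, 16, 7, 60, 106].
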
PROMPT_TEMPLATE_ENCODE_VIDEO_I2V = (
    "<|start_header_id|>system<|end_header_id|>\n\n<image>\nDescribe the video by detailing the following aspects according to the reference image: "
    "1. The main content and theme of the video."
    "2. The color, shape, size, texture, quantity, text, and spatial relationships of the objects."
    "3. Actions, events, behaviors temporal relationships, physical movement changes of the objects."
    "4. background environment, light, style and atmosphere."
    "5. camera angles, movements, and transitions used in the video:<|eot_id|>\n\n"
    "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|>"
    "<|start_header_id|>assistant<|end_header_id|>\n\n"
)

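# The template above follows the Llama-3 chat format. The "{}" placeholder appears to be
# where clip.tokenize() substitutes the user prompt when a llama_template is supplied,
# and the "<image>" marker presumably indicates where the projected CLIP-vision
# embeddings (the image_embeds argument below) are spliced into the token stream.
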
class TextEncodeHunyuanVideo_ImageToVideo:
    @classmethod
    def INPUT_TYPES(s):
        return {"required": {
            "clip": ("CLIP", ),
            "clip_vision_output": ("CLIP_VISION_OUTPUT", ),
            "prompt": ("STRING", {"multiline": True, "dynamicPrompts": True}),
            }}

    RETURN_TYPES = ("CONDITIONING",)
    FUNCTION = "encode"

    CATEGORY = "advanced/conditioning"

    def encode(self, clip, clip_vision_output, prompt):
        tokens = clip.tokenize(prompt, llama_template=PROMPT_TEMPLATE_ENCODE_VIDEO_I2V, image_embeds=clip_vision_output.mm_projected)
        return (clip.encode_from_tokens_scheduled(tokens), )

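# Rough wiring sketch for an image-to-video graph (illustrative, not an API guarantee;
# loader and sampler choices depend on the checkpoint being used):
#
#   clip_vision_output = CLIPVisionEncode(clip_vision, start_frame)  ->  CLIP_VISION_OUTPUT
#   positive = TextEncodeHunyuanVideo_ImageToVideo().encode(clip, clip_vision_output, prompt)[0]
#   positive, latent = HunyuanImageToVideo().encode(positive, vae, 848, 480, 53, 1, start_image=start_frame)
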
class HunyuanImageToVideo:
    @classmethod
    def INPUT_TYPES(s):
        return {"required": {"positive": ("CONDITIONING", ),
                             "vae": ("VAE", ),
                             "width": ("INT", {"default": 848, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
                             "height": ("INT", {"default": 480, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
                             "length": ("INT", {"default": 53, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 4}),
                             "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}),
                             },
                "optional": {"start_image": ("IMAGE", ),
                             }}

    RETURN_TYPES = ("CONDITIONING", "LATENT")
    RETURN_NAMES = ("positive", "latent")
    FUNCTION = "encode"

    CATEGORY = "conditioning/video_models"

    def encode(self, positive, vae, width, height, length, batch_size, start_image=None):
        latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device())
        if start_image is not None:
            start_image = comfy.utils.common_upscale(start_image[:length, :, :, :3].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)

            concat_latent_image = vae.encode(start_image)
            mask = torch.ones((1, 1, latent.shape[2], concat_latent_image.shape[-2], concat_latent_image.shape[-1]), device=start_image.device, dtype=start_image.dtype)
            mask[:, :, :((start_image.shape[0] - 1) // 4) + 1] = 0.0

            positive = node_helpers.conditioning_set_values(positive, {"concat_latent_image": concat_latent_image, "concat_mask": mask})

        out_latent = {}
        out_latent["samples"] = latent
        return (positive, out_latent)

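# Note on the mask above: it has one entry per latent frame. The first
# ((start_image.shape[0] - 1) // 4) + 1 frames (those covered by the encoded start
# image) are set to 0.0 and the rest stay 1.0; the downstream model presumably treats
# 0 as "content supplied via concat_latent_image" and 1 as "to be generated".
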
NODE_CLASS_MAPPINGS = {
    "CLIPTextEncodeHunyuanDiT": CLIPTextEncodeHunyuanDiT,
    "TextEncodeHunyuanVideo_ImageToVideo": TextEncodeHunyuanVideo_ImageToVideo,
    "EmptyHunyuanLatentVideo": EmptyHunyuanLatentVideo,
    "HunyuanImageToVideo": HunyuanImageToVideo,
}
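
# ComfyUI discovers these nodes by importing the module and reading NODE_CLASS_MAPPINGS;
# each key is the node's type identifier as it appears in saved workflow JSON.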