import nodes
import node_helpers
import torch
import comfy.model_management
import comfy.utils
import comfy.latent_formats


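# Image-to-video conditioning for Wan 2.1: encodes an optional start image into a
# concat latent plus a concat mask marking which latent frames come from that image,
# and returns an empty latent of the requested size with the updated conditioning.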
class WanImageToVideo:
    @classmethod
    def INPUT_TYPES(s):
        return {"required": {"positive": ("CONDITIONING", ),
                             "negative": ("CONDITIONING", ),
                             "vae": ("VAE", ),
                             "width": ("INT", {"default": 832, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
                             "height": ("INT", {"default": 480, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
                             "length": ("INT", {"default": 81, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 4}),
                             "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}),
                             },
                "optional": {"clip_vision_output": ("CLIP_VISION_OUTPUT", ),
                             "start_image": ("IMAGE", ),
                             }}

    RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "LATENT")
    RETURN_NAMES = ("positive", "negative", "latent")
    FUNCTION = "encode"

    CATEGORY = "conditioning/video_models"

    def encode(self, positive, negative, vae, width, height, length, batch_size, start_image=None, clip_vision_output=None):
        # Empty output latent: 16 channels, 4x temporal and 8x spatial compression.
        latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device())
        if start_image is not None:
            # Resize the start image to the target size and pad the remaining frames with mid gray.
            start_image = comfy.utils.common_upscale(start_image[:length].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)
            image = torch.ones((length, height, width, start_image.shape[-1]), device=start_image.device, dtype=start_image.dtype) * 0.5
            image[:start_image.shape[0]] = start_image

            # Zero the mask for the latent frames covered by the start image.
            concat_latent_image = vae.encode(image[:, :, :, :3])
            mask = torch.ones((1, 1, latent.shape[2], concat_latent_image.shape[-2], concat_latent_image.shape[-1]), device=start_image.device, dtype=start_image.dtype)
            mask[:, :, :((start_image.shape[0] - 1) // 4) + 1] = 0.0

            positive = node_helpers.conditioning_set_values(positive, {"concat_latent_image": concat_latent_image, "concat_mask": mask})
            negative = node_helpers.conditioning_set_values(negative, {"concat_latent_image": concat_latent_image, "concat_mask": mask})

        if clip_vision_output is not None:
            # Attach the CLIP vision embedding to both conditionings.
            positive = node_helpers.conditioning_set_values(positive, {"clip_vision_output": clip_vision_output})
            negative = node_helpers.conditioning_set_values(negative, {"clip_vision_output": clip_vision_output})

        out_latent = {}
        out_latent["samples"] = latent
        return (positive, negative, out_latent)


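# Conditioning node for the Wan 2.1 Fun control models: stacks a control-video
# latent and a start-image latent into a 32-channel concat latent.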
class WanFunControlToVideo:
    @classmethod
    def INPUT_TYPES(s):
        return {"required": {"positive": ("CONDITIONING", ),
                             "negative": ("CONDITIONING", ),
                             "vae": ("VAE", ),
                             "width": ("INT", {"default": 832, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
                             "height": ("INT", {"default": 480, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
                             "length": ("INT", {"default": 81, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 4}),
                             "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}),
                             },
                "optional": {"clip_vision_output": ("CLIP_VISION_OUTPUT", ),
                             "start_image": ("IMAGE", ),
                             "control_video": ("IMAGE", ),
                             }}

    RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "LATENT")
    RETURN_NAMES = ("positive", "negative", "latent")
    FUNCTION = "encode"

    CATEGORY = "conditioning/video_models"

    def encode(self, positive, negative, vae, width, height, length, batch_size, start_image=None, clip_vision_output=None, control_video=None):
        latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device())
        # 32-channel concat latent: the first 16 channels hold the control video, the last 16 the start image.
        concat_latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device())
        concat_latent = comfy.latent_formats.Wan21().process_out(concat_latent)
        concat_latent = concat_latent.repeat(1, 2, 1, 1, 1)

        if start_image is not None:
            start_image = comfy.utils.common_upscale(start_image[:length].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)
            concat_latent_image = vae.encode(start_image[:, :, :, :3])
            concat_latent[:, 16:, :concat_latent_image.shape[2]] = concat_latent_image[:, :, :concat_latent.shape[2]]

        if control_video is not None:
            control_video = comfy.utils.common_upscale(control_video[:length].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)
            concat_latent_image = vae.encode(control_video[:, :, :, :3])
            concat_latent[:, :16, :concat_latent_image.shape[2]] = concat_latent_image[:, :, :concat_latent.shape[2]]

        positive = node_helpers.conditioning_set_values(positive, {"concat_latent_image": concat_latent})
        negative = node_helpers.conditioning_set_values(negative, {"concat_latent_image": concat_latent})

        if clip_vision_output is not None:
            positive = node_helpers.conditioning_set_values(positive, {"clip_vision_output": clip_vision_output})
            negative = node_helpers.conditioning_set_values(negative, {"clip_vision_output": clip_vision_output})

        out_latent = {}
        out_latent["samples"] = latent
        return (positive, negative, out_latent)


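# Conditioning node for the Wan 2.1 Fun inpaint models: paints the start and end
# images onto a mid-gray clip and zeroes the mask at the frames they supply.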
class WanFunInpaintToVideo:
    @classmethod
    def INPUT_TYPES(s):
        return {"required": {"positive": ("CONDITIONING", ),
                             "negative": ("CONDITIONING", ),
                             "vae": ("VAE", ),
                             "width": ("INT", {"default": 832, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
                             "height": ("INT", {"default": 480, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
                             "length": ("INT", {"default": 81, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 4}),
                             "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}),
                             },
                "optional": {"clip_vision_output": ("CLIP_VISION_OUTPUT", ),
                             "start_image": ("IMAGE", ),
                             "end_image": ("IMAGE", ),
                             }}

    RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "LATENT")
    RETURN_NAMES = ("positive", "negative", "latent")
    FUNCTION = "encode"

    CATEGORY = "conditioning/video_models"

    def encode(self, positive, negative, vae, width, height, length, batch_size, start_image=None, end_image=None, clip_vision_output=None):
        latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device())
        if start_image is not None:
            start_image = comfy.utils.common_upscale(start_image[:length].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)
        if end_image is not None:
            end_image = comfy.utils.common_upscale(end_image[-length:].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)

        # Mid-gray canvas for the full clip and a per-frame mask (4 frames per latent frame).
        image = torch.ones((length, height, width, 3)) * 0.5
        mask = torch.ones((1, 1, latent.shape[2] * 4, latent.shape[-2], latent.shape[-1]))

        if start_image is not None:
            image[:start_image.shape[0]] = start_image
            mask[:, :, :start_image.shape[0] + 3] = 0.0

        if end_image is not None:
            image[-end_image.shape[0]:] = end_image
            mask[:, :, -end_image.shape[0]:] = 0.0

        concat_latent_image = vae.encode(image[:, :, :, :3])
        # Group the per-frame mask into blocks of 4 to match the temporally compressed latent.
        mask = mask.view(1, mask.shape[2] // 4, 4, mask.shape[3], mask.shape[4]).transpose(1, 2)
        positive = node_helpers.conditioning_set_values(positive, {"concat_latent_image": concat_latent_image, "concat_mask": mask})
        negative = node_helpers.conditioning_set_values(negative, {"concat_latent_image": concat_latent_image, "concat_mask": mask})

        if clip_vision_output is not None:
            positive = node_helpers.conditioning_set_values(positive, {"clip_vision_output": clip_vision_output})
            negative = node_helpers.conditioning_set_values(negative, {"clip_vision_output": clip_vision_output})

        out_latent = {}
        out_latent["samples"] = latent
        return (positive, negative, out_latent)


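# Registry of node classes exposed to ComfyUI.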
NODE_CLASS_MAPPINGS = {
    "WanImageToVideo": WanImageToVideo,
    "WanFunControlToVideo": WanFunControlToVideo,
    "WanFunInpaintToVideo": WanFunInpaintToVideo,
}