From 7d9f51753bca47dd1d0d80da48f7764214cb0042 Mon Sep 17 00:00:00 2001 From: kijai <40791699+kijai@users.noreply.github.com> Date: Tue, 18 Feb 2025 17:02:11 +0200 Subject: [PATCH] VAE scaling --- comfy/model_base.py | 2 +- comfy/model_detection.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/comfy/model_base.py b/comfy/model_base.py index 185aa825..601c2349 100644 --- a/comfy/model_base.py +++ b/comfy/model_base.py @@ -879,7 +879,7 @@ class HunyuanVideo(BaseModel): else: padding_shape = (noise.shape[0], 16, noise.shape[2] - 1, noise.shape[3], noise.shape[4]) latent_padding = torch.zeros(padding_shape, device=noise.device, dtype=noise.dtype) - image_latents = torch.cat([image.to(noise), latent_padding], dim=2) + image_latents = torch.cat([image.to(noise), latent_padding], dim=2) * 0.476986 process_image_in = lambda image: image out['c_concat'] = comfy.conds.CONDNoiseShape(process_image_in(image_latents)) diff --git a/comfy/model_detection.py b/comfy/model_detection.py index 038c1ee7..5051f821 100644 --- a/comfy/model_detection.py +++ b/comfy/model_detection.py @@ -136,7 +136,7 @@ def detect_unet_config(state_dict, key_prefix): if '{}txt_in.individual_token_refiner.blocks.0.norm1.weight'.format(key_prefix) in state_dict_keys: #Hunyuan Video dit_config = {} dit_config["image_model"] = "hunyuan_video" - dit_config["in_channels"] = state_dict["img_in.proj.weight"].shape[1] #SkyReels img2video 32 has input channels + dit_config["in_channels"] = state_dict["img_in.proj.weight"].shape[1] #SkyReels img2video has 32 input channels dit_config["patch_size"] = [1, 2, 2] dit_config["out_channels"] = 16 dit_config["vec_in_dim"] = 768