Merge 1ffed0f41cd733237f409afaba0a9ca40c8b50b5 into aee2908d0395577a6e2e13d1307aaf271424108b

Remove useless log. (#8166 )
Make ImagePadForOutpaint return a 3 channel mask. (#8157 )
2025-06-02 01:22:11 +08:00 · 2025-05-17 23:11:03 -04:00 · 2025-05-17 06:27:34 -04:00 · 2025-05-16 15:15:55 -04:00 · 2025-05-16 10:45:36 -07:00 · 2025-05-15 19:02:19 -04:00
15 changed files with 733 additions and 44 deletions
--- a/README.md
+++ b/README.md
@ -110,7 +110,6 @@ ComfyUI follows a weekly release cycle every Friday, with three interconnected r

 2. **[ComfyUI Desktop](https://github.com/Comfy-Org/desktop)**
   - Builds a new release using the latest stable core version
-   - Version numbers match the core release (e.g., Desktop v1.7.0 uses Core v1.7.0)

 3. **[ComfyUI Frontend](https://github.com/Comfy-Org/ComfyUI_frontend)**
   - Weekly frontend updates are merged into the core repository
@ -302,7 +301,7 @@ For AMD 7600 and maybe other RDNA3 cards: ```HSA_OVERRIDE_GFX_VERSION=11.0.0 pyt

 ### AMD ROCm Tips

-You can enable experimental memory efficient attention on pytorch 2.5 in ComfyUI on RDNA3 and potentially other AMD GPUs using this command:
+You can enable experimental memory-efficient attention on recent PyTorch versions in ComfyUI on some AMD GPUs using the command below; it should already be enabled by default on RDNA3. If this improves speed for you on the latest PyTorch on your GPU, please report it so that I can enable it by default.

 ```TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1 python main.py --use-pytorch-cross-attention```

--- a/comfy/ldm/wan/model.py
+++ b/comfy/ldm/wan/model.py
@ -247,6 +247,60 @@ class VaceWanAttentionBlock(WanAttentionBlock):
        return c_skip, c


+class WanCamAdapter(nn.Module):
+    def __init__(self, in_dim, out_dim, kernel_size, stride, num_residual_blocks=1, operation_settings={}):
+        super(WanCamAdapter, self).__init__()
+
+        # Pixel Unshuffle: reduce spatial dimensions by a factor of 8
+        self.pixel_unshuffle = nn.PixelUnshuffle(downscale_factor=8)
+
+        # Convolution: reduce spatial dimensions by a factor
+        #  of 2 (without overlap)
+        self.conv = operation_settings.get("operations").Conv2d(in_dim * 64, out_dim, kernel_size=kernel_size, stride=stride, padding=0, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
+
+        # Residual blocks for feature extraction
+        self.residual_blocks = nn.Sequential(
+            *[WanCamResidualBlock(out_dim, operation_settings = operation_settings) for _ in range(num_residual_blocks)]
+        )
+
+    def forward(self, x):
+        # Reshape to merge the frame dimension into batch
+        bs, c, f, h, w = x.size()
+        x = x.permute(0, 2, 1, 3, 4).contiguous().view(bs * f, c, h, w)
+
+        # Pixel Unshuffle operation
+        x_unshuffled = self.pixel_unshuffle(x)
+
+        # Convolution operation
+        x_conv = self.conv(x_unshuffled)
+
+        # Feature extraction with residual blocks
+        out = self.residual_blocks(x_conv)
+
+        # Reshape to restore original bf dimension
+        out = out.view(bs, f, out.size(1), out.size(2), out.size(3))
+
+        # Swap the frame and channel axes: (bs, f, c, h, w) -> (bs, c, f, h, w)
+        out = out.permute(0, 2, 1, 3, 4)
+
+        return out
+
+
+class WanCamResidualBlock(nn.Module):
+    def __init__(self, dim, operation_settings={}):
+        super(WanCamResidualBlock, self).__init__()
+        self.conv1 = operation_settings.get("operations").Conv2d(dim, dim, kernel_size=3, padding=1, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
+        self.relu = nn.ReLU(inplace=True)
+        self.conv2 = operation_settings.get("operations").Conv2d(dim, dim, kernel_size=3, padding=1, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
+
+    def forward(self, x):
+        residual = x
+        out = self.relu(self.conv1(x))
+        out = self.conv2(out)
+        out += residual
+        return out
+
+
 class Head(nn.Module):

    def __init__(self, dim, out_dim, patch_size, eps=1e-6, operation_settings={}):
@ -637,3 +691,92 @@ class VaceWanModel(WanModel):
        # unpatchify
        x = self.unpatchify(x, grid_sizes)
        return x
+
+class CameraWanModel(WanModel):
+    r"""
+    Wan diffusion backbone with a camera control adapter; the adapter output is added to the patch embedding.
+    """
+
+    def __init__(self,
+                 model_type='camera',
+                 patch_size=(1, 2, 2),
+                 text_len=512,
+                 in_dim=16,
+                 dim=2048,
+                 ffn_dim=8192,
+                 freq_dim=256,
+                 text_dim=4096,
+                 out_dim=16,
+                 num_heads=16,
+                 num_layers=32,
+                 window_size=(-1, -1),
+                 qk_norm=True,
+                 cross_attn_norm=True,
+                 eps=1e-6,
+                 flf_pos_embed_token_number=None,
+                 image_model=None,
+                 in_dim_control_adapter=24,
+                 device=None,
+                 dtype=None,
+                 operations=None,
+                 ):
+
+        super().__init__(model_type='i2v', patch_size=patch_size, text_len=text_len, in_dim=in_dim, dim=dim, ffn_dim=ffn_dim, freq_dim=freq_dim, text_dim=text_dim, out_dim=out_dim, num_heads=num_heads, num_layers=num_layers, window_size=window_size, qk_norm=qk_norm, cross_attn_norm=cross_attn_norm, eps=eps, flf_pos_embed_token_number=flf_pos_embed_token_number, image_model=image_model, device=device, dtype=dtype, operations=operations)
+        operation_settings = {"operations": operations, "device": device, "dtype": dtype}
+
+        self.control_adapter = WanCamAdapter(in_dim_control_adapter, dim, kernel_size=patch_size[1:], stride=patch_size[1:], operation_settings=operation_settings)
+
+
+    def forward_orig(
+        self,
+        x,
+        t,
+        context,
+        clip_fea=None,
+        freqs=None,
+        camera_conditions = None,
+        transformer_options={},
+        **kwargs,
+    ):
+        # embeddings
+        x = self.patch_embedding(x.float()).to(x.dtype)
+        if self.control_adapter is not None and camera_conditions is not None:
+            x_camera = self.control_adapter(camera_conditions).to(x.dtype)
+            x = x + x_camera
+        grid_sizes = x.shape[2:]
+        x = x.flatten(2).transpose(1, 2)
+
+        # time embeddings
+        e = self.time_embedding(
+            sinusoidal_embedding_1d(self.freq_dim, t).to(dtype=x[0].dtype))
+        e0 = self.time_projection(e).unflatten(1, (6, self.dim))
+
+        # context
+        context = self.text_embedding(context)
+
+        context_img_len = None
+        if clip_fea is not None:
+            if self.img_emb is not None:
+                context_clip = self.img_emb(clip_fea)  # bs x 257 x dim
+                context = torch.concat([context_clip, context], dim=1)
+            context_img_len = clip_fea.shape[-2]
+
+        patches_replace = transformer_options.get("patches_replace", {})
+        blocks_replace = patches_replace.get("dit", {})
+        for i, block in enumerate(self.blocks):
+            if ("double_block", i) in blocks_replace:
+                def block_wrap(args):
+                    out = {}
+                    out["img"] = block(args["img"], context=args["txt"], e=args["vec"], freqs=args["pe"], context_img_len=context_img_len)
+                    return out
+                out = blocks_replace[("double_block", i)]({"img": x, "txt": context, "vec": e0, "pe": freqs}, {"original_block": block_wrap})
+                x = out["img"]
+            else:
+                x = block(x, e=e0, freqs=freqs, context=context, context_img_len=context_img_len)
+
+        # head
+        x = self.head(x, e)
+
+        # unpatchify
+        x = self.unpatchify(x, grid_sizes)
+        return x
--- a/comfy/model_base.py
+++ b/comfy/model_base.py
@ -1079,6 +1079,17 @@ class WAN21_Vace(WAN21):
        out['vace_strength'] = comfy.conds.CONDConstant(vace_strength)
        return out

+class WAN21_Camera(WAN21):
+    def __init__(self, model_config, model_type=ModelType.FLOW, image_to_video=False, device=None):
+        super(WAN21, self).__init__(model_config, model_type, device=device, unet_model=comfy.ldm.wan.model.CameraWanModel)
+        self.image_to_video = image_to_video
+
+    def extra_conds(self, **kwargs):
+        out = super().extra_conds(**kwargs)
+        camera_conditions = kwargs.get("camera_conditions", None)
+        if camera_conditions is not None:
+            out['camera_conditions'] = comfy.conds.CONDRegular(camera_conditions)
+        return out

 class Hunyuan3Dv2(BaseModel):
    def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
--- a/comfy/model_detection.py
+++ b/comfy/model_detection.py
@ -361,6 +361,8 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
            dit_config["model_type"] = "vace"
            dit_config["vace_in_dim"] = state_dict['{}vace_patch_embedding.weight'.format(key_prefix)].shape[1]
            dit_config["vace_layers"] = count_blocks(state_dict_keys, '{}vace_blocks.'.format(key_prefix) + '{}.')
+        elif '{}control_adapter.conv.weight'.format(key_prefix) in state_dict_keys:
+            dit_config["model_type"] = "camera"
        else:
            if '{}img_emb.proj.0.bias'.format(key_prefix) in state_dict_keys:
                dit_config["model_type"] = "i2v"
--- a/comfy/sd.py
+++ b/comfy/sd.py
@ -479,6 +479,15 @@ class VAE:
        self.first_stage_model.to(self.vae_dtype)
        self.output_device = model_management.intermediate_device()

+        self.png_chunks = {}
+
+        if metadata is not None:
+            meta_color_space = metadata.get("modelspec.color_space")
+            if str(meta_color_space).lower().startswith("cicp:"):
+                cicp_chunk = meta_color_space.split("cicp:")[-1].split(",")
+                cicp_chunk = bytes([1 if b.lower() == 'true' else 0 if b.lower() == 'false' else int(b) for b in cicp_chunk])
+                self.png_chunks[b"cICP"] = cicp_chunk
+
        self.patcher = comfy.model_patcher.ModelPatcher(self.first_stage_model, load_device=self.device, offload_device=offload_device)
        logging.info("VAE load device: {}, offload device: {}, dtype: {}".format(self.device, offload_device, self.vae_dtype))

--- a/comfy/supported_models.py
+++ b/comfy/supported_models.py
@ -992,6 +992,16 @@ class WAN21_FunControl2V(WAN21_T2V):
        out = model_base.WAN21(self, image_to_video=False, device=device)
        return out

+class WAN21_Camera(WAN21_T2V):
+    unet_config = {
+        "image_model": "wan2.1",
+        "model_type": "camera",
+        "in_dim": 32,
+    }
+
+    def get_model(self, state_dict, prefix="", device=None):
+        out = model_base.WAN21_Camera(self, image_to_video=False, device=device)
+        return out
 class WAN21_Vace(WAN21_T2V):
    unet_config = {
        "image_model": "wan2.1",
@ -1129,6 +1139,6 @@ class ACEStep(supported_models_base.BASE):
    def clip_target(self, state_dict={}):
        return supported_models_base.ClipTarget(comfy.text_encoders.ace.AceT5Tokenizer, comfy.text_encoders.ace.AceT5Model)

-models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, Lumina2, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, Hunyuan3Dv2mini, Hunyuan3Dv2, HiDream, Chroma, ACEStep]
+models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, Lumina2, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, Hunyuan3Dv2mini, Hunyuan3Dv2, HiDream, Chroma, ACEStep]

 models += [SVD_img2vid]
--- a/comfy/utils.py
+++ b/comfy/utils.py
@ -78,8 +78,6 @@ def load_torch_file(ckpt, safe_load=False, device=None, return_metadata=False):
            pl_sd = torch.load(ckpt, map_location=device, weights_only=True, **torch_args)
        else:
            pl_sd = torch.load(ckpt, map_location=device, pickle_module=comfy.checkpoint_pickle)
-        if "global_step" in pl_sd:
-            logging.debug(f"Global Step: {pl_sd['global_step']}")
        if "state_dict" in pl_sd:
            sd = pl_sd["state_dict"]
        else:
--- a/comfy_api/input/video_types.py
+++ b/comfy_api/input/video_types.py
@ -43,3 +43,13 @@ class VideoInput(ABC):
        components = self.get_components()
        return components.images.shape[2], components.images.shape[1]

+    def get_duration(self) -> float:
+        """
+        Returns the duration of the video in seconds.
+
+        Returns:
+            Duration in seconds
+        """
+        components = self.get_components()
+        frame_count = components.images.shape[0]
+        return float(frame_count / components.frame_rate)
--- a/comfy_api/input_impl/video_types.py
+++ b/comfy_api/input_impl/video_types.py
@ -80,6 +80,38 @@ class VideoFromFile(VideoInput):
                    return stream.width, stream.height
        raise ValueError(f"No video stream found in file '{self.__file}'")

+    def get_duration(self) -> float:
+        """
+        Returns the duration of the video in seconds.
+
+        Returns:
+            Duration in seconds
+        """
+        if isinstance(self.__file, io.BytesIO):
+            self.__file.seek(0)
+        with av.open(self.__file, mode="r") as container:
+            if container.duration is not None:
+                return float(container.duration / av.time_base)
+
+            # Fallback: calculate from frame count and frame rate
+            video_stream = next(
+                (s for s in container.streams if s.type == "video"), None
+            )
+            if video_stream and video_stream.frames and video_stream.average_rate:
+                return float(video_stream.frames / video_stream.average_rate)
+
+            # Last resort: decode frames to count them
+            if video_stream and video_stream.average_rate:
+                frame_count = 0
+                container.seek(0)
+                for packet in container.demux(video_stream):
+                    for _ in packet.decode():
+                        frame_count += 1
+                if frame_count > 0:
+                    return float(frame_count / video_stream.average_rate)
+
+        raise ValueError(f"Could not determine duration for file '{self.__file}'")
+
    def get_components_internal(self, container: InputContainer) -> VideoComponents:
        # Get video frames
        frames = []
--- a/comfy_extras/nodes_camera_trajectory.py
+++ b/comfy_extras/nodes_camera_trajectory.py
@ -0,0 +1,218 @@
+import nodes
+import torch
+import numpy as np
+from einops import rearrange
+import comfy.model_management
+
+
+
+MAX_RESOLUTION = nodes.MAX_RESOLUTION
+
+CAMERA_DICT = {
+    "base_T_norm": 1.5,
+    "base_angle": np.pi/3,
+    "Static": {     "angle":[0., 0., 0.],   "T":[0., 0., 0.]},
+    "Pan Up": {     "angle":[0., 0., 0.],   "T":[0., -1., 0.]},
+    "Pan Down": {   "angle":[0., 0., 0.],   "T":[0.,1.,0.]},
+    "Pan Left": {   "angle":[0., 0., 0.],   "T":[-1.,0.,0.]},
+    "Pan Right": {  "angle":[0., 0., 0.],   "T": [1.,0.,0.]},
+    "Zoom In": {    "angle":[0., 0., 0.],   "T": [0.,0.,2.]},
+    "Zoom Out": {   "angle":[0., 0., 0.],   "T": [0.,0.,-2.]},
+    "Anti Clockwise (ACW)": {        "angle": [0., 0., -1.],  "T":[0., 0., 0.]},
+    "ClockWise (CW)": {         "angle": [0., 0., 1.], "T":[0., 0., 0.]},
+}
+
+
+def process_pose_params(cam_params, width=672, height=384, original_pose_width=1280, original_pose_height=720, device='cpu'):
+
+    def get_relative_pose(cam_params):
+        """Copied from https://github.com/hehao13/CameraCtrl/blob/main/inference.py
+        """
+        abs_w2cs = [cam_param.w2c_mat for cam_param in cam_params]
+        abs_c2ws = [cam_param.c2w_mat for cam_param in cam_params]
+        cam_to_origin = 0
+        target_cam_c2w = np.array([
+            [1, 0, 0, 0],
+            [0, 1, 0, -cam_to_origin],
+            [0, 0, 1, 0],
+            [0, 0, 0, 1]
+        ])
+        abs2rel = target_cam_c2w @ abs_w2cs[0]
+        ret_poses = [target_cam_c2w, ] + [abs2rel @ abs_c2w for abs_c2w in abs_c2ws[1:]]
+        ret_poses = np.array(ret_poses, dtype=np.float32)
+        return ret_poses
+
+    """Modified from https://github.com/hehao13/CameraCtrl/blob/main/inference.py
+    """
+    cam_params = [Camera(cam_param) for cam_param in cam_params]
+
+    sample_wh_ratio = width / height
+    pose_wh_ratio = original_pose_width / original_pose_height  # aspect ratio of the original pose resolution (defaults: 1280x720)
+
+    if pose_wh_ratio > sample_wh_ratio:
+        resized_ori_w = height * pose_wh_ratio
+        for cam_param in cam_params:
+            cam_param.fx = resized_ori_w * cam_param.fx / width
+    else:
+        resized_ori_h = width / pose_wh_ratio
+        for cam_param in cam_params:
+            cam_param.fy = resized_ori_h * cam_param.fy / height
+
+    intrinsic = np.asarray([[cam_param.fx * width,
+                            cam_param.fy * height,
+                            cam_param.cx * width,
+                            cam_param.cy * height]
+                            for cam_param in cam_params], dtype=np.float32)
+
+    K = torch.as_tensor(intrinsic)[None]  # [1, n_frame, 4]
+    c2ws = get_relative_pose(cam_params)  # poses re-expressed relative to the first camera
+    c2ws = torch.as_tensor(c2ws)[None]  # [1, n_frame, 4, 4]
+    plucker_embedding = ray_condition(K, c2ws, height, width, device=device)[0].permute(0, 3, 1, 2).contiguous()  # V, 6, H, W
+    plucker_embedding = plucker_embedding[None]
+    plucker_embedding = rearrange(plucker_embedding, "b f c h w -> b f h w c")[0]
+    return plucker_embedding
+
+class Camera(object):
+    """Copied from https://github.com/hehao13/CameraCtrl/blob/main/inference.py
+    """
+    def __init__(self, entry):
+        fx, fy, cx, cy = entry[1:5]
+        self.fx = fx
+        self.fy = fy
+        self.cx = cx
+        self.cy = cy
+        c2w_mat = np.array(entry[7:]).reshape(4, 4)
+        self.c2w_mat = c2w_mat
+        self.w2c_mat = np.linalg.inv(c2w_mat)
+
+def ray_condition(K, c2w, H, W, device):
+    """Build per-pixel Plücker ray embeddings (B, V, H, W, 6); copied from https://github.com/hehao13/CameraCtrl/blob/main/inference.py
+    """
+    # c2w: B, V, 4, 4  camera-to-world matrices
+    # K: B, V, 4  intrinsics as (fx, fy, cx, cy) in pixels
+
+    B = K.shape[0]
+
+    j, i = torch.meshgrid(
+        torch.linspace(0, H - 1, H, device=device, dtype=c2w.dtype),
+        torch.linspace(0, W - 1, W, device=device, dtype=c2w.dtype),
+        indexing='ij'
+    )
+    i = i.reshape([1, 1, H * W]).expand([B, 1, H * W]) + 0.5  # [B, 1, HxW], pixel centres
+    j = j.reshape([1, 1, H * W]).expand([B, 1, H * W]) + 0.5  # [B, 1, HxW], pixel centres
+
+    fx, fy, cx, cy = K.chunk(4, dim=-1)  # B, V, 1
+
+    zs = torch.ones_like(i)  # [B, 1, HxW]
+    xs = (i - cx) / fx * zs  # broadcasts to B, V, HW
+    ys = (j - cy) / fy * zs  # broadcasts to B, V, HW
+    zs = zs.expand_as(ys)
+
+    directions = torch.stack((xs, ys, zs), dim=-1)  # B, V, HW, 3
+    directions = directions / directions.norm(dim=-1, keepdim=True)  # B, V, HW, 3
+
+    rays_d = directions @ c2w[..., :3, :3].transpose(-1, -2)  # B, V, HW, 3
+    rays_o = c2w[..., :3, 3]  # B, V, 3
+    rays_o = rays_o[:, :, None].expand_as(rays_d)  # B, V, HW, 3
+    # Moment o x d; dim is explicit because the implicit default picks the first size-3 axis (wrong if B, V or HW == 3)
+    rays_dxo = torch.cross(rays_o, rays_d, dim=-1)
+    plucker = torch.cat([rays_dxo, rays_d], dim=-1)
+    plucker = plucker.reshape(B, c2w.shape[1], H, W, 6)  # B, V, H, W, 6
+    # plucker = plucker.permute(0, 1, 4, 2, 3)
+    return plucker
+
+def get_camera_motion(angle, T, speed, n=81):
+    def compute_R_form_rad_angle(angles):
+        theta_x, theta_y, theta_z = angles
+        Rx = np.array([[1, 0, 0],
+                    [0, np.cos(theta_x), -np.sin(theta_x)],
+                    [0, np.sin(theta_x), np.cos(theta_x)]])
+
+        Ry = np.array([[np.cos(theta_y), 0, np.sin(theta_y)],
+                    [0, 1, 0],
+                    [-np.sin(theta_y), 0, np.cos(theta_y)]])
+
+        Rz = np.array([[np.cos(theta_z), -np.sin(theta_z), 0],
+                    [np.sin(theta_z), np.cos(theta_z), 0],
+                    [0, 0, 1]])
+
+        R = np.dot(Rz, np.dot(Ry, Rx))
+        return R
+    RT = []
+    for i in range(n):
+        _angle = (i/n)*speed*(CAMERA_DICT["base_angle"])*angle
+        R = compute_R_form_rad_angle(_angle)
+        _T=(i/n)*speed*(CAMERA_DICT["base_T_norm"])*(T.reshape(3,1))
+        _RT = np.concatenate([R,_T], axis=1)
+        RT.append(_RT)
+    RT = np.stack(RT)
+    return RT
+
+class WanCameraEmbedding:
+    @classmethod
+    def INPUT_TYPES(cls):
+        return {
+            "required": {
+                "camera_pose":(["Static","Pan Up","Pan Down","Pan Left","Pan Right","Zoom In","Zoom Out","Anti Clockwise (ACW)", "ClockWise (CW)"],{"default":"Static"}),
+                "width": ("INT", {"default": 832, "min": 16, "max": MAX_RESOLUTION, "step": 16}),
+                "height": ("INT", {"default": 480, "min": 16, "max": MAX_RESOLUTION, "step": 16}),
+                "length": ("INT", {"default": 81, "min": 1, "max": MAX_RESOLUTION, "step": 4}),
+            },
+            "optional":{
+                "speed":("FLOAT",{"default":1.0, "min": 0, "max": 10.0, "step": 0.1}),
+                "fx":("FLOAT",{"default":0.5, "min": 0, "max": 1, "step": 0.000000001}),
+                "fy":("FLOAT",{"default":0.5, "min": 0, "max": 1, "step": 0.000000001}),
+                "cx":("FLOAT",{"default":0.5, "min": 0, "max": 1, "step": 0.01}),
+                "cy":("FLOAT",{"default":0.5, "min": 0, "max": 1, "step": 0.01}),
+            }
+
+        }
+
+    RETURN_TYPES = ("WAN_CAMERA_EMBEDDING","INT","INT","INT")
+    RETURN_NAMES = ("camera_embedding","width","height","length")
+    FUNCTION = "run"
+    CATEGORY = "camera"
+
+    def run(self, camera_pose, width, height, length, speed=1.0,  fx=0.5, fy=0.5, cx=0.5, cy=0.5):
+        """
+        Use the camera trajectory as extrinsic parameters to calculate Plücker embeddings (Sitzmann et al., 2021)
+        Adapted from https://github.com/aigc-apps/VideoX-Fun/blob/main/comfyui/comfyui_nodes.py
+        """
+        motion_list = [camera_pose]
+        speed = speed
+        angle = np.array(CAMERA_DICT[motion_list[0]]["angle"])
+        T = np.array(CAMERA_DICT[motion_list[0]]["T"])
+        RT = get_camera_motion(angle, T, speed, length)
+
+        trajs=[]
+        for cp in RT.tolist():
+            traj=[fx,fy,cx,cy,0,0]
+            traj.extend(cp[0])
+            traj.extend(cp[1])
+            traj.extend(cp[2])
+            traj.extend([0,0,0,1])
+            trajs.append(traj)
+
+        cam_params = np.array([[float(x) for x in pose] for pose in trajs])
+        cam_params = np.concatenate([np.zeros_like(cam_params[:, :1]), cam_params], 1)
+        control_camera_video = process_pose_params(cam_params, width=width, height=height)
+        control_camera_video = control_camera_video.permute([3, 0, 1, 2]).unsqueeze(0).to(device=comfy.model_management.intermediate_device())
+
+        control_camera_video = torch.concat(
+            [
+                torch.repeat_interleave(control_camera_video[:, :, 0:1], repeats=4, dim=2),
+                control_camera_video[:, :, 1:]
+            ], dim=2
+        ).transpose(1, 2)
+
+        # Reshape, transpose, and view into desired shape
+        b, f, c, h, w = control_camera_video.shape
+        control_camera_video = control_camera_video.contiguous().view(b, f // 4, 4, c, h, w).transpose(2, 3)
+        control_camera_video = control_camera_video.contiguous().view(b, f // 4, c * 4, h, w).transpose(1, 2)
+
+        return (control_camera_video, width, height, length)
+
+
+NODE_CLASS_MAPPINGS = {
+    "WanCameraEmbedding": WanCameraEmbedding,
+}
--- a/comfy_extras/nodes_wan.py
+++ b/comfy_extras/nodes_wan.py
@ -297,6 +297,52 @@ class TrimVideoLatent:
        samples_out["samples"] = s1[:, :, trim_amount:]
        return (samples_out,)

+class WanCameraImageToVideo:
+    @classmethod
+    def INPUT_TYPES(s):
+        return {"required": {"positive": ("CONDITIONING", ),
+                             "negative": ("CONDITIONING", ),
+                             "vae": ("VAE", ),
+                             "width": ("INT", {"default": 832, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
+                             "height": ("INT", {"default": 480, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
+                             "length": ("INT", {"default": 81, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 4}),
+                             "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}),
+                },
+                "optional": {"clip_vision_output": ("CLIP_VISION_OUTPUT", ),
+                             "start_image": ("IMAGE", ),
+                             "camera_conditions": ("WAN_CAMERA_EMBEDDING", ),
+                }}
+
+    RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "LATENT")
+    RETURN_NAMES = ("positive", "negative", "latent")
+    FUNCTION = "encode"
+
+    CATEGORY = "conditioning/video_models"
+
+    def encode(self, positive, negative, vae, width, height, length, batch_size, start_image=None, clip_vision_output=None, camera_conditions=None):
+        latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device())
+        concat_latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device())
+        concat_latent = comfy.latent_formats.Wan21().process_out(concat_latent)
+
+        if start_image is not None:
+            start_image = comfy.utils.common_upscale(start_image[:length].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)
+            concat_latent_image = vae.encode(start_image[:, :, :, :3])
+            concat_latent[:,:,:concat_latent_image.shape[2]] = concat_latent_image[:,:,:concat_latent.shape[2]]
+
+            positive = node_helpers.conditioning_set_values(positive, {"concat_latent_image": concat_latent})
+            negative = node_helpers.conditioning_set_values(negative, {"concat_latent_image": concat_latent})
+
+        if camera_conditions is not None:
+            positive = node_helpers.conditioning_set_values(positive, {'camera_conditions': camera_conditions})
+            negative = node_helpers.conditioning_set_values(negative, {'camera_conditions': camera_conditions})
+
+        if clip_vision_output is not None:
+            positive = node_helpers.conditioning_set_values(positive, {"clip_vision_output": clip_vision_output})
+            negative = node_helpers.conditioning_set_values(negative, {"clip_vision_output": clip_vision_output})
+
+        out_latent = {}
+        out_latent["samples"] = latent
+        return (positive, negative, out_latent)

 NODE_CLASS_MAPPINGS = {
    "WanImageToVideo": WanImageToVideo,
@ -305,4 +351,5 @@ NODE_CLASS_MAPPINGS = {
    "WanFirstLastFrameToVideo": WanFirstLastFrameToVideo,
    "WanVaceToVideo": WanVaceToVideo,
    "TrimVideoLatent": TrimVideoLatent,
+    "WanCameraImageToVideo": WanCameraImageToVideo,
 }
--- a/fix_torch.py
+++ b/fix_torch.py
@ -1,28 +0,0 @@
-import importlib.util
-import shutil
-import os
-import ctypes
-import logging
-
-
-def fix_pytorch_libomp():
-    """
-    Fix PyTorch libomp DLL issue on Windows by copying the correct DLL file if needed.
-    """
-    torch_spec = importlib.util.find_spec("torch")
-    for folder in torch_spec.submodule_search_locations:
-        lib_folder = os.path.join(folder, "lib")
-        test_file = os.path.join(lib_folder, "fbgemm.dll")
-        dest = os.path.join(lib_folder, "libomp140.x86_64.dll")
-        if os.path.exists(dest):
-            break
-
-        with open(test_file, "rb") as f:
-            contents = f.read()
-            if b"libomp140.x86_64.dll" not in contents:
-                break
-        try:
-            ctypes.cdll.LoadLibrary(test_file)
-        except FileNotFoundError:
-            logging.warning("Detected pytorch version with libomp issue, patching.")
-            shutil.copyfile(os.path.join(lib_folder, "libiomp5md.dll"), dest)
--- a/main.py
+++ b/main.py
@ -125,13 +125,6 @@ if __name__ == "__main__":

    import cuda_malloc

-if args.windows_standalone_build:
-    try:
-        from fix_torch import fix_pytorch_libomp
-        fix_pytorch_libomp()
-    except:
-        pass
-
 import comfy.utils

 import execution
--- a/nodes.py
+++ b/nodes.py
@ -286,10 +286,12 @@ class VAEDecode:
    CATEGORY = "latent"
    DESCRIPTION = "Decodes latent images back into pixel space images."

-    def decode(self, vae, samples):
+    def decode(self, vae: comfy.sd.VAE, samples):
        images = vae.decode(samples["samples"])
        if len(images.shape) == 5: #Combine batches
            images = images.reshape(-1, images.shape[-3], images.shape[-2], images.shape[-1])
+        if vae.png_chunks is not None:
+            images.png_chunks = vae.png_chunks
        return (images, )

 class VAEDecodeTiled:
@ -772,7 +774,8 @@ class VAELoader:
        else:
            vae_path = folder_paths.get_full_path_or_raise("vae", vae_name)
            sd = comfy.utils.load_torch_file(vae_path)
-        vae = comfy.sd.VAE(sd=sd)
+            metadata = json.loads(comfy.utils.safetensors_header(vae_path, max_size=1024*1024) or "{}").get("__metadata__")
+        vae = comfy.sd.VAE(sd=sd, metadata=metadata)
        vae.throw_exception_if_invalid()
        return (vae,)

@ -1600,7 +1603,9 @@ class SaveImage:
                if extra_pnginfo is not None:
                    for x in extra_pnginfo:
                        metadata.add_text(x, json.dumps(extra_pnginfo[x]))
-
+                if hasattr(images, "png_chunks"):
+                    for name, data in images.png_chunks.items():
+                        metadata.add(name, data)
            filename_with_batch_num = filename.replace("%batch_num%", str(batch_number))
            file = f"{filename_with_batch_num}_{counter:05}_.png"
            img.save(os.path.join(full_output_folder, file), pnginfo=metadata, compress_level=self.compress_level)
@ -1940,7 +1945,7 @@ class ImagePadForOutpaint:

        mask[top:top + d2, left:left + d3] = t

-        return (new_image, mask)
+        return (new_image, mask.unsqueeze(0))


 NODE_CLASS_MAPPINGS = {
@ -2265,6 +2270,7 @@ def init_builtin_extra_nodes():
        "nodes_preview_any.py",
        "nodes_ace.py",
        "nodes_string.py",
+        "nodes_camera_trajectory.py",
    ]

    import_failed = []
--- a/tests-unit/comfy_api_test/video_types_test.py
+++ b/tests-unit/comfy_api_test/video_types_test.py
@ -0,0 +1,239 @@
+import pytest
+import torch
+import tempfile
+import os
+import av
+import io
+from fractions import Fraction
+from comfy_api.input_impl.video_types import VideoFromFile, VideoFromComponents
+from comfy_api.util.video_types import VideoComponents
+from comfy_api.input.basic_types import AudioInput
+from av.error import InvalidDataError
+
+EPSILON = 0.0001  # NOTE(review): not referenced anywhere else in this test module
+
+
+@pytest.fixture
+def sample_images():
+    """Random tensor standing in for a 3-frame, 2x2, 3-channel video"""
+    frame_count, side, channels = 3, 2, 3
+    return torch.rand(frame_count, side, side, channels)
+
+
+@pytest.fixture
+def sample_audio():
+    """AudioInput wrapping a random (1, 2, 1000) waveform with sample_rate 44100"""
+    waveform = torch.rand(1, 2, 1000)
+    return AudioInput({"waveform": waveform, "sample_rate": 44100})
+
+
+@pytest.fixture
+def video_components(sample_images, sample_audio):
+    """Bundle the image and audio fixtures into VideoComponents at 30 fps with sample metadata"""
+    fields = {
+        "images": sample_images,
+        "audio": sample_audio,
+        "frame_rate": Fraction(30),
+        "metadata": {"test": "metadata"},
+    }
+    return VideoComponents(**fields)
+
+
+def create_test_video(width=4, height=4, frames=3, fps=30):
+    """Write a small H.264 test clip to a temporary .mp4 file and return its path.
+
+    Every frame is a solid grey image whose level steps by 85 per frame index
+    (0, 85, 170, 255) and wraps modulo 256 so it always fits in uint8.
+
+    Args:
+        width: Frame width in pixels.
+        height: Frame height in pixels.
+        frames: Number of frames to encode.
+        fps: Frame rate handed to the encoder stream.
+
+    Returns:
+        Path of the created file. The caller is responsible for deleting it.
+    """
+    # Reserve a unique path but close the handle right away, so PyAV is the only
+    # writer and no descriptor is left open once this helper returns.
+    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp:
+        path = tmp.name
+
+    try:
+        with av.open(path, mode="w") as container:
+            stream = container.add_stream("h264", rate=fps)
+            stream.width = width
+            stream.height = height
+            stream.pix_fmt = "yuv420p"
+
+            for i in range(frames):
+                # Wrap the level so frame indices above 3 cannot overflow uint8.
+                level = (i * 85) % 256
+                pixels = torch.full((height, width, 3), level, dtype=torch.uint8)
+                frame = av.VideoFrame.from_ndarray(pixels.numpy(), format="rgb24")
+                frame = frame.reformat(format="yuv420p")
+                container.mux(stream.encode(frame))
+
+            # Flush
+            container.mux(stream.encode(None))
+    except Exception:
+        # delete=False means nothing else will remove a half-written file.
+        os.unlink(path)
+        raise
+
+    return path
+
+
+@pytest.fixture
+def simple_video_file():
+    """Yield the path of a default create_test_video() clip, then delete the file"""
+    clip_path = create_test_video()
+    yield clip_path
+    os.unlink(clip_path)
+
+
+def test_video_from_components_get_duration(video_components):
+    """The 3-frame fixture at 30 fps should report a duration of 3/30"""
+    expected = 3.0 / 30.0
+    actual = VideoFromComponents(video_components).get_duration()
+    assert actual == pytest.approx(expected)
+
+
+def test_video_from_components_get_duration_different_frame_rates(sample_images):
+    """Duration of the 3-frame fixture at 60 fps and at 24000/1001 fps"""
+    cases = [
+        # 60 fps
+        (Fraction(60), 3.0 / 60.0),
+        # fractional frame rate (23.976fps)
+        (Fraction(24000, 1001), 3.0 / (24000.0 / 1001.0)),
+    ]
+    for frame_rate, expected in cases:
+        components = VideoComponents(images=sample_images, frame_rate=frame_rate)
+        video = VideoFromComponents(components)
+        assert video.get_duration() == pytest.approx(expected)
+
+
+def test_video_from_components_get_duration_empty_video():
+    """A tensor with zero frames yields a duration of exactly 0.0"""
+    no_frames = torch.zeros(0, 2, 2, 3)
+    components = VideoComponents(images=no_frames, frame_rate=Fraction(30))
+    assert VideoFromComponents(components).get_duration() == 0.0
+
+
+def test_video_from_components_get_dimensions(video_components):
+    """get_dimensions() reports width 2 and height 2 for the 2x2 fixture frames"""
+    dimensions = VideoFromComponents(video_components).get_dimensions()
+    width, height = dimensions
+    assert width == 2 and height == 2
+
+
+def test_video_from_file_get_duration(simple_video_file):
+    """The file-backed fixture clip reports a duration of 0.1, give or take 0.01"""
+    loaded = VideoFromFile(simple_video_file)
+    assert loaded.get_duration() == pytest.approx(0.1, abs=0.01)
+
+
+def test_video_from_file_get_dimensions(simple_video_file):
+    """The file-backed fixture clip reports width 4 and height 4"""
+    dimensions = VideoFromFile(simple_video_file).get_dimensions()
+    width, height = dimensions
+    assert width == 4 and height == 4
+
+
+def test_video_from_file_bytesio_input():
+    """VideoFromFile accepts an in-memory BytesIO holding a one-frame 2x2 mp4"""
+    black = torch.zeros(2, 2, 3, dtype=torch.uint8).numpy()
+    buffer = io.BytesIO()
+    with av.open(buffer, mode="w", format="mp4") as container:
+        stream = container.add_stream("h264", rate=30)
+        stream.width, stream.height = 2, 2
+        stream.pix_fmt = "yuv420p"
+
+        rgb_frame = av.VideoFrame.from_ndarray(black, format="rgb24")
+        container.mux(stream.encode(rgb_frame.reformat(format="yuv420p")))
+        # Flush the encoder
+        container.mux(stream.encode(None))
+
+    buffer.seek(0)
+    video = VideoFromFile(buffer)
+
+    assert video.get_dimensions() == (2, 2)
+    assert video.get_duration() == pytest.approx(1 / 30, abs=0.01)
+
+
+def test_video_from_file_invalid_file_error():
+    """A file holding plain bytes instead of video makes VideoFromFile raise InvalidDataError"""
+    with tempfile.NamedTemporaryFile(suffix=".txt", delete=False) as bogus:
+        bogus_path = bogus.name
+        bogus.write(b"not a video file")
+        bogus.flush()
+
+    try:
+        with pytest.raises(InvalidDataError):
+            VideoFromFile(bogus_path).get_dimensions()
+    finally:
+        os.unlink(bogus_path)
+
+
+def test_video_from_file_audio_only_error():
+    """A file containing only an AAC audio stream makes VideoFromFile raise ValueError"""
+    with tempfile.NamedTemporaryFile(suffix=".m4a", delete=False) as tmp:
+        audio_path = tmp.name
+
+    try:
+        with av.open(audio_path, mode="w") as container:
+            stream = container.add_stream("aac", rate=44100)
+            stream.sample_rate = 44100
+            stream.format = "fltp"
+
+            silence = av.AudioFrame.from_ndarray(
+                torch.zeros(1, 1024).numpy(), format="fltp", layout="mono"
+            )
+            silence.sample_rate = 44100
+            silence.pts = 0
+            container.mux(stream.encode(silence))
+
+            # Drain whatever the encoder still holds
+            for leftover in stream.encode(None):
+                container.mux(leftover)
+
+        with pytest.raises(ValueError, match="No video stream found"):
+            VideoFromFile(audio_path).get_dimensions()
+    finally:
+        os.unlink(audio_path)
+
+
+def test_single_frame_video():
+    """One frame at a frame rate of 1 gives a duration of exactly 1.0"""
+    one_frame = torch.rand(1, 10, 10, 3)
+    components = VideoComponents(images=one_frame, frame_rate=Fraction(1))
+    assert VideoFromComponents(components).get_duration() == 1.0
+
+
+@pytest.mark.parametrize(
+    "frame_rate,expected_fps",
+    [
+        (Fraction(24000, 1001), 24000 / 1001),
+        (Fraction(30000, 1001), 30000 / 1001),
+        (Fraction(25, 1), 25.0),
+        (Fraction(50, 2), 25.0),
+    ],
+)
+def test_fractional_frame_rates(frame_rate, expected_fps):
+    """100 frames at each parametrized rate should last 100 / expected_fps"""
+    images = torch.rand(100, 4, 4, 3)
+    video = VideoFromComponents(VideoComponents(images=images, frame_rate=frame_rate))
+    expected = 100.0 / expected_fps
+    assert video.get_duration() == pytest.approx(expected)
+
+
+def test_duration_consistency(video_components):
+    """get_duration() matches frame count divided by frame rate read back via get_components()"""
+    video = VideoFromComponents(video_components)
+    reported = video.get_duration()
+
+    parts = video.get_components()
+    frame_count = parts.images.shape[0]
+    recomputed = float(frame_count / parts.frame_rate)
+
+    assert reported == pytest.approx(recomputed)
Author	SHA1	Message	Date
catboxanon	0609bc160b	Merge 1ffed0f41cd733237f409afaba0a9ca40c8b50b5 into aee2908d0395577a6e2e13d1307aaf271424108b	2025-05-17 23:11:03 -04:00
comfyanonymous	aee2908d03	Remove useless log. (#8166 )	2025-05-17 06:27:34 -04:00
comfyanonymous	dc46db7aa4	Make ImagePadForOutpaint return a 3 channel mask. (#8157 )	2025-05-16 15:15:55 -04:00
filtered	7046983d95	Remove Desktop versioning claim from README (#8155 )	2025-05-16 10:45:36 -07:00
comfyanonymous	1c2d45d2b5	Fix typo in last PR. (#8144 ) More robust model detection for future proofing.	2025-05-15 19:02:19 -04:00
George0726	c820ef950d	Add Wan-FUN Camera Control models and Add WanCameraImageToVideo node (#8013 ) * support wan camera models * fix by ruff check * change camera_condition type; make camera_condition optional * support camera trajectory nodes * fix camera direction --------- Co-authored-by: Qirui Sun <sunqr0667@126.com>	2025-05-15 19:00:43 -04:00
comfyanonymous	6a2e4bb9e0	Remove old hack used to fix windows pytorch 2.4 on the portable. (#8139 ) Not necessary anymore.	2025-05-15 08:21:47 -04:00
Christian Byrne	f1f9763b4c	Add `get_duration` method to Comfy VIDEO type (#8122 ) * get duration from VIDEO type * video get_duration unit test * fix Windows unit test: can't delete opened temp file	2025-05-15 00:11:41 -04:00
comfyanonymous	08368f8e00	Update comment on ROCm pytorch attention in README. (#8123 )	2025-05-14 17:54:50 -04:00
catboxanon	1ffed0f41c	Remove unused PngImagePlugin import	2025-04-14 12:53:47 -04:00
catboxanon	8060017fe0	Fix extra chunk metadata add	2025-04-14 12:52:52 -04:00
catboxanon	5d6bca8e78	Remove extra chunks variable	2025-04-14 12:51:50 -04:00
catboxanon	5ad716d8c8	Remove extra newlines	2025-04-14 12:51:15 -04:00
catboxanon	54c99a0d02	Merge branch 'master' into feat/png-cicp-chunk	2025-04-14 12:50:11 -04:00
catboxanon	3a18b3136c	Remove cICP chunk hack. Now supported in Pillow as of 11.2.1: https://github.com/python-pillow/Pillow/pull/8704	2025-04-14 12:47:12 -04:00
catboxanon	49aaad7c21	Only add PNG chunks if metadata is enabled. Prevents a hypothetical scenario of a VAE being crafted to inject undesired data into PNG chunks.	2025-01-31 11:40:28 -05:00
catboxanon	a4aba18d29	PNG cICP chunk support	2025-01-30 15:44:41 -05:00