Add support for VIDEO as a built-in type (#7844)

* Add basic support for videos as types This PR adds support for VIDEO as first-class types. In order to avoid unnecessary costs, VIDEO outputs must implement the `VideoInput` ABC, but their implementation details can vary. Included are two implementations of this type which can be returned by other nodes: * `VideoFromFile` - Created with either a path on disk (as a string) or a `io.BytesIO` containing the contents of a file in a supported format (like .mp4). This implementation won't actually load the video unless necessary. It will also avoid re-encoding when saving if possible. * `VideoFromComponents` - Created from an image tensor and an optional audio tensor. Currently, only h264 encoded videos in .mp4 containers are supported for saving, but the plan is to add additional encodings/containers in the near future (particularly .webm). * Add optimization to avoid parsing entire video * Improve type declarations to reduce warnings * Make sure bytesIO objects can be read many times * Fix a potential issue when saving long videos * Fix incorrect type annotation * Add a `LoadVideo` node to make testing easier * Refactor new types out of the base comfy folder I've created a new `comfy_api` top-level module. The intention is that anything within this folder would be covered by semver-style versioning that would allow custom nodes to rely on them not introducing breaking changes. * Fix linting issue
2025-06-02 01:22:11 +08:00 · 2025-04-29 02:58:00 -07:00 · 2025-04-29 02:58:00 -07:00 · 68f0d35296
commit 68f0d35296
parent 83d04717b6
10 changed files with 532 additions and 8 deletions
--- a/comfy/comfy_types/node_typing.py
+++ b/comfy/comfy_types/node_typing.py
@ -48,6 +48,7 @@ class IO(StrEnum):
    FACE_ANALYSIS = "FACE_ANALYSIS"
    BBOX = "BBOX"
    SEGS = "SEGS"
    VIDEO = "VIDEO"
    ANY = "*"
    """Always matches any type, but at a price.
@ -273,7 +274,7 @@ class ComfyNodeABC(ABC):
    Comfy Docs: https://docs.comfy.org/custom-nodes/backend/lists#list-processing
    """
-    OUTPUT_IS_LIST: tuple[bool]
+    OUTPUT_IS_LIST: tuple[bool, ...]
    """A tuple indicating which node outputs are lists, but will be connected to nodes that expect individual items.
    Connected nodes that do not implement `INPUT_IS_LIST` will be executed once for every item in the list.
@ -292,7 +293,7 @@ class ComfyNodeABC(ABC):
    Comfy Docs: https://docs.comfy.org/custom-nodes/backend/lists#list-processing
    """
-    RETURN_TYPES: tuple[IO]
+    RETURN_TYPES: tuple[IO, ...]
    """A tuple representing the outputs of this node.
    Usage::
@ -301,12 +302,12 @@ class ComfyNodeABC(ABC):
    Comfy Docs: https://docs.comfy.org/custom-nodes/backend/server_overview#return-types
    """
-    RETURN_NAMES: tuple[str]
+    RETURN_NAMES: tuple[str, ...]
    """The output slot names for each item in `RETURN_TYPES`, e.g. ``RETURN_NAMES = ("count", "filter_string")``
    Comfy Docs: https://docs.comfy.org/custom-nodes/backend/server_overview#return-names
    """
-    OUTPUT_TOOLTIPS: tuple[str]
+    OUTPUT_TOOLTIPS: tuple[str, ...]
    """A tuple of strings to use as tooltips for node outputs, one for each item in `RETURN_TYPES`."""
    FUNCTION: str
    """The name of the function to execute as a literal string, e.g. `FUNCTION = "execute"`
--- a/comfy_api/input/init.py
+++ b/comfy_api/input/init.py
@ -0,0 +1,8 @@
 from .basic_types import ImageInput, AudioInput
 from .video_types import VideoInput
 __all__ = [
    "ImageInput",
    "AudioInput",
    "VideoInput",
 ]
--- a/comfy_api/input/basic_types.py
+++ b/comfy_api/input/basic_types.py
@ -0,0 +1,20 @@
 import torch
 from typing import TypedDict
 ImageInput = torch.Tensor
 """
 An image in format [B, H, W, C] where B is the batch size, C is the number of channels,
 """
 class AudioInput(TypedDict):
    """
    TypedDict representing audio input.
    """
    waveform: torch.Tensor
    """
    Tensor in the format [B, C, T] where B is the batch size, C is the number of channels,
    """
    sample_rate: int
--- a/comfy_api/input/video_types.py
+++ b/comfy_api/input/video_types.py
@ -0,0 +1,45 @@
 from __future__ import annotations
 from abc import ABC, abstractmethod
 from typing import Optional
 from comfy_api.util import VideoContainer, VideoCodec, VideoComponents
 class VideoInput(ABC):
    """
    Abstract base class for video input types.
    """
    @abstractmethod
    def get_components(self) -> VideoComponents:
        """
        Abstract method to get the video components (images, audio, and frame rate).
        Returns:
            VideoComponents containing images, audio, and frame rate
        """
        pass
    @abstractmethod
    def save_to(
        self,
        path: str,
        format: VideoContainer = VideoContainer.AUTO,
        codec: VideoCodec = VideoCodec.AUTO,
        metadata: Optional[dict] = None
    ):
        """
        Abstract method to save the video input to a file.
        """
        pass
    # Provide a default implementation, but subclasses can provide optimized versions
    # if possible.
    def get_dimensions(self) -> tuple[int, int]:
        """
        Returns the dimensions of the video input.
        Returns:
            Tuple of (width, height)
        """
        components = self.get_components()
        return components.images.shape[2], components.images.shape[1]
--- a/comfy_api/input_impl/init.py
+++ b/comfy_api/input_impl/init.py
@ -0,0 +1,7 @@
 from .video_types import VideoFromFile, VideoFromComponents
 __all__ = [
    # Implementations
    "VideoFromFile",
    "VideoFromComponents",
 ]
--- a/comfy_api/input_impl/video_types.py
+++ b/comfy_api/input_impl/video_types.py
@ -0,0 +1,224 @@
 from __future__ import annotations
 from av.container import InputContainer
 from av.subtitles.stream import SubtitleStream
 from fractions import Fraction
 from typing import Optional
 from comfy_api.input import AudioInput
 import av
 import io
 import json
 import numpy as np
 import torch
 from comfy_api.input import VideoInput
 from comfy_api.util import VideoContainer, VideoCodec, VideoComponents
 class VideoFromFile(VideoInput):
    """
    Class representing video input from a file.
    """
    def __init__(self, file: str | io.BytesIO):
        """
        Initialize the VideoFromFile object based off of either a path on disk or a BytesIO object
        containing the file contents.
        """
        self.__file = file
    def get_dimensions(self) -> tuple[int, int]:
        """
        Returns the dimensions of the video input.
        Returns:
            Tuple of (width, height)
        """
        if isinstance(self.__file, io.BytesIO):
            self.__file.seek(0)  # Reset the BytesIO object to the beginning
        with av.open(self.__file, mode='r') as container:
            for stream in container.streams:
                if stream.type == 'video':
                    assert isinstance(stream, av.VideoStream)
                    return stream.width, stream.height
        raise ValueError(f"No video stream found in file '{self.__file}'")
    def get_components_internal(self, container: InputContainer) -> VideoComponents:
        # Get video frames
        frames = []
        for frame in container.decode(video=0):
            img = frame.to_ndarray(format='rgb24')  # shape: (H, W, 3)
            img = torch.from_numpy(img) / 255.0  # shape: (H, W, 3)
            frames.append(img)
        images = torch.stack(frames) if len(frames) > 0 else torch.zeros(0, 3, 0, 0)
        # Get frame rate
        video_stream = next(s for s in container.streams if s.type == 'video')
        frame_rate = Fraction(video_stream.average_rate) if video_stream and video_stream.average_rate else Fraction(1)
        # Get audio if available
        audio = None
        try:
            container.seek(0)  # Reset the container to the beginning
            for stream in container.streams:
                if stream.type != 'audio':
                    continue
                assert isinstance(stream, av.AudioStream)
                audio_frames = []
                for packet in container.demux(stream):
                    for frame in packet.decode():
                        assert isinstance(frame, av.AudioFrame)
                        audio_frames.append(frame.to_ndarray())  # shape: (channels, samples)
                if len(audio_frames) > 0:
                    audio_data = np.concatenate(audio_frames, axis=1)  # shape: (channels, total_samples)
                    audio_tensor = torch.from_numpy(audio_data).unsqueeze(0)  # shape: (1, channels, total_samples)
                    audio = AudioInput({
                        "waveform": audio_tensor,
                        "sample_rate": int(stream.sample_rate) if stream.sample_rate else 1,
                    })
        except StopIteration:
            pass  # No audio stream
        metadata = container.metadata
        return VideoComponents(images=images, audio=audio, frame_rate=frame_rate, metadata=metadata)
    def get_components(self) -> VideoComponents:
        if isinstance(self.__file, io.BytesIO):
            self.__file.seek(0)  # Reset the BytesIO object to the beginning
        with av.open(self.__file, mode='r') as container:
            return self.get_components_internal(container)
        raise ValueError(f"No video stream found in file '{self.__file}'")
    def save_to(
        self,
        path: str,
        format: VideoContainer = VideoContainer.AUTO,
        codec: VideoCodec = VideoCodec.AUTO,
        metadata: Optional[dict] = None
    ):
        if isinstance(self.__file, io.BytesIO):
            self.__file.seek(0)  # Reset the BytesIO object to the beginning
        with av.open(self.__file, mode='r') as container:
            container_format = container.format.name
            video_encoding = container.streams.video[0].codec.name if len(container.streams.video) > 0 else None
            reuse_streams = True
            if format != VideoContainer.AUTO and format not in container_format.split(","):
                reuse_streams = False
            if codec != VideoCodec.AUTO and codec != video_encoding and video_encoding is not None:
                reuse_streams = False
            if not reuse_streams:
                components = self.get_components_internal(container)
                video = VideoFromComponents(components)
                return video.save_to(
                    path,
                    format=format,
                    codec=codec,
                    metadata=metadata
                )
            streams = container.streams
            with av.open(path, mode='w', options={"movflags": "use_metadata_tags"}) as output_container:
                # Copy over the original metadata
                for key, value in container.metadata.items():
                    if metadata is None or key not in metadata:
                        output_container.metadata[key] = value
                # Add our new metadata
                if metadata is not None:
                    for key, value in metadata.items():
                        if isinstance(value, str):
                            output_container.metadata[key] = value
                        else:
                            output_container.metadata[key] = json.dumps(value)
                # Add streams to the new container
                stream_map = {}
                for stream in streams:
                    if isinstance(stream, (av.VideoStream, av.AudioStream, SubtitleStream)):
                        out_stream = output_container.add_stream_from_template(template=stream, opaque=True)
                        stream_map[stream] = out_stream
                # Write packets to the new container
                for packet in container.demux():
                    if packet.stream in stream_map and packet.dts is not None:
                        packet.stream = stream_map[packet.stream]
                        output_container.mux(packet)
 class VideoFromComponents(VideoInput):
    """
    Class representing video input from tensors.
    """
    def __init__(self, components: VideoComponents):
        self.__components = components
    def get_components(self) -> VideoComponents:
        return VideoComponents(
            images=self.__components.images,
            audio=self.__components.audio,
            frame_rate=self.__components.frame_rate
        )
    def save_to(
        self,
        path: str,
        format: VideoContainer = VideoContainer.AUTO,
        codec: VideoCodec = VideoCodec.AUTO,
        metadata: Optional[dict] = None
    ):
        if format != VideoContainer.AUTO and format != VideoContainer.MP4:
            raise ValueError("Only MP4 format is supported for now")
        if codec != VideoCodec.AUTO and codec != VideoCodec.H264:
            raise ValueError("Only H264 codec is supported for now")
        with av.open(path, mode='w', options={'movflags': 'use_metadata_tags'}) as output:
            # Add metadata before writing any streams
            if metadata is not None:
                for key, value in metadata.items():
                    output.metadata[key] = json.dumps(value)
            frame_rate = Fraction(round(self.__components.frame_rate * 1000), 1000)
            # Create a video stream
            video_stream = output.add_stream('h264', rate=frame_rate)
            video_stream.width = self.__components.images.shape[2]
            video_stream.height = self.__components.images.shape[1]
            video_stream.pix_fmt = 'yuv420p'
            # Create an audio stream
            audio_sample_rate = 1
            audio_stream: Optional[av.AudioStream] = None
            if self.__components.audio:
                audio_sample_rate = int(self.__components.audio['sample_rate'])
                audio_stream = output.add_stream('aac', rate=audio_sample_rate)
                audio_stream.sample_rate = audio_sample_rate
                audio_stream.format = 'fltp'
            # Encode video
            for i, frame in enumerate(self.__components.images):
                img = (frame * 255).clamp(0, 255).byte().cpu().numpy() # shape: (H, W, 3)
                frame = av.VideoFrame.from_ndarray(img, format='rgb24')
                frame = frame.reformat(format='yuv420p')  # Convert to YUV420P as required by h264
                packet = video_stream.encode(frame)
                output.mux(packet)
            # Flush video
            packet = video_stream.encode(None)
            output.mux(packet)
            if audio_stream and self.__components.audio:
                # Encode audio
                samples_per_frame = int(audio_sample_rate / frame_rate)
                num_frames = self.__components.audio['waveform'].shape[2] // samples_per_frame
                for i in range(num_frames):
                    start = i * samples_per_frame
                    end = start + samples_per_frame
                    # TODO(Feature) - Add support for stereo audio
                    chunk = self.__components.audio['waveform'][0, 0, start:end].unsqueeze(0).numpy()
                    audio_frame = av.AudioFrame.from_ndarray(chunk, format='fltp', layout='mono')
                    audio_frame.sample_rate = audio_sample_rate
                    audio_frame.pts = i * samples_per_frame
                    for packet in audio_stream.encode(audio_frame):
                        output.mux(packet)
                # Flush audio
                for packet in audio_stream.encode(None):
                    output.mux(packet)
--- a/comfy_api/util/init.py
+++ b/comfy_api/util/init.py
@ -0,0 +1,8 @@
 from .video_types import VideoContainer, VideoCodec, VideoComponents
 __all__ = [
    # Utility Types
    "VideoContainer",
    "VideoCodec",
    "VideoComponents",
 ]
--- a/comfy_api/util/video_types.py
+++ b/comfy_api/util/video_types.py
@ -0,0 +1,51 @@
 from __future__ import annotations
 from dataclasses import dataclass
 from enum import Enum
 from fractions import Fraction
 from typing import Optional
 from comfy_api.input import ImageInput, AudioInput
 class VideoCodec(str, Enum):
    AUTO = "auto"
    H264 = "h264"
    @classmethod
    def as_input(cls) -> list[str]:
        """
        Returns a list of codec names that can be used as node input.
        """
        return [member.value for member in cls]
 class VideoContainer(str, Enum):
    AUTO = "auto"
    MP4 = "mp4"
    @classmethod
    def as_input(cls) -> list[str]:
        """
        Returns a list of container names that can be used as node input.
        """
        return [member.value for member in cls]
    @classmethod
    def get_extension(cls, value) -> str:
        """
        Returns the file extension for the container.
        """
        if isinstance(value, str):
            value = cls(value)
        if value == VideoContainer.MP4 or value == VideoContainer.AUTO:
            return "mp4"
        return ""
@dataclass
 class VideoComponents:
    """
    Dataclass representing the components of a video.
    """
    images: ImageInput
    frame_rate: Fraction
    audio: Optional[AudioInput] = None
    metadata: Optional[dict] = None
--- a/comfy_extras/nodes_video.py
+++ b/comfy_extras/nodes_video.py
@ -5,9 +5,13 @@ import av
 import torch
 import folder_paths
 import json
 from typing import Optional, Literal
 from fractions import Fraction
-from comfy.comfy_types import FileLocator
+from comfy.comfy_types import IO, FileLocator, ComfyNodeABC
-
+from comfy_api.input import ImageInput, AudioInput, VideoInput
 from comfy_api.util import VideoContainer, VideoCodec, VideoComponents
 from comfy_api.input_impl import VideoFromFile, VideoFromComponents
 from comfy.cli_args import args
 class SaveWEBM:
    def __init__(self):
@ -75,7 +79,163 @@ class SaveWEBM:
        return {"ui": {"images": results, "animated": (True,)}}  # TODO: frontend side
 class SaveVideo(ComfyNodeABC):
    def __init__(self):
        self.output_dir = folder_paths.get_output_directory()
        self.type: Literal["output"] = "output"
        self.prefix_append = ""
    @classmethod
    def INPUT_TYPES(cls):
        return {
            "required": {
                "video": (IO.VIDEO, {"tooltip": "The video to save."}),
                "filename_prefix": ("STRING", {"default": "video/ComfyUI", "tooltip": "The prefix for the file to save. This may include formatting information such as %date:yyyy-MM-dd% or %Empty Latent Image.width% to include values from nodes."}),
                "format": (VideoContainer.as_input(), {"default": "auto", "tooltip": "The format to save the video as."}),
                "codec": (VideoCodec.as_input(), {"default": "auto", "tooltip": "The codec to use for the video."}),
            },
            "hidden": {
                "prompt": "PROMPT",
                "extra_pnginfo": "EXTRA_PNGINFO"
            },
        }
    RETURN_TYPES = ()
    FUNCTION = "save_video"
    OUTPUT_NODE = True
    CATEGORY = "image/video"
    DESCRIPTION = "Saves the input images to your ComfyUI output directory."
    def save_video(self, video: VideoInput, filename_prefix, format, codec, prompt=None, extra_pnginfo=None):
        filename_prefix += self.prefix_append
        width, height = video.get_dimensions()
        full_output_folder, filename, counter, subfolder, filename_prefix = folder_paths.get_save_image_path(
            filename_prefix,
            self.output_dir,
            width,
            height
        )
        results: list[FileLocator] = list()
        saved_metadata = None
        if not args.disable_metadata:
            metadata = {}
            if extra_pnginfo is not None:
                metadata.update(extra_pnginfo)
            if prompt is not None:
                metadata["prompt"] = prompt
            if len(metadata) > 0:
                saved_metadata = metadata
        file = f"{filename}_{counter:05}_.{VideoContainer.get_extension(format)}"
        video.save_to(
            os.path.join(full_output_folder, file),
            format=format,
            codec=codec,
            metadata=saved_metadata
        )
        results.append({
            "filename": file,
            "subfolder": subfolder,
            "type": self.type
        })
        counter += 1
        return { "ui": { "images": results, "animated": (True,) } }
 class CreateVideo(ComfyNodeABC):
    @classmethod
    def INPUT_TYPES(cls):
        return {
            "required": {
                "images": (IO.IMAGE, {"tooltip": "The images to create a video from."}),
                "fps": ("FLOAT", {"default": 30.0, "min": 1.0, "max": 120.0, "step": 1.0}),
            },
            "optional": {
                "audio": (IO.AUDIO, {"tooltip": "The audio to add to the video."}),
            }
        }
    RETURN_TYPES = (IO.VIDEO,)
    FUNCTION = "create_video"
    CATEGORY = "image/video"
    DESCRIPTION = "Create a video from images."
    def create_video(self, images: ImageInput, fps: float, audio: Optional[AudioInput] = None):
        return (VideoFromComponents(
            VideoComponents(
            images=images,
            audio=audio,
            frame_rate=Fraction(fps),
            )
        ),)
 class GetVideoComponents(ComfyNodeABC):
    @classmethod
    def INPUT_TYPES(cls):
        return {
            "required": {
                "video": (IO.VIDEO, {"tooltip": "The video to extract components from."}),
            }
        }
    RETURN_TYPES = (IO.IMAGE, IO.AUDIO, IO.FLOAT)
    RETURN_NAMES = ("images", "audio", "fps")
    FUNCTION = "get_components"
    CATEGORY = "image/video"
    DESCRIPTION = "Extracts all components from a video: frames, audio, and framerate."
    def get_components(self, video: VideoInput):
        components = video.get_components()
        return (components.images, components.audio, float(components.frame_rate))
 class LoadVideo(ComfyNodeABC):
    @classmethod
    def INPUT_TYPES(cls):
        input_dir = folder_paths.get_input_directory()
        files = [f for f in os.listdir(input_dir) if os.path.isfile(os.path.join(input_dir, f))]
        files = folder_paths.filter_files_content_types(files, ["video"])
        return {"required":
                    {"file": (sorted(files), {"video_upload": True})},
                }
    CATEGORY = "image/video"
    RETURN_TYPES = (IO.VIDEO,)
    FUNCTION = "load_video"
    def load_video(self, file):
        video_path = folder_paths.get_annotated_filepath(file)
        return (VideoFromFile(video_path),)
    @classmethod
    def IS_CHANGED(cls, file):
        video_path = folder_paths.get_annotated_filepath(file)
        mod_time = os.path.getmtime(video_path)
        # Instead of hashing the file, we can just use the modification time to avoid
        # rehashing large files.
        return mod_time
    @classmethod
    def VALIDATE_INPUTS(cls, file):
        if not folder_paths.exists_annotated_filepath(file):
            return "Invalid video file: {}".format(file)
        return True
 NODE_CLASS_MAPPINGS = {
    "SaveWEBM": SaveWEBM,
    "SaveVideo": SaveVideo,
    "CreateVideo": CreateVideo,
    "GetVideoComponents": GetVideoComponents,
    "LoadVideo": LoadVideo,
 }
 NODE_DISPLAY_NAME_MAPPINGS = {
    "SaveVideo": "Save Video",
    "CreateVideo": "Create Video",
    "GetVideoComponents": "Get Video Components",
    "LoadVideo": "Load Video",
 }
--- a/folder_paths.py
+++ b/folder_paths.py
@ -4,7 +4,7 @@ import os
 import time
 import mimetypes
 import logging
-from typing import Literal
+from typing import Literal, List
 from collections.abc import Collection
 from comfy.cli_args import args
@ -141,7 +141,7 @@ def get_directory_by_type(type_name: str) -> str | None:
        return get_input_directory()
    return None
-def filter_files_content_types(files: list[str], content_types: Literal["image", "video", "audio", "model"]) -> list[str]:
+def filter_files_content_types(files: list[str], content_types: List[Literal["image", "video", "audio", "model"]]) -> list[str]:
    """
    Example:
        files = os.listdir(folder_paths.get_input_directory())