From 68f0d3529667a2b34b27cc0ac5051bc0e8c45b49 Mon Sep 17 00:00:00 2001 From: guill Date: Tue, 29 Apr 2025 02:58:00 -0700 Subject: [PATCH] Add support for VIDEO as a built-in type (#7844) * Add basic support for videos as types This PR adds support for VIDEO as a first-class type. In order to avoid unnecessary costs, VIDEO outputs must implement the `VideoInput` ABC, but their implementation details can vary. Included are two implementations of this type which can be returned by other nodes: * `VideoFromFile` - Created with either a path on disk (as a string) or an `io.BytesIO` containing the contents of a file in a supported format (like .mp4). This implementation won't actually load the video unless necessary. It will also avoid re-encoding when saving if possible. * `VideoFromComponents` - Created from an image tensor and an optional audio tensor. Currently, only h264-encoded videos in .mp4 containers are supported for saving, but the plan is to add additional encodings/containers in the near future (particularly .webm). * Add optimization to avoid parsing entire video * Improve type declarations to reduce warnings * Make sure BytesIO objects can be read many times * Fix a potential issue when saving long videos * Fix incorrect type annotation * Add a `LoadVideo` node to make testing easier * Refactor new types out of the base comfy folder I've created a new `comfy_api` top-level module. The intention is that anything within this folder would be covered by semver-style versioning that would allow custom nodes to rely on them not introducing breaking changes. 
* Fix linting issue --- comfy/comfy_types/node_typing.py | 9 +- comfy_api/input/__init__.py | 8 + comfy_api/input/basic_types.py | 20 +++ comfy_api/input/video_types.py | 45 ++++++ comfy_api/input_impl/__init__.py | 7 + comfy_api/input_impl/video_types.py | 224 ++++++++++++++++++++++++++++ comfy_api/util/__init__.py | 8 + comfy_api/util/video_types.py | 51 +++++++ comfy_extras/nodes_video.py | 164 +++++++++++++++++++- folder_paths.py | 4 +- 10 files changed, 532 insertions(+), 8 deletions(-) create mode 100644 comfy_api/input/__init__.py create mode 100644 comfy_api/input/basic_types.py create mode 100644 comfy_api/input/video_types.py create mode 100644 comfy_api/input_impl/__init__.py create mode 100644 comfy_api/input_impl/video_types.py create mode 100644 comfy_api/util/__init__.py create mode 100644 comfy_api/util/video_types.py diff --git a/comfy/comfy_types/node_typing.py b/comfy/comfy_types/node_typing.py index 4ceeb3468..2ffc9c021 100644 --- a/comfy/comfy_types/node_typing.py +++ b/comfy/comfy_types/node_typing.py @@ -48,6 +48,7 @@ class IO(StrEnum): FACE_ANALYSIS = "FACE_ANALYSIS" BBOX = "BBOX" SEGS = "SEGS" + VIDEO = "VIDEO" ANY = "*" """Always matches any type, but at a price. @@ -273,7 +274,7 @@ class ComfyNodeABC(ABC): Comfy Docs: https://docs.comfy.org/custom-nodes/backend/lists#list-processing """ - OUTPUT_IS_LIST: tuple[bool] + OUTPUT_IS_LIST: tuple[bool, ...] """A tuple indicating which node outputs are lists, but will be connected to nodes that expect individual items. Connected nodes that do not implement `INPUT_IS_LIST` will be executed once for every item in the list. @@ -292,7 +293,7 @@ class ComfyNodeABC(ABC): Comfy Docs: https://docs.comfy.org/custom-nodes/backend/lists#list-processing """ - RETURN_TYPES: tuple[IO] + RETURN_TYPES: tuple[IO, ...] """A tuple representing the outputs of this node. 
Usage:: @@ -301,12 +302,12 @@ class ComfyNodeABC(ABC): Comfy Docs: https://docs.comfy.org/custom-nodes/backend/server_overview#return-types """ - RETURN_NAMES: tuple[str] + RETURN_NAMES: tuple[str, ...] """The output slot names for each item in `RETURN_TYPES`, e.g. ``RETURN_NAMES = ("count", "filter_string")`` Comfy Docs: https://docs.comfy.org/custom-nodes/backend/server_overview#return-names """ - OUTPUT_TOOLTIPS: tuple[str] + OUTPUT_TOOLTIPS: tuple[str, ...] """A tuple of strings to use as tooltips for node outputs, one for each item in `RETURN_TYPES`.""" FUNCTION: str """The name of the function to execute as a literal string, e.g. `FUNCTION = "execute"` diff --git a/comfy_api/input/__init__.py b/comfy_api/input/__init__.py new file mode 100644 index 000000000..66667946f --- /dev/null +++ b/comfy_api/input/__init__.py @@ -0,0 +1,8 @@ +from .basic_types import ImageInput, AudioInput +from .video_types import VideoInput + +__all__ = [ + "ImageInput", + "AudioInput", + "VideoInput", +] diff --git a/comfy_api/input/basic_types.py b/comfy_api/input/basic_types.py new file mode 100644 index 000000000..033fb7e27 --- /dev/null +++ b/comfy_api/input/basic_types.py @@ -0,0 +1,20 @@ +import torch +from typing import TypedDict + +ImageInput = torch.Tensor +""" +An image in format [B, H, W, C] where B is the batch size, C is the number of channels, +""" + +class AudioInput(TypedDict): + """ + TypedDict representing audio input. 
+ """ + + waveform: torch.Tensor + """ + Tensor in the format [B, C, T] where B is the batch size, C is the number of channels, + """ + + sample_rate: int + diff --git a/comfy_api/input/video_types.py b/comfy_api/input/video_types.py new file mode 100644 index 000000000..0676e0e66 --- /dev/null +++ b/comfy_api/input/video_types.py @@ -0,0 +1,45 @@ +from __future__ import annotations +from abc import ABC, abstractmethod +from typing import Optional +from comfy_api.util import VideoContainer, VideoCodec, VideoComponents + +class VideoInput(ABC): + """ + Abstract base class for video input types. + """ + + @abstractmethod + def get_components(self) -> VideoComponents: + """ + Abstract method to get the video components (images, audio, and frame rate). + + Returns: + VideoComponents containing images, audio, and frame rate + """ + pass + + @abstractmethod + def save_to( + self, + path: str, + format: VideoContainer = VideoContainer.AUTO, + codec: VideoCodec = VideoCodec.AUTO, + metadata: Optional[dict] = None + ): + """ + Abstract method to save the video input to a file. + """ + pass + + # Provide a default implementation, but subclasses can provide optimized versions + # if possible. + def get_dimensions(self) -> tuple[int, int]: + """ + Returns the dimensions of the video input. 
+ + Returns: + Tuple of (width, height) + """ + components = self.get_components() + return components.images.shape[2], components.images.shape[1] + diff --git a/comfy_api/input_impl/__init__.py b/comfy_api/input_impl/__init__.py new file mode 100644 index 000000000..02901b8b9 --- /dev/null +++ b/comfy_api/input_impl/__init__.py @@ -0,0 +1,7 @@ +from .video_types import VideoFromFile, VideoFromComponents + +__all__ = [ + # Implementations + "VideoFromFile", + "VideoFromComponents", +] diff --git a/comfy_api/input_impl/video_types.py b/comfy_api/input_impl/video_types.py new file mode 100644 index 000000000..12e5783db --- /dev/null +++ b/comfy_api/input_impl/video_types.py @@ -0,0 +1,224 @@ +from __future__ import annotations +from av.container import InputContainer +from av.subtitles.stream import SubtitleStream +from fractions import Fraction +from typing import Optional +from comfy_api.input import AudioInput +import av +import io +import json +import numpy as np +import torch +from comfy_api.input import VideoInput +from comfy_api.util import VideoContainer, VideoCodec, VideoComponents + +class VideoFromFile(VideoInput): + """ + Class representing video input from a file. + """ + + def __init__(self, file: str | io.BytesIO): + """ + Initialize the VideoFromFile object based off of either a path on disk or a BytesIO object + containing the file contents. + """ + self.__file = file + + def get_dimensions(self) -> tuple[int, int]: + """ + Returns the dimensions of the video input. 
+ + Returns: + Tuple of (width, height) + """ + if isinstance(self.__file, io.BytesIO): + self.__file.seek(0) # Reset the BytesIO object to the beginning + with av.open(self.__file, mode='r') as container: + for stream in container.streams: + if stream.type == 'video': + assert isinstance(stream, av.VideoStream) + return stream.width, stream.height + raise ValueError(f"No video stream found in file '{self.__file}'") + + def get_components_internal(self, container: InputContainer) -> VideoComponents: + # Get video frames + frames = [] + for frame in container.decode(video=0): + img = frame.to_ndarray(format='rgb24') # shape: (H, W, 3) + img = torch.from_numpy(img) / 255.0 # shape: (H, W, 3) + frames.append(img) + + images = torch.stack(frames) if len(frames) > 0 else torch.zeros(0, 3, 0, 0) + + # Get frame rate + video_stream = next(s for s in container.streams if s.type == 'video') + frame_rate = Fraction(video_stream.average_rate) if video_stream and video_stream.average_rate else Fraction(1) + + # Get audio if available + audio = None + try: + container.seek(0) # Reset the container to the beginning + for stream in container.streams: + if stream.type != 'audio': + continue + assert isinstance(stream, av.AudioStream) + audio_frames = [] + for packet in container.demux(stream): + for frame in packet.decode(): + assert isinstance(frame, av.AudioFrame) + audio_frames.append(frame.to_ndarray()) # shape: (channels, samples) + if len(audio_frames) > 0: + audio_data = np.concatenate(audio_frames, axis=1) # shape: (channels, total_samples) + audio_tensor = torch.from_numpy(audio_data).unsqueeze(0) # shape: (1, channels, total_samples) + audio = AudioInput({ + "waveform": audio_tensor, + "sample_rate": int(stream.sample_rate) if stream.sample_rate else 1, + }) + except StopIteration: + pass # No audio stream + + metadata = container.metadata + return VideoComponents(images=images, audio=audio, frame_rate=frame_rate, metadata=metadata) + + def get_components(self) -> 
VideoComponents: + if isinstance(self.__file, io.BytesIO): + self.__file.seek(0) # Reset the BytesIO object to the beginning + with av.open(self.__file, mode='r') as container: + return self.get_components_internal(container) + raise ValueError(f"No video stream found in file '{self.__file}'") + + def save_to( + self, + path: str, + format: VideoContainer = VideoContainer.AUTO, + codec: VideoCodec = VideoCodec.AUTO, + metadata: Optional[dict] = None + ): + if isinstance(self.__file, io.BytesIO): + self.__file.seek(0) # Reset the BytesIO object to the beginning + with av.open(self.__file, mode='r') as container: + container_format = container.format.name + video_encoding = container.streams.video[0].codec.name if len(container.streams.video) > 0 else None + reuse_streams = True + if format != VideoContainer.AUTO and format not in container_format.split(","): + reuse_streams = False + if codec != VideoCodec.AUTO and codec != video_encoding and video_encoding is not None: + reuse_streams = False + + if not reuse_streams: + components = self.get_components_internal(container) + video = VideoFromComponents(components) + return video.save_to( + path, + format=format, + codec=codec, + metadata=metadata + ) + + streams = container.streams + with av.open(path, mode='w', options={"movflags": "use_metadata_tags"}) as output_container: + # Copy over the original metadata + for key, value in container.metadata.items(): + if metadata is None or key not in metadata: + output_container.metadata[key] = value + + # Add our new metadata + if metadata is not None: + for key, value in metadata.items(): + if isinstance(value, str): + output_container.metadata[key] = value + else: + output_container.metadata[key] = json.dumps(value) + + # Add streams to the new container + stream_map = {} + for stream in streams: + if isinstance(stream, (av.VideoStream, av.AudioStream, SubtitleStream)): + out_stream = output_container.add_stream_from_template(template=stream, opaque=True) + 
stream_map[stream] = out_stream + + # Write packets to the new container + for packet in container.demux(): + if packet.stream in stream_map and packet.dts is not None: + packet.stream = stream_map[packet.stream] + output_container.mux(packet) + +class VideoFromComponents(VideoInput): + """ + Class representing video input from tensors. + """ + + def __init__(self, components: VideoComponents): + self.__components = components + + def get_components(self) -> VideoComponents: + return VideoComponents( + images=self.__components.images, + audio=self.__components.audio, + frame_rate=self.__components.frame_rate + ) + + def save_to( + self, + path: str, + format: VideoContainer = VideoContainer.AUTO, + codec: VideoCodec = VideoCodec.AUTO, + metadata: Optional[dict] = None + ): + if format != VideoContainer.AUTO and format != VideoContainer.MP4: + raise ValueError("Only MP4 format is supported for now") + if codec != VideoCodec.AUTO and codec != VideoCodec.H264: + raise ValueError("Only H264 codec is supported for now") + with av.open(path, mode='w', options={'movflags': 'use_metadata_tags'}) as output: + # Add metadata before writing any streams + if metadata is not None: + for key, value in metadata.items(): + output.metadata[key] = json.dumps(value) + + frame_rate = Fraction(round(self.__components.frame_rate * 1000), 1000) + # Create a video stream + video_stream = output.add_stream('h264', rate=frame_rate) + video_stream.width = self.__components.images.shape[2] + video_stream.height = self.__components.images.shape[1] + video_stream.pix_fmt = 'yuv420p' + + # Create an audio stream + audio_sample_rate = 1 + audio_stream: Optional[av.AudioStream] = None + if self.__components.audio: + audio_sample_rate = int(self.__components.audio['sample_rate']) + audio_stream = output.add_stream('aac', rate=audio_sample_rate) + audio_stream.sample_rate = audio_sample_rate + audio_stream.format = 'fltp' + + # Encode video + for i, frame in enumerate(self.__components.images): + 
img = (frame * 255).clamp(0, 255).byte().cpu().numpy() # shape: (H, W, 3) + frame = av.VideoFrame.from_ndarray(img, format='rgb24') + frame = frame.reformat(format='yuv420p') # Convert to YUV420P as required by h264 + packet = video_stream.encode(frame) + output.mux(packet) + + # Flush video + packet = video_stream.encode(None) + output.mux(packet) + + if audio_stream and self.__components.audio: + # Encode audio + samples_per_frame = int(audio_sample_rate / frame_rate) + num_frames = self.__components.audio['waveform'].shape[2] // samples_per_frame + for i in range(num_frames): + start = i * samples_per_frame + end = start + samples_per_frame + # TODO(Feature) - Add support for stereo audio + chunk = self.__components.audio['waveform'][0, 0, start:end].unsqueeze(0).numpy() + audio_frame = av.AudioFrame.from_ndarray(chunk, format='fltp', layout='mono') + audio_frame.sample_rate = audio_sample_rate + audio_frame.pts = i * samples_per_frame + for packet in audio_stream.encode(audio_frame): + output.mux(packet) + + # Flush audio + for packet in audio_stream.encode(None): + output.mux(packet) + diff --git a/comfy_api/util/__init__.py b/comfy_api/util/__init__.py new file mode 100644 index 000000000..9019c46db --- /dev/null +++ b/comfy_api/util/__init__.py @@ -0,0 +1,8 @@ +from .video_types import VideoContainer, VideoCodec, VideoComponents + +__all__ = [ + # Utility Types + "VideoContainer", + "VideoCodec", + "VideoComponents", +] diff --git a/comfy_api/util/video_types.py b/comfy_api/util/video_types.py new file mode 100644 index 000000000..d09663db9 --- /dev/null +++ b/comfy_api/util/video_types.py @@ -0,0 +1,51 @@ +from __future__ import annotations +from dataclasses import dataclass +from enum import Enum +from fractions import Fraction +from typing import Optional +from comfy_api.input import ImageInput, AudioInput + +class VideoCodec(str, Enum): + AUTO = "auto" + H264 = "h264" + + @classmethod + def as_input(cls) -> list[str]: + """ + Returns a list of codec 
names that can be used as node input. + """ + return [member.value for member in cls] + +class VideoContainer(str, Enum): + AUTO = "auto" + MP4 = "mp4" + + @classmethod + def as_input(cls) -> list[str]: + """ + Returns a list of container names that can be used as node input. + """ + return [member.value for member in cls] + + @classmethod + def get_extension(cls, value) -> str: + """ + Returns the file extension for the container. + """ + if isinstance(value, str): + value = cls(value) + if value == VideoContainer.MP4 or value == VideoContainer.AUTO: + return "mp4" + return "" + +@dataclass +class VideoComponents: + """ + Dataclass representing the components of a video. + """ + + images: ImageInput + frame_rate: Fraction + audio: Optional[AudioInput] = None + metadata: Optional[dict] = None + diff --git a/comfy_extras/nodes_video.py b/comfy_extras/nodes_video.py index a9e244ebe..61f7171b2 100644 --- a/comfy_extras/nodes_video.py +++ b/comfy_extras/nodes_video.py @@ -5,9 +5,13 @@ import av import torch import folder_paths import json +from typing import Optional, Literal from fractions import Fraction -from comfy.comfy_types import FileLocator - +from comfy.comfy_types import IO, FileLocator, ComfyNodeABC +from comfy_api.input import ImageInput, AudioInput, VideoInput +from comfy_api.util import VideoContainer, VideoCodec, VideoComponents +from comfy_api.input_impl import VideoFromFile, VideoFromComponents +from comfy.cli_args import args class SaveWEBM: def __init__(self): @@ -75,7 +79,163 @@ class SaveWEBM: return {"ui": {"images": results, "animated": (True,)}} # TODO: frontend side +class SaveVideo(ComfyNodeABC): + def __init__(self): + self.output_dir = folder_paths.get_output_directory() + self.type: Literal["output"] = "output" + self.prefix_append = "" + + @classmethod + def INPUT_TYPES(cls): + return { + "required": { + "video": (IO.VIDEO, {"tooltip": "The video to save."}), + "filename_prefix": ("STRING", {"default": "video/ComfyUI", "tooltip": "The 
prefix for the file to save. This may include formatting information such as %date:yyyy-MM-dd% or %Empty Latent Image.width% to include values from nodes."}), + "format": (VideoContainer.as_input(), {"default": "auto", "tooltip": "The format to save the video as."}), + "codec": (VideoCodec.as_input(), {"default": "auto", "tooltip": "The codec to use for the video."}), + }, + "hidden": { + "prompt": "PROMPT", + "extra_pnginfo": "EXTRA_PNGINFO" + }, + } + + RETURN_TYPES = () + FUNCTION = "save_video" + + OUTPUT_NODE = True + + CATEGORY = "image/video" + DESCRIPTION = "Saves the input images to your ComfyUI output directory." + + def save_video(self, video: VideoInput, filename_prefix, format, codec, prompt=None, extra_pnginfo=None): + filename_prefix += self.prefix_append + width, height = video.get_dimensions() + full_output_folder, filename, counter, subfolder, filename_prefix = folder_paths.get_save_image_path( + filename_prefix, + self.output_dir, + width, + height + ) + results: list[FileLocator] = list() + saved_metadata = None + if not args.disable_metadata: + metadata = {} + if extra_pnginfo is not None: + metadata.update(extra_pnginfo) + if prompt is not None: + metadata["prompt"] = prompt + if len(metadata) > 0: + saved_metadata = metadata + file = f"{filename}_{counter:05}_.{VideoContainer.get_extension(format)}" + video.save_to( + os.path.join(full_output_folder, file), + format=format, + codec=codec, + metadata=saved_metadata + ) + + results.append({ + "filename": file, + "subfolder": subfolder, + "type": self.type + }) + counter += 1 + + return { "ui": { "images": results, "animated": (True,) } } + +class CreateVideo(ComfyNodeABC): + @classmethod + def INPUT_TYPES(cls): + return { + "required": { + "images": (IO.IMAGE, {"tooltip": "The images to create a video from."}), + "fps": ("FLOAT", {"default": 30.0, "min": 1.0, "max": 120.0, "step": 1.0}), + }, + "optional": { + "audio": (IO.AUDIO, {"tooltip": "The audio to add to the video."}), + } + } + + 
RETURN_TYPES = (IO.VIDEO,) + FUNCTION = "create_video" + + CATEGORY = "image/video" + DESCRIPTION = "Create a video from images." + + def create_video(self, images: ImageInput, fps: float, audio: Optional[AudioInput] = None): + return (VideoFromComponents( + VideoComponents( + images=images, + audio=audio, + frame_rate=Fraction(fps), + ) + ),) + +class GetVideoComponents(ComfyNodeABC): + @classmethod + def INPUT_TYPES(cls): + return { + "required": { + "video": (IO.VIDEO, {"tooltip": "The video to extract components from."}), + } + } + RETURN_TYPES = (IO.IMAGE, IO.AUDIO, IO.FLOAT) + RETURN_NAMES = ("images", "audio", "fps") + FUNCTION = "get_components" + + CATEGORY = "image/video" + DESCRIPTION = "Extracts all components from a video: frames, audio, and framerate." + + def get_components(self, video: VideoInput): + components = video.get_components() + + return (components.images, components.audio, float(components.frame_rate)) + +class LoadVideo(ComfyNodeABC): + @classmethod + def INPUT_TYPES(cls): + input_dir = folder_paths.get_input_directory() + files = [f for f in os.listdir(input_dir) if os.path.isfile(os.path.join(input_dir, f))] + files = folder_paths.filter_files_content_types(files, ["video"]) + return {"required": + {"file": (sorted(files), {"video_upload": True})}, + } + + CATEGORY = "image/video" + + RETURN_TYPES = (IO.VIDEO,) + FUNCTION = "load_video" + def load_video(self, file): + video_path = folder_paths.get_annotated_filepath(file) + return (VideoFromFile(video_path),) + + @classmethod + def IS_CHANGED(cls, file): + video_path = folder_paths.get_annotated_filepath(file) + mod_time = os.path.getmtime(video_path) + # Instead of hashing the file, we can just use the modification time to avoid + # rehashing large files. 
+ return mod_time + + @classmethod + def VALIDATE_INPUTS(cls, file): + if not folder_paths.exists_annotated_filepath(file): + return "Invalid video file: {}".format(file) + + return True NODE_CLASS_MAPPINGS = { "SaveWEBM": SaveWEBM, + "SaveVideo": SaveVideo, + "CreateVideo": CreateVideo, + "GetVideoComponents": GetVideoComponents, + "LoadVideo": LoadVideo, +} + +NODE_DISPLAY_NAME_MAPPINGS = { + "SaveVideo": "Save Video", + "CreateVideo": "Create Video", + "GetVideoComponents": "Get Video Components", + "LoadVideo": "Load Video", } diff --git a/folder_paths.py b/folder_paths.py index 9a525e5a1..f0b3fd103 100644 --- a/folder_paths.py +++ b/folder_paths.py @@ -4,7 +4,7 @@ import os import time import mimetypes import logging -from typing import Literal +from typing import Literal, List from collections.abc import Collection from comfy.cli_args import args @@ -141,7 +141,7 @@ def get_directory_by_type(type_name: str) -> str | None: return get_input_directory() return None -def filter_files_content_types(files: list[str], content_types: Literal["image", "video", "audio", "model"]) -> list[str]: +def filter_files_content_types(files: list[str], content_types: List[Literal["image", "video", "audio", "model"]]) -> list[str]: """ Example: files = os.listdir(folder_paths.get_input_directory())