Merge branch 'master' into patch_hooks_improved_memory

2025-07-24 00:17:02 +08:00 · 2024-11-26 21:34:58 -06:00 · 2024-11-26 21:34:58 -06:00 · 57f1ea8532
commit 57f1ea8532
parent 26ccd3b5f9 497db6212f
8 changed files with 248 additions and 8 deletions
--- a/app/user_manager.py
+++ b/app/user_manager.py
@ -36,7 +36,7 @@ class UserManager():

        self.settings = AppSettings(self)
        if not os.path.exists(user_directory):
-            os.mkdir(user_directory)
+            os.makedirs(user_directory, exist_ok=True)
            if not args.multi_user:
                print("****** User settings have been changed to be stored on the server instead of browser storage. ******")
                print("****** For multi-user setups add the --multi-user CLI argument to enable multiple user profiles. ******")
--- a/comfy/cldm/dit_embedder.py
+++ b/comfy/cldm/dit_embedder.py
@ -0,0 +1,122 @@
+import math
+from typing import List, Optional, Tuple
+
+import numpy as np
+import torch
+import torch.nn as nn
+from einops import rearrange
+from torch import Tensor
+
+from comfy.ldm.modules.diffusionmodules.mmdit import DismantledBlock, PatchEmbed, VectorEmbedder, TimestepEmbedder, get_2d_sincos_pos_embed_torch
+
+
+class ControlNetEmbedder(nn.Module):
+
+    def __init__(
+        self,
+        img_size: int,
+        patch_size: int,
+        in_chans: int,
+        attention_head_dim: int,
+        num_attention_heads: int,
+        adm_in_channels: int,
+        num_layers: int,
+        main_model_double: int,
+        double_y_emb: bool,
+        device: torch.device,
+        dtype: torch.dtype,
+        pos_embed_max_size: Optional[int] = None,
+        operations = None,
+    ):
+        super().__init__()
+        self.main_model_double = main_model_double
+        self.dtype = dtype
+        self.hidden_size = num_attention_heads * attention_head_dim
+        self.patch_size = patch_size
+        self.x_embedder = PatchEmbed(
+            img_size=img_size,
+            patch_size=patch_size,
+            in_chans=in_chans,
+            embed_dim=self.hidden_size,
+            strict_img_size=pos_embed_max_size is None,
+            device=device,
+            dtype=dtype,
+            operations=operations,
+        )
+
+        self.t_embedder = TimestepEmbedder(self.hidden_size, dtype=dtype, device=device, operations=operations)
+
+        self.double_y_emb = double_y_emb
+        if self.double_y_emb:
+            self.orig_y_embedder = VectorEmbedder(
+                adm_in_channels, self.hidden_size, dtype, device, operations=operations
+            )
+            self.y_embedder = VectorEmbedder(
+                self.hidden_size, self.hidden_size, dtype, device, operations=operations
+            )
+        else:
+            self.y_embedder = VectorEmbedder(
+                adm_in_channels, self.hidden_size, dtype, device, operations=operations
+            )
+
+        self.transformer_blocks = nn.ModuleList(
+            DismantledBlock(
+                hidden_size=self.hidden_size, num_heads=num_attention_heads, qkv_bias=True,
+                dtype=dtype, device=device, operations=operations
+            )
+            for _ in range(num_layers)
+        )
+
+        # self.use_y_embedder = pooled_projection_dim != self.time_text_embed.text_embedder.linear_1.in_features
+        # TODO double check this logic when 8b
+        self.use_y_embedder = True
+
+        self.controlnet_blocks = nn.ModuleList([])
+        for _ in range(len(self.transformer_blocks)):
+            controlnet_block = operations.Linear(self.hidden_size, self.hidden_size, dtype=dtype, device=device)
+            self.controlnet_blocks.append(controlnet_block)
+
+        self.pos_embed_input = PatchEmbed(
+            img_size=img_size,
+            patch_size=patch_size,
+            in_chans=in_chans,
+            embed_dim=self.hidden_size,
+            strict_img_size=False,
+            device=device,
+            dtype=dtype,
+            operations=operations,
+        )
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        timesteps: torch.Tensor,
+        y: Optional[torch.Tensor] = None,
+        context: Optional[torch.Tensor] = None,
+        hint = None,
+    ) -> Tuple[Tensor, List[Tensor]]:
+        x_shape = list(x.shape)
+        x = self.x_embedder(x)
+        if not self.double_y_emb:
+            h = (x_shape[-2] + 1) // self.patch_size
+            w = (x_shape[-1] + 1) // self.patch_size
+            x += get_2d_sincos_pos_embed_torch(self.hidden_size, w, h, device=x.device)
+        c = self.t_embedder(timesteps, dtype=x.dtype)
+        if y is not None and self.y_embedder is not None:
+            if self.double_y_emb:
+                y = self.orig_y_embedder(y)
+            y = self.y_embedder(y)
+            c = c + y
+
+        x = x + self.pos_embed_input(hint)
+
+        block_out = ()
+
+        repeat = math.ceil(self.main_model_double / len(self.transformer_blocks))
+        for i in range(len(self.transformer_blocks)):
+            out = self.transformer_blocks[i](x, c)
+            if not self.double_y_emb:
+                x = out
+            block_out += (self.controlnet_blocks[i](out),) * repeat
+
+        return {"output": block_out}
--- a/comfy/cli_args.py
+++ b/comfy/cli_args.py
@ -60,8 +60,10 @@ fp_group.add_argument("--force-fp32", action="store_true", help="Force fp32 (If
 fp_group.add_argument("--force-fp16", action="store_true", help="Force fp16.")

 fpunet_group = parser.add_mutually_exclusive_group()
-fpunet_group.add_argument("--bf16-unet", action="store_true", help="Run the UNET in bf16. This should only be used for testing stuff.")
-fpunet_group.add_argument("--fp16-unet", action="store_true", help="Store unet weights in fp16.")
+fpunet_group.add_argument("--fp32-unet", action="store_true", help="Run the diffusion model in fp32.")
+fpunet_group.add_argument("--fp64-unet", action="store_true", help="Run the diffusion model in fp64.")
+fpunet_group.add_argument("--bf16-unet", action="store_true", help="Run the diffusion model in bf16.")
+fpunet_group.add_argument("--fp16-unet", action="store_true", help="Run the diffusion model in fp16")
 fpunet_group.add_argument("--fp8_e4m3fn-unet", action="store_true", help="Store unet weights in fp8_e4m3fn.")
 fpunet_group.add_argument("--fp8_e5m2-unet", action="store_true", help="Store unet weights in fp8_e5m2.")

--- a/comfy/controlnet.py
+++ b/comfy/controlnet.py
@ -35,6 +35,7 @@ import comfy.ldm.cascade.controlnet
 import comfy.cldm.mmdit
 import comfy.ldm.hydit.controlnet
 import comfy.ldm.flux.controlnet
+import comfy.cldm.dit_embedder
 from typing import TYPE_CHECKING
 if TYPE_CHECKING:
    from comfy.hooks import HookGroup
@ -82,6 +83,7 @@ class ControlBase:
        self.extra_concat_orig = []
        self.extra_concat = None
        self.extra_hooks: HookGroup = None
+        self.preprocess_image = lambda a: a

    def set_cond_hint(self, cond_hint, strength=1.0, timestep_percent_range=(0.0, 1.0), vae=None, extra_concat=[]):
        self.cond_hint_original = cond_hint
@ -142,6 +144,7 @@ class ControlBase:
        c.concat_mask = self.concat_mask
        c.extra_concat_orig = self.extra_concat_orig.copy()
        c.extra_hooks = self.extra_hooks.clone() if self.extra_hooks else None
+        c.preprocess_image = self.preprocess_image

    def inference_memory_requirements(self, dtype):
        if self.previous_controlnet is not None:
@ -194,7 +197,7 @@ class ControlBase:


 class ControlNet(ControlBase):
-    def __init__(self, control_model=None, global_average_pooling=False, compression_ratio=8, latent_format=None, load_device=None, manual_cast_dtype=None, extra_conds=["y"], strength_type=StrengthType.CONSTANT, concat_mask=False):
+    def __init__(self, control_model=None, global_average_pooling=False, compression_ratio=8, latent_format=None, load_device=None, manual_cast_dtype=None, extra_conds=["y"], strength_type=StrengthType.CONSTANT, concat_mask=False, preprocess_image=lambda a: a):
        super().__init__()
        self.control_model = control_model
        self.load_device = load_device
@ -209,6 +212,7 @@ class ControlNet(ControlBase):
        self.extra_conds += extra_conds
        self.strength_type = strength_type
        self.concat_mask = concat_mask
+        self.preprocess_image = preprocess_image

    def get_control(self, x_noisy, t, cond, batched_number, transformer_options):
        control_prev = None
@ -237,6 +241,7 @@ class ControlNet(ControlBase):
                if self.latent_format is not None:
                    raise ValueError("This Controlnet needs a VAE but none was provided, please use a ControlNetApply node with a VAE input and connect it.")
            self.cond_hint = comfy.utils.common_upscale(self.cond_hint_original, x_noisy.shape[3] * compression_ratio, x_noisy.shape[2] * compression_ratio, self.upscale_algorithm, "center")
+            self.cond_hint = self.preprocess_image(self.cond_hint)
            if self.vae is not None:
                loaded_models = comfy.model_management.loaded_models(only_currently_used=True)
                self.cond_hint = self.vae.encode(self.cond_hint.movedim(1, -1))
@ -440,6 +445,7 @@ def controlnet_load_state_dict(control_model, sd):
        logging.debug("unexpected controlnet keys: {}".format(unexpected))
    return control_model

+
 def load_controlnet_mmdit(sd, model_options={}):
    new_sd = comfy.model_detection.convert_diffusers_mmdit(sd, "")
    model_config, operations, load_device, unet_dtype, manual_cast_dtype, offload_device = controlnet_config(new_sd, model_options=model_options)
@ -461,6 +467,83 @@ def load_controlnet_mmdit(sd, model_options={}):
    return control


+class ControlNetSD35(ControlNet):
+    def pre_run(self, model, percent_to_timestep_function):
+        if self.control_model.double_y_emb:
+            missing, unexpected = self.control_model.orig_y_embedder.load_state_dict(model.diffusion_model.y_embedder.state_dict(), strict=False)
+        else:
+            missing, unexpected = self.control_model.x_embedder.load_state_dict(model.diffusion_model.x_embedder.state_dict(), strict=False)
+        super().pre_run(model, percent_to_timestep_function)
+
+    def copy(self):
+        c = ControlNetSD35(None, global_average_pooling=self.global_average_pooling, load_device=self.load_device, manual_cast_dtype=self.manual_cast_dtype)
+        c.control_model = self.control_model
+        c.control_model_wrapped = self.control_model_wrapped
+        self.copy_to(c)
+        return c
+
+def load_controlnet_sd35(sd, model_options={}):
+    control_type = -1
+    if "control_type" in sd:
+        control_type = round(sd.pop("control_type").item())
+
+    # blur_cnet = control_type == 0
+    canny_cnet = control_type == 1
+    depth_cnet = control_type == 2
+
+    print(control_type, canny_cnet, depth_cnet)
+    new_sd = {}
+    for k in comfy.utils.MMDIT_MAP_BASIC:
+        if k[1] in sd:
+            new_sd[k[0]] = sd.pop(k[1])
+    for k in sd:
+        new_sd[k] = sd[k]
+    sd = new_sd
+
+    y_emb_shape = sd["y_embedder.mlp.0.weight"].shape
+    depth = y_emb_shape[0] // 64
+    hidden_size = 64 * depth
+    num_heads = depth
+    head_dim = hidden_size // num_heads
+    num_blocks = comfy.model_detection.count_blocks(new_sd, 'transformer_blocks.{}.')
+
+    load_device = comfy.model_management.get_torch_device()
+    offload_device = comfy.model_management.unet_offload_device()
+    unet_dtype = comfy.model_management.unet_dtype(model_params=-1)
+
+    manual_cast_dtype = comfy.model_management.unet_manual_cast(unet_dtype, load_device)
+
+    operations = model_options.get("custom_operations", None)
+    if operations is None:
+        operations = comfy.ops.pick_operations(unet_dtype, manual_cast_dtype, disable_fast_fp8=True)
+
+    control_model = comfy.cldm.dit_embedder.ControlNetEmbedder(img_size=None,
+                                                               patch_size=2,
+                                                               in_chans=16,
+                                                               num_layers=num_blocks,
+                                                               main_model_double=depth,
+                                                               double_y_emb=y_emb_shape[0] == y_emb_shape[1],
+                                                               attention_head_dim=head_dim,
+                                                               num_attention_heads=num_heads,
+                                                               adm_in_channels=2048,
+                                                               device=offload_device,
+                                                               dtype=unet_dtype,
+                                                               operations=operations)
+
+    control_model = controlnet_load_state_dict(control_model, sd)
+
+    latent_format = comfy.latent_formats.SD3()
+    preprocess_image = lambda a: a
+    if canny_cnet:
+        preprocess_image = lambda a: (a * 255 * 0.5 + 0.5)
+    elif depth_cnet:
+        preprocess_image = lambda a: 1.0 - a
+
+    control = ControlNetSD35(control_model, compression_ratio=1, latent_format=latent_format, load_device=load_device, manual_cast_dtype=manual_cast_dtype, preprocess_image=preprocess_image)
+    return control
+
+
+
 def load_controlnet_hunyuandit(controlnet_data, model_options={}):
    model_config, operations, load_device, unet_dtype, manual_cast_dtype, offload_device = controlnet_config(controlnet_data, model_options=model_options)

@ -573,7 +656,10 @@ def load_controlnet_state_dict(state_dict, model=None, model_options={}):
        if "double_blocks.0.img_attn.norm.key_norm.scale" in controlnet_data:
            return load_controlnet_flux_xlabs_mistoline(controlnet_data, model_options=model_options)
        elif "pos_embed_input.proj.weight" in controlnet_data:
-            return load_controlnet_mmdit(controlnet_data, model_options=model_options) #SD3 diffusers controlnet
+            if "transformer_blocks.0.adaLN_modulation.1.bias" in controlnet_data:
+                return load_controlnet_sd35(controlnet_data, model_options=model_options) #Stability sd3.5 format
+            else:
+                return load_controlnet_mmdit(controlnet_data, model_options=model_options) #SD3 diffusers controlnet
        elif "controlnet_x_embedder.weight" in controlnet_data:
            return load_controlnet_flux_instantx(controlnet_data, model_options=model_options)
    elif "controlnet_blocks.0.linear.weight" in controlnet_data: #mistoline flux
--- a/comfy/lora.py
+++ b/comfy/lora.py
@ -62,6 +62,7 @@ def load_lora(lora, to_load, log_missing=True):
        diffusers_lora = "{}_lora.up.weight".format(x)
        diffusers2_lora = "{}.lora_B.weight".format(x)
        diffusers3_lora = "{}.lora.up.weight".format(x)
+        mochi_lora = "{}.lora_B".format(x)
        transformers_lora = "{}.lora_linear_layer.up.weight".format(x)
        A_name = None

@ -81,6 +82,10 @@ def load_lora(lora, to_load, log_missing=True):
            A_name = diffusers3_lora
            B_name = "{}.lora.down.weight".format(x)
            mid_name = None
+        elif mochi_lora in lora.keys():
+            A_name = mochi_lora
+            B_name = "{}.lora_A".format(x)
+            mid_name = None
        elif transformers_lora in lora.keys():
            A_name = transformers_lora
            B_name ="{}.lora_linear_layer.down.weight".format(x)
@ -363,6 +368,12 @@ def model_lora_keys_unet(model, key_map={}):
                key_map["lycoris_{}".format(k[:-len(".weight")].replace(".", "_"))] = to #simpletrainer lycoris
                key_map["lora_transformer_{}".format(k[:-len(".weight")].replace(".", "_"))] = to #onetrainer

+    if isinstance(model, comfy.model_base.GenmoMochi):
+        for k in sdk:
+            if k.startswith("diffusion_model.") and k.endswith(".weight"): #Official Mochi lora format
+                key_lora = k[len("diffusion_model."):-len(".weight")]
+                key_map["{}".format(key_lora)] = k
+
    return key_map


--- a/comfy/model_base.py
+++ b/comfy/model_base.py
@ -724,7 +724,13 @@ class Flux(BaseModel):
        super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.flux.model.Flux)

    def concat_cond(self, **kwargs):
-        num_channels = self.diffusion_model.img_in.weight.shape[1] // (self.diffusion_model.patch_size * self.diffusion_model.patch_size)
+        try:
+            #Handle Flux control loras dynamically changing the img_in weight.
+            num_channels = self.diffusion_model.img_in.weight.shape[1] // (self.diffusion_model.patch_size * self.diffusion_model.patch_size)
+        except:
+            #Some cases like tensorrt might not have the weights accessible
+            num_channels = self.model_config.unet_config["in_channels"]
+
        out_channels = self.model_config.unet_config["out_channels"]

        if num_channels <= out_channels:
--- a/comfy/model_management.py
+++ b/comfy/model_management.py
@ -600,6 +600,10 @@ def maximum_vram_for_weights(device=None):
 def unet_dtype(device=None, model_params=0, supported_dtypes=[torch.float16, torch.bfloat16, torch.float32]):
    if model_params < 0:
        model_params = 1000000000000000000000
+    if args.fp32_unet:
+        return torch.float32
+    if args.fp64_unet:
+        return torch.float64
    if args.bf16_unet:
        return torch.bfloat16
    if args.fp16_unet:
@ -646,7 +650,7 @@ def unet_dtype(device=None, model_params=0, supported_dtypes=[torch.float16, tor

 # None means no manual cast
 def unet_manual_cast(weight_dtype, inference_device, supported_dtypes=[torch.float16, torch.bfloat16, torch.float32]):
-    if weight_dtype == torch.float32:
+    if weight_dtype == torch.float32 or weight_dtype == torch.float64:
        return None

    fp16_supported = should_use_fp16(inference_device, prioritize_performance=False)
--- a/comfy/supported_models.py
+++ b/comfy/supported_models.py
@ -659,6 +659,15 @@ class Flux(supported_models_base.BASE):
        t5_detect = comfy.text_encoders.sd3_clip.t5_xxl_detect(state_dict, "{}t5xxl.transformer.".format(pref))
        return supported_models_base.ClipTarget(comfy.text_encoders.flux.FluxTokenizer, comfy.text_encoders.flux.flux_clip(**t5_detect))

+class FluxInpaint(Flux):
+    unet_config = {
+        "image_model": "flux",
+        "guidance_embed": True,
+        "in_channels": 96,
+    }
+
+    supported_inference_dtypes = [torch.bfloat16, torch.float32]
+
 class FluxSchnell(Flux):
    unet_config = {
        "image_model": "flux",
@ -731,6 +740,6 @@ class LTXV(supported_models_base.BASE):
        t5_detect = comfy.text_encoders.sd3_clip.t5_xxl_detect(state_dict, "{}t5xxl.transformer.".format(pref))
        return supported_models_base.ClipTarget(comfy.text_encoders.lt.LTXVT5Tokenizer, comfy.text_encoders.lt.ltxv_te(**t5_detect))

-models = [Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, HunyuanDiT, HunyuanDiT1, Flux, FluxSchnell, GenmoMochi, LTXV]
+models = [Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV]

 models += [SVD_img2vid]