diff --git a/comfy/cli_args.py b/comfy/cli_args.py
index 6ee0e1d7..efe9cff7 100644
--- a/comfy/cli_args.py
+++ b/comfy/cli_args.py
@@ -191,3 +191,6 @@ if args.windows_standalone_build:
 
 if args.disable_auto_launch:
     args.auto_launch = False
+
+if args.force_fp16:
+    args.fp16_unet = True
diff --git a/comfy/clip_model.py b/comfy/clip_model.py
index c4857602..cf5b58b6 100644
--- a/comfy/clip_model.py
+++ b/comfy/clip_model.py
@@ -104,7 +104,8 @@ class CLIPTextModel_(torch.nn.Module):
             mask = 1.0 - attention_mask.to(x.dtype).reshape((attention_mask.shape[0], 1, -1, attention_mask.shape[-1])).expand(attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1])
             mask = mask.masked_fill(mask.to(torch.bool), -torch.finfo(x.dtype).max)
 
-        causal_mask = torch.empty(x.shape[1], x.shape[1], dtype=x.dtype, device=x.device).fill_(-torch.finfo(x.dtype).max).triu_(1)
+        causal_mask = torch.full((x.shape[1], x.shape[1]), -torch.finfo(x.dtype).max, dtype=x.dtype, device=x.device).triu_(1)
+
         if mask is not None:
             mask += causal_mask
         else:
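Note on the clip_model.py hunk: `torch.full` builds the filled causal mask in one call instead of the `empty().fill_()` chain; the result is identical. A minimal sketch checking the equivalence (size and dtype are illustrative):

```python
import torch

n, dtype = 4, torch.float32
neg_inf = -torch.finfo(dtype).max

# Old construction: allocate, fill in place, then keep the strict upper triangle.
old = torch.empty(n, n, dtype=dtype).fill_(neg_inf).triu_(1)
# New construction: allocate and fill in a single call.
new = torch.full((n, n), neg_inf, dtype=dtype).triu_(1)

assert torch.equal(old, new)  # same strictly upper-triangular causal mask
```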
diff --git a/comfy/k_diffusion/sampling.py b/comfy/k_diffusion/sampling.py
index 2c0d1832..45667998 100644
--- a/comfy/k_diffusion/sampling.py
+++ b/comfy/k_diffusion/sampling.py
@@ -1267,7 +1267,7 @@ def sample_dpmpp_2m_cfg_pp(model, x, sigmas, extra_args=None, callback=None, dis
     return x
 
 @torch.no_grad()
-def res_multistep(model, x, sigmas, extra_args=None, callback=None, disable=None, s_churn=0., s_tmin=0., s_tmax=float('inf'), s_noise=1., noise_sampler=None, cfg_pp=False):
+def res_multistep(model, x, sigmas, extra_args=None, callback=None, disable=None, s_noise=1., noise_sampler=None, eta=1., cfg_pp=False):
     extra_args = {} if extra_args is None else extra_args
     seed = extra_args.get("seed", None)
     noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
@@ -1289,53 +1289,60 @@ def res_multistep(model, x, sigmas, extra_args=None, callback=None, disable=None
         extra_args["model_options"] = comfy.model_patcher.set_model_options_post_cfg_function(model_options, post_cfg_function, disable_cfg1_optimization=True)
 
     for i in trange(len(sigmas) - 1, disable=disable):
-        if s_churn > 0:
-            gamma = min(s_churn / (len(sigmas) - 1), 2**0.5 - 1) if s_tmin <= sigmas[i] <= s_tmax else 0.0
-            sigma_hat = sigmas[i] * (gamma + 1)
-        else:
-            gamma = 0
-            sigma_hat = sigmas[i]
-
-        if gamma > 0:
-            eps = torch.randn_like(x) * s_noise
-            x = x + eps * (sigma_hat**2 - sigmas[i] ** 2) ** 0.5
-        denoised = model(x, sigma_hat * s_in, **extra_args)
+        denoised = model(x, sigmas[i] * s_in, **extra_args)
+        sigma_down, sigma_up = get_ancestral_step(sigmas[i], sigmas[i + 1], eta=eta)
         if callback is not None:
-            callback({"x": x, "i": i, "sigma": sigmas[i], "sigma_hat": sigma_hat, "denoised": denoised})
-        if sigmas[i + 1] == 0 or old_denoised is None:
+            callback({"x": x, "i": i, "sigma": sigmas[i], "sigma_hat": sigmas[i], "denoised": denoised})
+        if sigma_down == 0 or old_denoised is None:
             # Euler method
             if cfg_pp:
-                d = to_d(x, sigma_hat, uncond_denoised)
-                x = denoised + d * sigmas[i + 1]
+                d = to_d(x, sigmas[i], uncond_denoised)
+                x = denoised + d * sigma_down
             else:
-                d = to_d(x, sigma_hat, denoised)
-                dt = sigmas[i + 1] - sigma_hat
+                d = to_d(x, sigmas[i], denoised)
+                dt = sigma_down - sigmas[i]
                 x = x + d * dt
         else:
             # Second order multistep method in https://arxiv.org/pdf/2308.02157
-            t, t_next, t_prev = t_fn(sigmas[i]), t_fn(sigmas[i + 1]), t_fn(sigmas[i - 1])
+            t, t_next, t_prev = t_fn(sigmas[i]), t_fn(sigma_down), t_fn(sigmas[i - 1])
             h = t_next - t
             c2 = (t_prev - t) / h
 
             phi1_val, phi2_val = phi1_fn(-h), phi2_fn(-h)
-            b1 = torch.nan_to_num(phi1_val - 1.0 / c2 * phi2_val, nan=0.0)
-            b2 = torch.nan_to_num(1.0 / c2 * phi2_val, nan=0.0)
+            b1 = torch.nan_to_num(phi1_val - phi2_val / c2, nan=0.0)
+            b2 = torch.nan_to_num(phi2_val / c2, nan=0.0)
 
             if cfg_pp:
                 x = x + (denoised - uncond_denoised)
+                x = sigma_fn(h) * x + h * (b1 * uncond_denoised + b2 * old_denoised)
+            else:
+                x = sigma_fn(h) * x + h * (b1 * denoised + b2 * old_denoised)
 
-            x = (sigma_fn(t_next) / sigma_fn(t)) * x + h * (b1 * denoised + b2 * old_denoised)
+        # Noise addition
+        if sigmas[i + 1] > 0:
+            x = x + noise_sampler(sigmas[i], sigmas[i + 1]) * s_noise * sigma_up
 
-        old_denoised = denoised
+        if cfg_pp:
+            old_denoised = uncond_denoised
+        else:
+            old_denoised = denoised
     return x
 
 @torch.no_grad()
-def sample_res_multistep(model, x, sigmas, extra_args=None, callback=None, disable=None, s_churn=0., s_tmin=0., s_tmax=float('inf'), s_noise=1., noise_sampler=None):
-    return res_multistep(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, s_churn=s_churn, s_tmin=s_tmin, s_tmax=s_tmax, s_noise=s_noise, noise_sampler=noise_sampler, cfg_pp=False)
+def sample_res_multistep(model, x, sigmas, extra_args=None, callback=None, disable=None, s_noise=1., noise_sampler=None):
+    return res_multistep(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, s_noise=s_noise, noise_sampler=noise_sampler, eta=0., cfg_pp=False)
 
 @torch.no_grad()
-def sample_res_multistep_cfg_pp(model, x, sigmas, extra_args=None, callback=None, disable=None, s_churn=0., s_tmin=0., s_tmax=float('inf'), s_noise=1., noise_sampler=None):
-    return res_multistep(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, s_churn=s_churn, s_tmin=s_tmin, s_tmax=s_tmax, s_noise=s_noise, noise_sampler=noise_sampler, cfg_pp=True)
+def sample_res_multistep_cfg_pp(model, x, sigmas, extra_args=None, callback=None, disable=None, s_noise=1., noise_sampler=None):
+    return res_multistep(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, s_noise=s_noise, noise_sampler=noise_sampler, eta=0., cfg_pp=True)
+
+@torch.no_grad()
+def sample_res_multistep_ancestral(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None):
+    return res_multistep(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, s_noise=s_noise, noise_sampler=noise_sampler, eta=eta, cfg_pp=False)
+
+@torch.no_grad()
+def sample_res_multistep_ancestral_cfg_pp(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None):
+    return res_multistep(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, s_noise=s_noise, noise_sampler=noise_sampler, eta=eta, cfg_pp=True)
 
 @torch.no_grad()
 def sample_gradient_estimation(model, x, sigmas, extra_args=None, callback=None, disable=None, ge_gamma=2.):
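Context for the sampling.py refactor: the Karras-style churn parameters (`s_churn`/`s_tmin`/`s_tmax`) are dropped in favor of ancestral noise controlled by `eta`, via `get_ancestral_step`. For reference, a sketch of that helper assuming the standard k-diffusion definition (already present earlier in this file):

```python
import torch

def get_ancestral_step(sigma_from, sigma_to, eta=1.):
    # Split a step from sigma_from to sigma_to into a deterministic part
    # (step down to sigma_down) plus fresh noise of magnitude sigma_up.
    if not eta:
        return sigma_to, 0.  # eta=0: fully deterministic, no noise added
    sigma_up = min(sigma_to, eta * (sigma_to ** 2 * (sigma_from ** 2 - sigma_to ** 2) / sigma_from ** 2) ** 0.5)
    sigma_down = (sigma_to ** 2 - sigma_up ** 2) ** 0.5
    return sigma_down, sigma_up
```

With `eta=0.` this returns `(sigmas[i + 1], 0.)`, so `sample_res_multistep` and `sample_res_multistep_cfg_pp` keep their old deterministic behavior by passing `eta=0.`, while the new `*_ancestral` wrappers expose `eta` to the caller.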
diff --git a/comfy/ldm/flux/math.py b/comfy/ldm/flux/math.py
index b5960ffd..36b67931 100644
--- a/comfy/ldm/flux/math.py
+++ b/comfy/ldm/flux/math.py
@@ -22,7 +22,7 @@ def attention(q: Tensor, k: Tensor, v: Tensor, pe: Tensor, mask=None) -> Tensor:
 
 def rope(pos: Tensor, dim: int, theta: int) -> Tensor:
     assert dim % 2 == 0
-    if comfy.model_management.is_device_mps(pos.device) or comfy.model_management.is_intel_xpu():
+    if comfy.model_management.is_device_mps(pos.device) or comfy.model_management.is_intel_xpu() or comfy.model_management.is_directml_enabled():
         device = torch.device("cpu")
     else:
         device = pos.device
diff --git a/comfy/ldm/lumina/model.py b/comfy/ldm/lumina/model.py
index 3292bd2f..ccd5d2c0 100644
--- a/comfy/ldm/lumina/model.py
+++ b/comfy/ldm/lumina/model.py
@@ -6,6 +6,7 @@ from typing import List, Optional, Tuple
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
+import comfy.ldm.common_dit
 
 from comfy.ldm.modules.diffusionmodules.mmdit import TimestepEmbedder, RMSNorm
 from comfy.ldm.modules.attention import optimized_attention_masked
@@ -594,6 +595,8 @@ class NextDiT(nn.Module):
         t = 1.0 - timesteps
         cap_feats = context
         cap_mask = attention_mask
+        bs, c, h, w = x.shape
+        x = comfy.ldm.common_dit.pad_to_patch_size(x, (self.patch_size, self.patch_size))
         """
         Forward pass of NextDiT.
         t: (N,) tensor of diffusion timesteps
@@ -613,7 +616,7 @@ class NextDiT(nn.Module):
             x = layer(x, mask, freqs_cis, adaln_input)
 
         x = self.final_layer(x, adaln_input)
-        x = self.unpatchify(x, img_size, cap_size, return_tensor=x_is_tensor)
+        x = self.unpatchify(x, img_size, cap_size, return_tensor=x_is_tensor)[:,:,:h,:w]
 
         return -x
 
diff --git a/comfy/model_base.py b/comfy/model_base.py
index 4d1b83a4..98f462b3 100644
--- a/comfy/model_base.py
+++ b/comfy/model_base.py
@@ -166,9 +166,6 @@ class BaseModel(torch.nn.Module):
     def get_dtype(self):
         return self.diffusion_model.dtype
 
-    def is_adm(self):
-        return self.adm_channels > 0
-
     def encode_adm(self, **kwargs):
         return None
 
diff --git a/comfy/model_management.py b/comfy/model_management.py
index 477bb0f5..c280067c 100644
--- a/comfy/model_management.py
+++ b/comfy/model_management.py
@@ -266,6 +266,12 @@ if ENABLE_PYTORCH_ATTENTION:
     torch.backends.cuda.enable_flash_sdp(True)
     torch.backends.cuda.enable_mem_efficient_sdp(True)
 
+try:
+    if is_nvidia() and args.fast:
+        torch.backends.cuda.matmul.allow_fp16_accumulation = True
+except:
+    pass
+
 try:
     if int(torch_version[0]) == 2 and int(torch_version[2]) >= 5:
         torch.backends.cuda.allow_fp16_bf16_reduction_math_sdp(True)
@@ -281,15 +287,10 @@ elif args.highvram or args.gpu_only:
     vram_state = VRAMState.HIGH_VRAM
 
 FORCE_FP32 = False
-FORCE_FP16 = False
 if args.force_fp32:
     logging.info("Forcing FP32, if this improves things please report it.")
     FORCE_FP32 = True
 
-if args.force_fp16:
-    logging.info("Forcing FP16.")
-    FORCE_FP16 = True
-
 if lowvram_available:
     if set_vram_to in (VRAMState.LOW_VRAM, VRAMState.NO_VRAM):
         vram_state = set_vram_to
@@ -1019,6 +1020,13 @@ def is_device_mps(device):
 def is_device_cuda(device):
     return is_device_type(device, 'cuda')
 
+def is_directml_enabled():
+    global directml_enabled
+    if directml_enabled:
+        return True
+
+    return False
+
 def should_use_fp16(device=None, model_params=0, prioritize_performance=True, manual_cast=False):
     global directml_enabled
 
@@ -1026,7 +1034,7 @@ def should_use_fp16(device=None, model_params=0, prioritize_performance=True, ma
     if is_device_cpu(device):
         return False
 
-    if FORCE_FP16:
+    if args.force_fp16:
         return True
 
     if FORCE_FP32:
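The lumina/model.py hunk pads the latent up to a multiple of the patch size before patchifying, then crops the output back to the original extent. A minimal sketch of that pad-then-crop pattern (modeled on `comfy.ldm.common_dit.pad_to_patch_size`; the real helper's padding mode is an implementation detail not shown here):

```python
import torch
import torch.nn.functional as F

def pad_to_patch_size(img, patch_size=(2, 2)):
    # Round H and W up to the next multiple of the patch size so the
    # patchify/unpatchify round trip is well defined for any resolution.
    pad_h = (patch_size[0] - img.shape[-2] % patch_size[0]) % patch_size[0]
    pad_w = (patch_size[1] - img.shape[-1] % patch_size[1]) % patch_size[1]
    return F.pad(img, (0, pad_w, 0, pad_h))

x = torch.randn(1, 16, 31, 45)       # height/width not divisible by 2
bs, c, h, w = x.shape                # remember the original extent
x = pad_to_patch_size(x, (2, 2))     # -> (1, 16, 32, 46)
out = x[:, :, :h, :w]                # crop back, as the diff does after unpatchify
```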
"res_multistep_cfg_pp", "gradient_estimation"] + "ipndm", "ipndm_v", "deis", "res_multistep", "res_multistep_cfg_pp", "res_multistep_ancestral", "res_multistep_ancestral_cfg_pp", + "gradient_estimation"] class KSAMPLER(Sampler): def __init__(self, sampler_function, extra_options={}, inpaint_options={}): diff --git a/comfy/utils.py b/comfy/utils.py index c901347c..df7057c6 100644 --- a/comfy/utils.py +++ b/comfy/utils.py @@ -58,7 +58,7 @@ def load_torch_file(ckpt, safe_load=False, device=None): if "HeaderTooLarge" in message: raise ValueError("{}\n\nFile path: {}\n\nThe safetensors file is corrupt or invalid. Make sure this is actually a safetensors file and not a ckpt or pt or other filetype.".format(message, ckpt)) if "MetadataIncompleteBuffer" in message: - raise ValueError("{}\n\nFile path: {}\n\nThe safetensors file is incomplete. Check the file size and make sure you have copied/downloaded it correctly.".format(message, ckpt)) + raise ValueError("{}\n\nFile path: {}\n\nThe safetensors file is corrupt/incomplete. Check the file size and make sure you have copied/downloaded it correctly.".format(message, ckpt)) raise e else: if safe_load or ALWAYS_SAFE_LOAD: diff --git a/nodes.py b/nodes.py index 9c9e1ae7..e325fd9e 100644 --- a/nodes.py +++ b/nodes.py @@ -924,7 +924,7 @@ class CLIPLoader: CATEGORY = "advanced/loaders" - DESCRIPTION = "[Recipes]\n\nstable_diffusion: clip-l\nstable_cascade: clip-g\nsd3: t5 / clip-g / clip-l\nstable_audio: t5\nmochi: t5\ncosmos: old t5 xxl" + DESCRIPTION = "[Recipes]\n\nstable_diffusion: clip-l\nstable_cascade: clip-g\nsd3: t5 / clip-g / clip-l\nstable_audio: t5\nmochi: t5\ncosmos: old t5 xxl\nlumina2: gemma 2 2B" def load_clip(self, clip_name, type="stable_diffusion", device="default"): if type == "stable_cascade": @@ -1065,10 +1065,10 @@ class StyleModelApply: (txt, keys) = t keys = keys.copy() # even if the strength is 1.0 (i.e, no change), if there's already a mask, we have to add to it - if strength_type == "attn_bias" and strength != 1.0 and "attention_mask" not in keys: + if "attention_mask" in keys or (strength_type == "attn_bias" and strength != 1.0): # math.log raises an error if the argument is zero # torch.log returns -inf, which is what we want - attn_bias = torch.log(torch.Tensor([strength])) + attn_bias = torch.log(torch.Tensor([strength if strength_type == "attn_bias" else 1.0])) # get the size of the mask image mask_ref_size = keys.get("attention_mask_img_shape", (1, 1)) n_ref = mask_ref_size[0] * mask_ref_size[1] diff --git a/server.py b/server.py index 1a79da7e..76a99167 100644 --- a/server.py +++ b/server.py @@ -150,7 +150,8 @@ class PromptServer(): PromptServer.instance = self mimetypes.init() - mimetypes.types_map['.js'] = 'application/javascript; charset=utf-8' + mimetypes.add_type('application/javascript; charset=utf-8', '.js') + mimetypes.add_type('image/webp', '.webp') self.user_manager = UserManager() self.model_file_manager = ModelFileManager()