From cf0b549d4828b6b9f6d98277c36784ab3c79ff6d Mon Sep 17 00:00:00 2001
From: comfyanonymous
Date: Fri, 28 Feb 2025 02:48:20 -0500
Subject: [PATCH] --fast now takes a number as an argument to indicate how fast you want it.

The idea is that you can indicate how much quality you are willing to trade for speed.

At the moment:

--fast 2 enables fp16 accumulation if your pytorch supports it.
--fast 5 enables fp8 matrix mult on fp8 models and the optimization above.

--fast without a number enables all optimizations.
---
 comfy/cli_args.py         | 2 +-
 comfy/model_management.py | 3 ++-
 comfy/ops.py              | 2 +-
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/comfy/cli_args.py b/comfy/cli_args.py
index a906ff1c0..10c142e67 100644
--- a/comfy/cli_args.py
+++ b/comfy/cli_args.py
@@ -130,7 +130,7 @@ parser.add_argument("--default-hashing-function", type=str, choices=['md5', 'sha
 parser.add_argument("--disable-smart-memory", action="store_true", help="Force ComfyUI to agressively offload to regular ram instead of keeping models in vram when it can.")
 parser.add_argument("--deterministic", action="store_true", help="Make pytorch use slower deterministic algorithms when it can. Note that this might not make images deterministic in all cases.")
 
-parser.add_argument("--fast", action="store_true", help="Enable some untested and potentially quality deteriorating optimizations.")
+parser.add_argument("--fast", metavar="number", type=int, const=99, default=0, nargs="?", help="Enable some untested and potentially quality deteriorating optimizations. You can pass a number from 0 to 10 for a bigger speed vs quality tradeoff. Using --fast with no number means maximum speed. 2 or larger enables fp16 accumulation, 5 or larger enables fp8 matrix multiplication.")
 
 parser.add_argument("--dont-print-server", action="store_true", help="Don't print server output.")
 parser.add_argument("--quick-test-for-ci", action="store_true", help="Quick test for CI.")
diff --git a/comfy/model_management.py b/comfy/model_management.py
index afbb133d4..5eb2e5ad6 100644
--- a/comfy/model_management.py
+++ b/comfy/model_management.py
@@ -280,9 +280,10 @@ if ENABLE_PYTORCH_ATTENTION:
 
 PRIORITIZE_FP16 = False  # TODO: remove and replace with something that shows exactly which dtype is faster than the other
 try:
-    if is_nvidia() and args.fast:
+    if is_nvidia() and args.fast >= 2:
         torch.backends.cuda.matmul.allow_fp16_accumulation = True
         PRIORITIZE_FP16 = True  # TODO: limit to cards where it actually boosts performance
+        logging.info("Enabled fp16 accumulation.")
 except:
     pass
 
diff --git a/comfy/ops.py b/comfy/ops.py
index 30014477e..905ea90f6 100644
--- a/comfy/ops.py
+++ b/comfy/ops.py
@@ -360,7 +360,7 @@ def pick_operations(weight_dtype, compute_dtype, load_device=None, disable_fast_
     if scaled_fp8 is not None:
         return scaled_fp8_ops(fp8_matrix_mult=fp8_compute, scale_input=True, override_dtype=scaled_fp8)
 
-    if fp8_compute and (fp8_optimizations or args.fast) and not disable_fast_fp8:
+    if fp8_compute and (fp8_optimizations or args.fast >= 5) and not disable_fast_fp8:
         return fp8_ops
 
     if compute_dtype is None or weight_dtype == compute_dtype:
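
For reference, this is how the new `--fast` signature behaves under Python's argparse. A minimal standalone sketch, not ComfyUI's actual parser; only the `--fast` argument from the diff above is reproduced:

```python
# Minimal sketch: just the --fast argument from the cli_args.py diff, wired
# into a bare ArgumentParser so the nargs="?"/const/default behavior is visible.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--fast", metavar="number", type=int, const=99, default=0, nargs="?")

print(parser.parse_args([]).fast)               # 0  -> flag absent (default), no optimizations
print(parser.parse_args(["--fast"]).fast)       # 99 -> bare --fast (const), maximum speed
print(parser.parse_args(["--fast", "2"]).fast)  # 2  -> fp16 accumulation only
print(parser.parse_args(["--fast", "5"]).fast)  # 5  -> fp16 accumulation + fp8 matrix mult
```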
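
The try/except around the fp16-accumulation toggle matters: `torch.backends.cuda.matmul.allow_fp16_accumulation` only exists in newer PyTorch builds, and assigning it on an older build raises, which is why the whole block is wrapped. A hedged sketch of that pattern; `torch.cuda.is_available()` stands in for ComfyUI's `is_nvidia()` helper, which is not shown in this diff:

```python
import logging
import torch

def try_enable_fp16_accumulation(fast_level: int) -> bool:
    """Sketch of the model_management.py gate: only attempt the toggle at
    --fast 2 or higher, and tolerate PyTorch builds without the knob."""
    try:
        # Assumption: torch.cuda.is_available() substitutes for is_nvidia().
        if torch.cuda.is_available() and fast_level >= 2:
            torch.backends.cuda.matmul.allow_fp16_accumulation = True
            logging.info("Enabled fp16 accumulation.")
            return True
    except Exception:
        # Older PyTorch: the attribute doesn't exist and the assignment raises.
        pass
    return False
```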
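
The ops.py change turns fp8 matrix multiplication from a boolean into a threshold: it activates when the model itself requests fp8 optimizations, or when the user's fast level reaches 5 (bare `--fast` parses to 99, so it clears every threshold). A small sketch of the gate with illustrative names (the function and its callers here are hypothetical, only the condition is taken from the diff):

```python
def use_fast_fp8(fp8_compute: bool, fp8_optimizations: bool,
                 fast_level: int, disable_fast_fp8: bool) -> bool:
    # Same condition as the + line in comfy/ops.py.
    return fp8_compute and (fp8_optimizations or fast_level >= 5) and not disable_fast_fp8

assert use_fast_fp8(True, False, 5, False)        # --fast 5 turns it on
assert not use_fast_fp8(True, False, 2, False)    # --fast 2 is fp16 accumulation only
assert use_fast_fp8(True, False, 99, False)       # bare --fast (const=99) clears the threshold
assert not use_fast_fp8(False, False, 99, False)  # no fp8 compute support -> never
assert not use_fast_fp8(True, True, 0, True)      # explicit disable always wins
```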